diff --git a/externals/clang/4.0.0/linux32/bin/clang b/externals/clang/4.0.0/linux32/bin/clang
new file mode 100755
index 000000000..deadf77af
Binary files /dev/null and b/externals/clang/4.0.0/linux32/bin/clang differ
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_builtin_vars.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_builtin_vars.h
new file mode 100644
index 000000000..6f5eb9c78
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_builtin_vars.h
@@ -0,0 +1,126 @@
+/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CUDA_BUILTIN_VARS_H
+#define __CUDA_BUILTIN_VARS_H
+
+// Forward declares from vector_types.h.
+struct uint3;
+struct dim3;
+
+// The file implements built-in CUDA variables using __declspec(property).
+// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
+// All read accesses of built-in variable fields get converted into calls to a
+// getter function which in turn calls the appropriate builtin to fetch the
+// value.
+//
+// Example:
+//    int x = threadIdx.x;
+// IR output:
+//  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
+// PTX output:
+//  mov.u32     %r2, %tid.x;
+
+#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC)                                \
+  __declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD;      \
+  static inline __attribute__((always_inline))                                 \
+      __attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) {     \
+    return INTRINSIC;                                                          \
+  }
+
+#if __cplusplus >= 201103L
+#define __DELETE =delete
+#else
+#define __DELETE
+#endif
+
+// Make sure nobody can create instances of the special varible types.  nvcc
+// also disallows taking address of special variables, so we disable address-of
+// operator as well.
+#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName)                            \
+  __attribute__((device)) TypeName() __DELETE;                                 \
+  __attribute__((device)) TypeName(const TypeName &) __DELETE;                 \
+  __attribute__((device)) void operator=(const TypeName &) const __DELETE;     \
+  __attribute__((device)) TypeName *operator&() const __DELETE
+
+struct __cuda_builtin_threadIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
+  // threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
+};
+
+struct __cuda_builtin_blockIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
+  // blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
+};
+
+struct __cuda_builtin_blockDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
+  // blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
+};
+
+struct __cuda_builtin_gridDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
+  // gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
+};
+
+#define __CUDA_BUILTIN_VAR                                                     \
+  extern const __attribute__((device)) __attribute__((weak))
+__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
+__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
+
+// warpSize should translate to read of %WARP_SZ but there's currently no
+// builtin to do so. According to PTX v4.2 docs 'to date, all target
+// architectures have a WARP_SZ value of 32'.
+__attribute__((device)) const int warpSize = 32;
+
+#undef __CUDA_DEVICE_BUILTIN
+#undef __CUDA_BUILTIN_VAR
+#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
+
+#endif /* __CUDA_BUILTIN_VARS_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_cmath.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_cmath.h
new file mode 100644
index 000000000..9bef82611
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_cmath.h
@@ -0,0 +1,487 @@
+/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_CMATH_H__
+#define __CLANG_CUDA_CMATH_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+#include <limits>
+
+// CUDA lets us use various std math functions on the device side.  This file
+// works in concert with __clang_cuda_math_forward_declares.h to make this work.
+//
+// Specifically, the forward-declares header declares __device__ overloads for
+// these functions in the global namespace, then pulls them into namespace std
+// with 'using' statements.  Then this file implements those functions, after
+// their implementations have been pulled in.
+//
+// It's important that we declare the functions in the global namespace and pull
+// them into namespace std with using statements, as opposed to simply declaring
+// these functions in namespace std, because our device functions need to
+// overload the standard library functions, which may be declared in the global
+// namespace or in std, depending on the degree of conformance of the stdlib
+// implementation.  Declaring in the global namespace and pulling into namespace
+// std covers all of the known knowns.
+
+#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
+
+__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
+__DEVICE__ long abs(long __n) { return ::labs(__n); }
+__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
+__DEVICE__ double abs(double __x) { return ::fabs(__x); }
+__DEVICE__ float acos(float __x) { return ::acosf(__x); }
+__DEVICE__ float asin(float __x) { return ::asinf(__x); }
+__DEVICE__ float atan(float __x) { return ::atanf(__x); }
+__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
+__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
+__DEVICE__ float cos(float __x) { return ::cosf(__x); }
+__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
+__DEVICE__ float exp(float __x) { return ::expf(__x); }
+__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
+__DEVICE__ float floor(float __x) { return ::floorf(__x); }
+__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
+__DEVICE__ int fpclassify(float __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ int fpclassify(double __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ float frexp(float __arg, int *__exp) {
+  return ::frexpf(__arg, __exp);
+}
+
+// For inscrutable reasons, the CUDA headers define these functions for us on
+// Windows.
+#ifndef _MSC_VER
+__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
+// For inscrutable reasons, __finite(), the double-precision version of
+// __finitef, does not exist when compiling for MacOS.  __isfinited is available
+// everywhere and is just as good.
+__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
+__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+#endif
+
+__DEVICE__ bool isgreater(float __x, float __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreater(double __x, double __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(float __x, float __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(double __x, double __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isless(float __x, float __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool isless(double __x, double __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool islessequal(float __x, float __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessequal(double __x, double __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessgreater(float __x, float __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool islessgreater(double __x, double __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isunordered(float __x, float __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ bool isunordered(double __x, double __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ float ldexp(float __arg, int __exp) {
+  return ::ldexpf(__arg, __exp);
+}
+__DEVICE__ float log(float __x) { return ::logf(__x); }
+__DEVICE__ float log10(float __x) { return ::log10f(__x); }
+__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
+__DEVICE__ float nexttoward(float __from, double __to) {
+  return __builtin_nexttowardf(__from, __to);
+}
+__DEVICE__ double nexttoward(double __from, double __to) {
+  return __builtin_nexttoward(__from, __to);
+}
+__DEVICE__ float nexttowardf(float __from, double __to) {
+  return __builtin_nexttowardf(__from, __to);
+}
+__DEVICE__ float pow(float __base, float __exp) {
+  return ::powf(__base, __exp);
+}
+__DEVICE__ float pow(float __base, int __iexp) {
+  return ::powif(__base, __iexp);
+}
+__DEVICE__ double pow(double __base, int __iexp) {
+  return ::powi(__base, __iexp);
+}
+__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
+__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
+__DEVICE__ float sin(float __x) { return ::sinf(__x); }
+__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
+__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
+__DEVICE__ float tan(float __x) { return ::tanf(__x); }
+__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
+
+// Now we've defined everything we promised we'd define in
+// __clang_cuda_math_forward_declares.h.  We need to do two additional things to
+// fix up our math functions.
+//
+// 1) Define __device__ overloads for e.g. sin(int).  The CUDA headers define
+//    only sin(float) and sin(double), which means that e.g. sin(0) is
+//    ambiguous.
+//
+// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
+//    std.  These are defined in the CUDA headers in the global namespace,
+//    independent of everything else we've done here.
+
+// We can't use std::enable_if, because we want to be pre-C++11 compatible.  But
+// we go ahead and unconditionally define functions that are only available when
+// compiling for C++11 to match the behavior of the CUDA headers.
+template<bool __B, class __T = void>
+struct __clang_cuda_enable_if {};
+
+template <class __T> struct __clang_cuda_enable_if<true, __T> {
+  typedef __T type;
+};
+
+// Defines an overload of __fn that accepts one integral argument, calls
+// __fn((double)x), and returns __retty.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn)                      \
+  template <typename __T>                                                      \
+  __DEVICE__                                                                   \
+      typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,    \
+                                      __retty>::type                           \
+      __fn(__T __x) {                                                          \
+    return ::__fn((double)__x);                                                \
+  }
+
+// Defines an overload of __fn that accepts one two arithmetic arguments, calls
+// __fn((double)x, (double)y), and returns a double.
+//
+// Note this is different from OVERLOAD_1, which generates an overload that
+// accepts only *integral* arguments.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn)                      \
+  template <typename __T1, typename __T2>                                      \
+  __DEVICE__ typename __clang_cuda_enable_if<                                  \
+      std::numeric_limits<__T1>::is_specialized &&                             \
+          std::numeric_limits<__T2>::is_specialized,                           \
+      __retty>::type                                                           \
+  __fn(__T1 __x, __T2 __y) {                                                   \
+    return __fn((double)__x, (double)__y);                                     \
+  }
+
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
+
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
+
+// Overloads for functions that don't match the patterns expected by
+// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
+template <typename __T1, typename __T2, typename __T3>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized &&
+        std::numeric_limits<__T3>::is_specialized,
+    double>::type
+fma(__T1 __x, __T2 __y, __T3 __z) {
+  return std::fma((double)__x, (double)__y, (double)__z);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+frexp(__T __x, int *__exp) {
+  return std::frexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+ldexp(__T __x, int __exp) {
+  return std::ldexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+nexttoward(__T __from, double __to) {
+  return std::nexttoward((double)__from, __to);
+}
+
+template <typename __T1, typename __T2>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized,
+    double>::type
+remquo(__T1 __x, __T2 __y, int *__quo) {
+  return std::remquo((double)__x, (double)__y, __quo);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbln(__T __x, long __exp) {
+  return std::scalbln((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbn(__T __x, int __exp) {
+  return std::scalbn((double)__x, __exp);
+}
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+// Pull the new overloads we defined above into namespace std.
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::llround;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::nearbyint;
+using ::nextafter;
+using ::nexttoward;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+// Well this is fun: We need to pull these symbols in for libc++, but we can't
+// pull them in with libstdc++, because its ::isinf and ::isnan are different
+// than its std::isinf and std::isnan.
+#ifndef __GLIBCXX__
+using ::isinf;
+using ::isnan;
+#endif
+
+// Finally, pull the "foobarf" functions that CUDA defines in its headers into
+// namespace std.
+using ::acosf;
+using ::acoshf;
+using ::asinf;
+using ::asinhf;
+using ::atan2f;
+using ::atanf;
+using ::atanhf;
+using ::cbrtf;
+using ::ceilf;
+using ::copysignf;
+using ::cosf;
+using ::coshf;
+using ::erfcf;
+using ::erff;
+using ::exp2f;
+using ::expf;
+using ::expm1f;
+using ::fabsf;
+using ::fdimf;
+using ::floorf;
+using ::fmaf;
+using ::fmaxf;
+using ::fminf;
+using ::fmodf;
+using ::frexpf;
+using ::hypotf;
+using ::ilogbf;
+using ::ldexpf;
+using ::lgammaf;
+using ::llrintf;
+using ::llroundf;
+using ::log10f;
+using ::log1pf;
+using ::log2f;
+using ::logbf;
+using ::logf;
+using ::lrintf;
+using ::lroundf;
+using ::modff;
+using ::nearbyintf;
+using ::nextafterf;
+using ::nexttowardf;
+using ::nexttowardf;
+using ::powf;
+using ::remainderf;
+using ::remquof;
+using ::rintf;
+using ::roundf;
+using ::scalblnf;
+using ::scalbnf;
+using ::sinf;
+using ::sinhf;
+using ::sqrtf;
+using ::tanf;
+using ::tanhf;
+using ::tgammaf;
+using ::truncf;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#undef __DEVICE__
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_complex_builtins.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_complex_builtins.h
new file mode 100644
index 000000000..beef7deff
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_complex_builtins.h
@@ -0,0 +1,203 @@
+/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
+#define __CLANG_CUDA_COMPLEX_BUILTINS
+
+// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3.  These are
+// libgcc functions that clang assumes are available when compiling c99 complex
+// operations.  (These implementations come from libc++, and have been modified
+// to work with CUDA.)
+
+extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
+                                                      double __c, double __d) {
+  double __ac = __a * __c;
+  double __bd = __b * __d;
+  double __ad = __a * __d;
+  double __bc = __b * __c;
+  double _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
+      // a device overload (and isn't constexpr before C++11, naturally).
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  float __ac = __a * __c;
+  float __bd = __b * __d;
+  float __ad = __a * __d;
+  float __bc = __b * __c;
+  float _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
+                                                      double __c, double __d) {
+  int __ilogbw = 0;
+  // Can't use std::max, because that's defined in <algorithm>, and we don't
+  // want to pull that in for every compile.  The CUDA headers define
+  // ::max(float, float) and ::max(double, double), which is sufficient for us.
+  double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  double __denom = __c * __c + __d * __d;
+  double _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
+      __real__(z) = 0.0 * (__a * __c + __b * __d);
+      __imag__(z) = 0.0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  int __ilogbw = 0;
+  float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  float __denom = __c * __c + __d * __d;
+  float _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      __real__(z) = 0 * (__a * __c + __b * __d);
+      __imag__(z) = 0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+#endif // __CLANG_CUDA_COMPLEX_BUILTINS
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_intrinsics.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_intrinsics.h
new file mode 100644
index 000000000..b43ce21d0
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_intrinsics.h
@@ -0,0 +1,322 @@
+/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_INTRINSICS_H__
+#define __CLANG_CUDA_INTRINSICS_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// sm_30 intrinsics: __shfl_{up,down,xor}.
+
+#define __SM_30_INTRINSICS_H__
+#define __SM_30_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#pragma push_macro("__MAKE_SHUFFLES")
+#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask)    \
+  inline __device__ int __FnName(int __val, int __offset,                      \
+                                 int __width = warpSize) {                     \
+    return __IntIntrinsic(__val, __offset,                                     \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ float __FnName(float __val, int __offset,                  \
+                                   int __width = warpSize) {                   \
+    return __FloatIntrinsic(__val, __offset,                                   \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  }                                                                            \
+  inline __device__ unsigned int __FnName(unsigned int __val, int __offset,    \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(static_cast<int>(__val), __offset, __width));               \
+  }                                                                            \
+  inline __device__ long long __FnName(long long __val, int __offset,          \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__val, &__tmp, sizeof(__val));                                     \
+    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
+    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
+    long long __ret;                                                           \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
+    return __ret;                                                              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned long long __val, int __offset, int __width = warpSize) {        \
+    return static_cast<unsigned long long>(::__FnName(                         \
+        static_cast<unsigned long long>(__val), __offset, __width));           \
+  }                                                                            \
+  inline __device__ double __FnName(double __val, int __offset,                \
+                                    int __width = warpSize) {                  \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp = ::__FnName(__tmp, __offset, __width);                              \
+    double __ret;                                                              \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
+    return __ret;                                                              \
+  }
+
+__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
+__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
+__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
+
+#pragma pop_macro("__MAKE_SHUFFLES")
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
+
+// Prevent the vanilla sm_32 intrinsics header from being included.
+#define __SM_32_INTRINSICS_H__
+#define __SM_32_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
+inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
+inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
+inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
+inline __device__ long long __ldg(const long long *ptr) {
+  return __nvvm_ldg_ll(ptr);
+}
+inline __device__ unsigned char __ldg(const unsigned char *ptr) {
+  return __nvvm_ldg_uc(ptr);
+}
+inline __device__ unsigned short __ldg(const unsigned short *ptr) {
+  return __nvvm_ldg_us(ptr);
+}
+inline __device__ unsigned int __ldg(const unsigned int *ptr) {
+  return __nvvm_ldg_ui(ptr);
+}
+inline __device__ unsigned long __ldg(const unsigned long *ptr) {
+  return __nvvm_ldg_ul(ptr);
+}
+inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
+  return __nvvm_ldg_ull(ptr);
+}
+inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
+inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
+
+inline __device__ char2 __ldg(const char2 *ptr) {
+  typedef char c2 __attribute__((ext_vector_type(2)));
+  // We can assume that ptr is aligned at least to char2's alignment, but the
+  // load will assume that ptr is aligned to char2's alignment.  This is only
+  // safe if alignof(c2) <= alignof(char2).
+  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
+  char2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ char4 __ldg(const char4 *ptr) {
+  typedef char c4 __attribute__((ext_vector_type(4)));
+  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
+  char4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ short2 __ldg(const short2 *ptr) {
+  typedef short s2 __attribute__((ext_vector_type(2)));
+  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
+  short2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ short4 __ldg(const short4 *ptr) {
+  typedef short s4 __attribute__((ext_vector_type(4)));
+  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
+  short4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ int2 __ldg(const int2 *ptr) {
+  typedef int i2 __attribute__((ext_vector_type(2)));
+  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
+  int2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ int4 __ldg(const int4 *ptr) {
+  typedef int i4 __attribute__((ext_vector_type(4)));
+  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
+  int4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ longlong2 __ldg(const longlong2 *ptr) {
+  typedef long long ll2 __attribute__((ext_vector_type(2)));
+  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+  longlong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ uchar2 __ldg(const uchar2 *ptr) {
+  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
+  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
+  uchar2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uchar4 __ldg(const uchar4 *ptr) {
+  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
+  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
+  uchar4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ushort2 __ldg(const ushort2 *ptr) {
+  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
+  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
+  ushort2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ ushort4 __ldg(const ushort4 *ptr) {
+  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
+  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
+  ushort4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ uint2 __ldg(const uint2 *ptr) {
+  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
+  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
+  uint2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uint4 __ldg(const uint4 *ptr) {
+  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
+  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
+  uint4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
+  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
+  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+  ulonglong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ float2 __ldg(const float2 *ptr) {
+  typedef float f2 __attribute__((ext_vector_type(2)));
+  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
+  float2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ float4 __ldg(const float4 *ptr) {
+  typedef float f4 __attribute__((ext_vector_type(4)));
+  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
+  float4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ double2 __ldg(const double2 *ptr) {
+  typedef double d2 __attribute__((ext_vector_type(2)));
+  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+  double2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+// TODO: Implement these as intrinsics, so the backend can work its magic on
+// these.  Alternatively, we could implement these as plain C and try to get
+// llvm to recognize the relevant patterns.
+inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned ret;
+  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(ret)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return ret;
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+#endif // defined(__CLANG_CUDA_INTRINSICS_H__)
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_math_forward_declares.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_math_forward_declares.h
new file mode 100644
index 000000000..49c805151
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_math_forward_declares.h
@@ -0,0 +1,286 @@
+/*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// This file forward-declares of some math functions we (or the CUDA headers)
+// will define later.  We need to do this, and do it before cmath is included,
+// because the standard library may have constexpr math functions.  In the
+// absence of a prior __device__ decl, those constexpr functions may become
+// implicitly host+device.  host+device functions can't be overloaded, so that
+// would preclude the use of our own __device__ overloads for these functions.
+
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__                                                             \
+  static __inline__ __attribute__((always_inline)) __attribute__((device))
+
+__DEVICE__ double abs(double);
+__DEVICE__ float abs(float);
+__DEVICE__ int abs(int);
+__DEVICE__ long abs(long);
+__DEVICE__ long long abs(long long);
+__DEVICE__ double acos(double);
+__DEVICE__ float acos(float);
+__DEVICE__ double acosh(double);
+__DEVICE__ float acosh(float);
+__DEVICE__ double asin(double);
+__DEVICE__ float asin(float);
+__DEVICE__ double asinh(double);
+__DEVICE__ float asinh(float);
+__DEVICE__ double atan2(double, double);
+__DEVICE__ float atan2(float, float);
+__DEVICE__ double atan(double);
+__DEVICE__ float atan(float);
+__DEVICE__ double atanh(double);
+__DEVICE__ float atanh(float);
+__DEVICE__ double cbrt(double);
+__DEVICE__ float cbrt(float);
+__DEVICE__ double ceil(double);
+__DEVICE__ float ceil(float);
+__DEVICE__ double copysign(double, double);
+__DEVICE__ float copysign(float, float);
+__DEVICE__ double cos(double);
+__DEVICE__ float cos(float);
+__DEVICE__ double cosh(double);
+__DEVICE__ float cosh(float);
+__DEVICE__ double erfc(double);
+__DEVICE__ float erfc(float);
+__DEVICE__ double erf(double);
+__DEVICE__ float erf(float);
+__DEVICE__ double exp2(double);
+__DEVICE__ float exp2(float);
+__DEVICE__ double exp(double);
+__DEVICE__ float exp(float);
+__DEVICE__ double expm1(double);
+__DEVICE__ float expm1(float);
+__DEVICE__ double fabs(double);
+__DEVICE__ float fabs(float);
+__DEVICE__ double fdim(double, double);
+__DEVICE__ float fdim(float, float);
+__DEVICE__ double floor(double);
+__DEVICE__ float floor(float);
+__DEVICE__ double fma(double, double, double);
+__DEVICE__ float fma(float, float, float);
+__DEVICE__ double fmax(double, double);
+__DEVICE__ float fmax(float, float);
+__DEVICE__ double fmin(double, double);
+__DEVICE__ float fmin(float, float);
+__DEVICE__ double fmod(double, double);
+__DEVICE__ float fmod(float, float);
+__DEVICE__ int fpclassify(double);
+__DEVICE__ int fpclassify(float);
+__DEVICE__ double frexp(double, int *);
+__DEVICE__ float frexp(float, int *);
+__DEVICE__ double hypot(double, double);
+__DEVICE__ float hypot(float, float);
+__DEVICE__ int ilogb(double);
+__DEVICE__ int ilogb(float);
+__DEVICE__ bool isfinite(double);
+__DEVICE__ bool isfinite(float);
+__DEVICE__ bool isgreater(double, double);
+__DEVICE__ bool isgreaterequal(double, double);
+__DEVICE__ bool isgreaterequal(float, float);
+__DEVICE__ bool isgreater(float, float);
+__DEVICE__ bool isinf(double);
+__DEVICE__ bool isinf(float);
+__DEVICE__ bool isless(double, double);
+__DEVICE__ bool islessequal(double, double);
+__DEVICE__ bool islessequal(float, float);
+__DEVICE__ bool isless(float, float);
+__DEVICE__ bool islessgreater(double, double);
+__DEVICE__ bool islessgreater(float, float);
+__DEVICE__ bool isnan(double);
+__DEVICE__ bool isnan(float);
+__DEVICE__ bool isnormal(double);
+__DEVICE__ bool isnormal(float);
+__DEVICE__ bool isunordered(double, double);
+__DEVICE__ bool isunordered(float, float);
+__DEVICE__ long labs(long);
+__DEVICE__ double ldexp(double, int);
+__DEVICE__ float ldexp(float, int);
+__DEVICE__ double lgamma(double);
+__DEVICE__ float lgamma(float);
+__DEVICE__ long long llabs(long long);
+__DEVICE__ long long llrint(double);
+__DEVICE__ long long llrint(float);
+__DEVICE__ double log10(double);
+__DEVICE__ float log10(float);
+__DEVICE__ double log1p(double);
+__DEVICE__ float log1p(float);
+__DEVICE__ double log2(double);
+__DEVICE__ float log2(float);
+__DEVICE__ double logb(double);
+__DEVICE__ float logb(float);
+__DEVICE__ double log(double);
+__DEVICE__ float log(float);
+__DEVICE__ long lrint(double);
+__DEVICE__ long lrint(float);
+__DEVICE__ long lround(double);
+__DEVICE__ long lround(float);
+__DEVICE__ long long llround(float); // No llround(double).
+__DEVICE__ double modf(double, double *);
+__DEVICE__ float modf(float, float *);
+__DEVICE__ double nan(const char *);
+__DEVICE__ float nanf(const char *);
+__DEVICE__ double nearbyint(double);
+__DEVICE__ float nearbyint(float);
+__DEVICE__ double nextafter(double, double);
+__DEVICE__ float nextafter(float, float);
+__DEVICE__ double nexttoward(double, double);
+__DEVICE__ float nexttoward(float, double);
+__DEVICE__ float nexttowardf(float, double);
+__DEVICE__ double pow(double, double);
+__DEVICE__ double pow(double, int);
+__DEVICE__ float pow(float, float);
+__DEVICE__ float pow(float, int);
+__DEVICE__ double remainder(double, double);
+__DEVICE__ float remainder(float, float);
+__DEVICE__ double remquo(double, double, int *);
+__DEVICE__ float remquo(float, float, int *);
+__DEVICE__ double rint(double);
+__DEVICE__ float rint(float);
+__DEVICE__ double round(double);
+__DEVICE__ float round(float);
+__DEVICE__ double scalbln(double, long);
+__DEVICE__ float scalbln(float, long);
+__DEVICE__ double scalbn(double, int);
+__DEVICE__ float scalbn(float, int);
+__DEVICE__ bool signbit(double);
+__DEVICE__ bool signbit(float);
+__DEVICE__ double sin(double);
+__DEVICE__ float sin(float);
+__DEVICE__ double sinh(double);
+__DEVICE__ float sinh(float);
+__DEVICE__ double sqrt(double);
+__DEVICE__ float sqrt(float);
+__DEVICE__ double tan(double);
+__DEVICE__ float tan(float);
+__DEVICE__ double tanh(double);
+__DEVICE__ float tanh(float);
+__DEVICE__ double tgamma(double);
+__DEVICE__ float tgamma(float);
+__DEVICE__ double trunc(double);
+__DEVICE__ float trunc(float);
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+using ::abs;
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isinf;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnan;
+using ::isnormal;
+using ::isunordered;
+using ::labs;
+using ::ldexp;
+using ::lgamma;
+using ::llabs;
+using ::llrint;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::llround;
+using ::modf;
+using ::nan;
+using ::nanf;
+using ::nearbyint;
+using ::nextafter;
+using ::nexttoward;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#pragma pop_macro("__DEVICE__")
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_runtime_wrapper.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_runtime_wrapper.h
new file mode 100644
index 000000000..931d44b69
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__clang_cuda_runtime_wrapper.h
@@ -0,0 +1,347 @@
+/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * WARNING: This header is intended to be directly -include'd by
+ * the compiler and is not supposed to be included by users.
+ *
+ * CUDA headers are implemented in a way that currently makes it
+ * impossible for user code to #include directly when compiling with
+ * Clang. They present different view of CUDA-supplied functions
+ * depending on where in NVCC's compilation pipeline the headers are
+ * included. Neither of these modes provides function definitions with
+ * correct attributes, so we use preprocessor to force the headers
+ * into a form that Clang can use.
+ *
+ * Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
+ * this file during every CUDA compilation.
+ */
+
+#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
+#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
+
+#if defined(__CUDA__) && defined(__clang__)
+
+// Include some forward declares that must come before cmath.
+#include <__clang_cuda_math_forward_declares.h>
+
+// Include some standard headers to avoid CUDA headers including them
+// while some required macros (like __THROW) are in a weird state.
+#include <cmath>
+#include <cstdlib>
+#include <stdlib.h>
+
+// Preserve common macros that will be changed below by us or by CUDA
+// headers.
+#pragma push_macro("__THROW")
+#pragma push_macro("__CUDA_ARCH__")
+
+// WARNING: Preprocessor hacks below are based on specific details of
+// CUDA-7.x headers and are not expected to work with any other
+// version of CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000
+#error "Unsupported CUDA version!"
+#endif
+
+// Make largest subset of device functions available during host
+// compilation -- SM_35 for the time being.
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 350
+#endif
+
+#include "__clang_cuda_builtin_vars.h"
+
+// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
+// has taken care of builtin variables declared in the file.
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+// {math,device}_functions.h only have declarations of the
+// functions. We don't need them as we're going to pull in their
+// definitions from .hpp files.
+#define __DEVICE_FUNCTIONS_H__
+#define __MATH_FUNCTIONS_H__
+#define __COMMON_FUNCTIONS_H__
+
+#undef __CUDACC__
+#define __CUDABE__
+// Disables definitions of device-side runtime support stubs in
+// cuda_device_runtime_api.h
+#include "driver_types.h"
+#include "host_config.h"
+#include "host_defines.h"
+
+#undef __CUDABE__
+#define __CUDACC__
+#include "cuda_runtime.h"
+
+#undef __CUDACC__
+#define __CUDABE__
+
+// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
+// not have at the moment. Emulate them with a builtin memcpy/memset.
+#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
+#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
+
+#include "crt/device_runtime.h"
+#include "crt/host_runtime.h"
+// device_runtime.h defines __cxa_* macros that will conflict with
+// cxxabi.h.
+// FIXME: redefine these as __device__ functions.
+#undef __cxa_vec_ctor
+#undef __cxa_vec_cctor
+#undef __cxa_vec_dtor
+#undef __cxa_vec_new
+#undef __cxa_vec_new2
+#undef __cxa_vec_new3
+#undef __cxa_vec_delete2
+#undef __cxa_vec_delete
+#undef __cxa_vec_delete3
+#undef __cxa_pure_virtual
+
+// math_functions.hpp expects this host function be defined on MacOS, but it
+// ends up not being there because of the games we play here.  Just define it
+// ourselves; it's simple enough.
+#ifdef __APPLE__
+inline __host__ double __signbitd(double x) {
+  return std::signbit(x);
+}
+#endif
+
+// We need decls for functions in CUDA's libdevice with __device__
+// attribute only. Alas they come either as __host__ __device__ or
+// with no attributes at all. To work around that, define __CUDA_RTC__
+// which produces HD variant and undef __host__ which gives us desided
+// decls with __device__ attribute.
+#pragma push_macro("__host__")
+#define __host__
+#define __CUDACC_RTC__
+#include "device_functions_decls.h"
+#undef __CUDACC_RTC__
+
+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
+// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
+// Previous versions used to check whether they are defined or not.
+// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
+// here to detect the switch.
+
+#if defined(CU_DEVICE_INVALID)
+#if !defined(__USE_FAST_MATH__)
+#define __USE_FAST_MATH__ 0
+#endif
+
+#if !defined(__CUDA_PREC_DIV)
+#define __CUDA_PREC_DIV 0
+#endif
+#endif
+
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+#include "device_functions.hpp"
+
+// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
+// get the slow-but-accurate or fast-but-inaccurate versions of functions like
+// sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
+//
+// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
+// slow divides), so we need to scope our define carefully here.
+#pragma push_macro("__USE_FAST_MATH__")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __USE_FAST_MATH__ 1
+#endif
+#include "math_functions.hpp"
+#pragma pop_macro("__USE_FAST_MATH__")
+
+#include "math_functions_dbl_ptx3.hpp"
+#pragma pop_macro("__forceinline__")
+
+// Pull in host-only functions that are only available when neither
+// __CUDACC__ nor __CUDABE__ are defined.
+#undef __MATH_FUNCTIONS_HPP__
+#undef __CUDABE__
+#include "math_functions.hpp"
+// Alas, additional overloads for these functions are hard to get to.
+// Considering that we only need these overloads for a few functions,
+// we can provide them here.
+static inline float rsqrt(float __a) { return rsqrtf(__a); }
+static inline float rcbrt(float __a) { return rcbrtf(__a); }
+static inline float sinpi(float __a) { return sinpif(__a); }
+static inline float cospi(float __a) { return cospif(__a); }
+static inline void sincospi(float __a, float *__b, float *__c) {
+  return sincospif(__a, __b, __c);
+}
+static inline float erfcinv(float __a) { return erfcinvf(__a); }
+static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
+static inline float normcdf(float __a) { return normcdff(__a); }
+static inline float erfcx(float __a) { return erfcxf(__a); }
+
+// For some reason single-argument variant is not always declared by
+// CUDA headers. Alas, device_functions.hpp included below needs it.
+static inline __device__ void __brkpt(int __c) { __brkpt(); }
+
+// Now include *.hpp with definitions of various GPU functions.  Alas,
+// a lot of thins get declared/defined with __host__ attribute which
+// we don't want and we have to define it out. We also have to include
+// {device,math}_functions.hpp again in order to extract the other
+// branch of #if/else inside.
+
+#define __host__
+#undef __CUDABE__
+#define __CUDACC__
+#undef __DEVICE_FUNCTIONS_HPP__
+#include "device_atomic_functions.hpp"
+#include "device_functions.hpp"
+#include "sm_20_atomic_functions.hpp"
+#include "sm_20_intrinsics.hpp"
+#include "sm_32_atomic_functions.hpp"
+
+// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h.  These define the
+// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
+// define them using builtins so that the optimizer can reason about and across
+// these instructions.  In particular, using intrinsics for ldg gets us the
+// [addr+imm] addressing mode, which, although it doesn't actually exist in the
+// hardware, seems to generate faster machine code because ptxas can more easily
+// reason about our code.
+
+#if CUDA_VERSION >= 8000
+#include "sm_60_atomic_functions.hpp"
+#include "sm_61_intrinsics.hpp"
+#endif
+
+#undef __MATH_FUNCTIONS_HPP__
+
+// math_functions.hpp defines ::signbit as a __host__ __device__ function.  This
+// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
+// math_function.hpp's ::signbit.  It's guarded by #undef signbit, but that's
+// conditional on __GNUC__.  :)
+#pragma push_macro("signbit")
+#pragma push_macro("__GNUC__")
+#undef __GNUC__
+#define signbit __ignored_cuda_signbit
+#include "math_functions.hpp"
+#pragma pop_macro("__GNUC__")
+#pragma pop_macro("signbit")
+
+#pragma pop_macro("__host__")
+
+#include "texture_indirect_functions.h"
+
+// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
+#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("__THROW")
+
+// Set up compiler macros expected to be seen during compilation.
+#undef __CUDABE__
+#define __CUDACC__
+
+extern "C" {
+// Device-side CUDA system calls.
+// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
+// We need these declarations and wrappers for device-side
+// malloc/free/printf calls to work without relying on
+// -fcuda-disable-target-call-checks option.
+__device__ int vprintf(const char *, const char *);
+__device__ void free(void *) __attribute((nothrow));
+__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
+__device__ void __assertfail(const char *__message, const char *__file,
+                             unsigned __line, const char *__function,
+                             size_t __charSize) __attribute__((noreturn));
+
+// In order for standard assert() macro on linux to work we need to
+// provide device-side __assert_fail()
+__device__ static inline void __assert_fail(const char *__message,
+                                            const char *__file, unsigned __line,
+                                            const char *__function) {
+  __assertfail(__message, __file, __line, __function, sizeof(char));
+}
+
+// Clang will convert printf into vprintf, but we still need
+// device-side declaration for it.
+__device__ int printf(const char *, ...);
+} // extern "C"
+
+// We also need device-side std::malloc and std::free.
+namespace std {
+__device__ static inline void free(void *__ptr) { ::free(__ptr); }
+__device__ static inline void *malloc(size_t __size) {
+  return ::malloc(__size);
+}
+} // namespace std
+
+// Out-of-line implementations from __clang_cuda_builtin_vars.h.  These need to
+// come after we've pulled in the definition of uint3 and dim3.
+
+__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
+  uint3 ret;
+  ret.x = x;
+  ret.y = y;
+  ret.z = z;
+  return ret;
+}
+
+__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
+  uint3 ret;
+  ret.x = x;
+  ret.y = y;
+  ret.z = z;
+  return ret;
+}
+
+__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+#include <__clang_cuda_cmath.h>
+#include <__clang_cuda_intrinsics.h>
+#include <__clang_cuda_complex_builtins.h>
+
+// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
+// mode, giving them their "proper" types of dim3 and uint3.  This is
+// incompatible with the types we give in __clang_cuda_builtin_vars.h.  As as
+// hack, force-include the header (nvcc doesn't include it by default) but
+// redefine dim3 and uint3 to our builtin types.  (Thankfully dim3 and uint3 are
+// only used here for the redeclarations of blockDim and threadIdx.)
+#pragma push_macro("dim3")
+#pragma push_macro("uint3")
+#define dim3 __cuda_builtin_blockDim_t
+#define uint3 __cuda_builtin_threadIdx_t
+#include "curand_mtgp32_kernel.h"
+#pragma pop_macro("dim3")
+#pragma pop_macro("uint3")
+#pragma pop_macro("__USE_FAST_MATH__")
+
+#endif // __CUDA__
+#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__stddef_max_align_t.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__stddef_max_align_t.h
new file mode 100644
index 000000000..1e10ca986
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__stddef_max_align_t.h
@@ -0,0 +1,43 @@
+/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
+ *
+ * Copyright (c) 2014 Chandler Carruth
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_MAX_ALIGN_T_DEFINED
+#define __CLANG_MAX_ALIGN_T_DEFINED
+
+#if defined(_MSC_VER)
+typedef double max_align_t;
+#elif defined(__APPLE__)
+typedef long double max_align_t;
+#else
+// Define 'max_align_t' to match the GCC definition.
+typedef struct {
+  long long __clang_max_align_nonce1
+      __attribute__((__aligned__(__alignof__(long long))));
+  long double __clang_max_align_nonce2
+      __attribute__((__aligned__(__alignof__(long double))));
+} max_align_t;
+#endif
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__wmmintrin_aes.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__wmmintrin_aes.h
new file mode 100644
index 000000000..3a2ee1b2e
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__wmmintrin_aes.h
@@ -0,0 +1,151 @@
+/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_AES_H
+#define _WMMINTRIN_AES_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
+
+/// \brief Performs a single round of AES encryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenc_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs the final round of AES encryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs a single round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdec_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs the final round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdeclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Applies the AES InvMixColumns() transformation to an expanded key
+///    contained in the source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the expanded key.
+/// \returns A 128-bit integer vector containing the transformed value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesimc_si128(__m128i __V)
+{
+  return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
+}
+
+/// \brief Generates a round key for AES encyption, operating on 128-bit data
+///    specified in the first source operand and using an 8-bit round constant
+///    specified by the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
+///
+/// \param C
+///    A 128-bit integer vector that is used to generate the AES encryption key.
+/// \param R
+///    An 8-bit round constant used to generate the AES encryption key.
+/// \returns A 128-bit round key for AES encryption.
+#define _mm_aeskeygenassist_si128(C, R) \
+  (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif  /* _WMMINTRIN_AES_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__wmmintrin_pclmul.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__wmmintrin_pclmul.h
new file mode 100644
index 000000000..e9c6a9f6d
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/__wmmintrin_pclmul.h
@@ -0,0 +1,57 @@
+/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_PCLMUL_H
+#define _WMMINTRIN_PCLMUL_H
+
+/// \brief Multiplies two 64-bit integer values, which are selected from source
+///    operands using the immediate-value operand. The multiplication is a
+///    carry-less multiplication, and the 128-bit integer product is stored in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
+///
+/// \param __X
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __Y
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __I
+///    An immediate value specifying which 64-bit values to select from the
+///    operands. Bit 0 is used to select a value from operand \a __X, and bit
+///    4 is used to select a value from operand \a __Y: \n
+///    Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
+///    Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
+///    Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
+///    Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
+/// \returns The 128-bit integer vector containing the result of the carry-less
+///    multiplication of the selected 64-bit values.
+#define _mm_clmulepi64_si128(__X, __Y, __I) \
+  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
+                                        (__v2di)(__m128i)(__Y), (char)(__I)))
+
+#endif /* _WMMINTRIN_PCLMUL_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/adxintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/adxintrin.h
new file mode 100644
index 000000000..ee3472841
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/adxintrin.h
@@ -0,0 +1,86 @@
+/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __ADXINTRIN_H
+#define __ADXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Intrinsics that are available only if __ADX__ defined */
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+               unsigned int *__p)
+{
+  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+#endif
+
+/* Intrinsics that are also available if __ADX__ undefined */
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_addcarry_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+              unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarry_u64(__cf, __x, __y, __p);
+}
+#endif
+
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADXINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/altivec.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/altivec.h
new file mode 100644
index 000000000..a01d9d837
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/altivec.h
@@ -0,0 +1,16733 @@
+/*===---- altivec.h - Standard header for type generic math ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __ALTIVEC_H
+#define __ALTIVEC_H
+
+#ifndef __ALTIVEC__
+#error "AltiVec support not enabled"
+#endif
+
+/* Constants for mapping CR6 bits to predicate result. */
+
+#define __CR6_EQ 0
+#define __CR6_EQ_REV 1
+#define __CR6_LT 2
+#define __CR6_LT_REV 3
+
+/* Constants for vec_test_data_class */
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
+#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \
+                                  __VEC_CLASS_FP_SUBNORMAL_N)
+#define __VEC_CLASS_FP_ZERO_N (1<<2)
+#define __VEC_CLASS_FP_ZERO_P (1<<3)
+#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P           | \
+                             __VEC_CLASS_FP_ZERO_N)
+#define __VEC_CLASS_FP_INFINITY_N (1<<4)
+#define __VEC_CLASS_FP_INFINITY_P (1<<5)
+#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P   | \
+                                 __VEC_CLASS_FP_INFINITY_N)
+#define __VEC_CLASS_FP_NAN (1<<6)
+#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN        | \
+                                   __VEC_CLASS_FP_SUBNORMAL  | \
+                                   __VEC_CLASS_FP_ZERO       | \
+                                   __VEC_CLASS_FP_INFINITY)
+
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+#ifdef __POWER9_VECTOR__
+#include <stddef.h>
+#endif
+
+static __inline__ vector signed char __ATTRS_o_ai vec_perm(
+    vector signed char __a, vector signed char __b, vector unsigned char __c);
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c);
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c);
+
+static __inline__ vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                                     vector signed short __b,
+                                                     vector unsigned char __c);
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c);
+
+static __inline__ vector bool short __ATTRS_o_ai vec_perm(
+    vector bool short __a, vector bool short __b, vector unsigned char __c);
+
+static __inline__ vector pixel __ATTRS_o_ai vec_perm(vector pixel __a,
+                                                     vector pixel __b,
+                                                     vector unsigned char __c);
+
+static __inline__ vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                                   vector signed int __b,
+                                                   vector unsigned char __c);
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_perm(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned char __c);
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c);
+
+static __inline__ vector float __ATTRS_o_ai vec_perm(vector float __a,
+                                                     vector float __b,
+                                                     vector unsigned char __c);
+
+#ifdef __VSX__
+static __inline__ vector long long __ATTRS_o_ai
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c);
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c);
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c);
+
+static __inline__ vector double __ATTRS_o_ai vec_perm(vector double __a,
+                                                      vector double __b,
+                                                      vector unsigned char __c);
+#endif
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b);
+
+/* vec_abs */
+
+#define __builtin_altivec_abs_v16qi vec_abs
+#define __builtin_altivec_abs_v8hi vec_abs
+#define __builtin_altivec_abs_v4si vec_abs
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_abs(vector signed char __a) {
+  return __builtin_altivec_vmaxsb(__a, -__a);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_abs(vector signed short __a) {
+  return __builtin_altivec_vmaxsh(__a, -__a);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_abs(vector signed int __a) {
+  return __builtin_altivec_vmaxsw(__a, -__a);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_abs(vector signed long long __a) {
+  return __builtin_altivec_vmaxsd(__a, -__a);
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_abs(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvabssp(__a);
+#else
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)(0x7FFFFFFF);
+  return (vector float)__res;
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_abs(vector double __a) {
+  return __builtin_vsx_xvabsdp(__a);
+}
+#endif
+
+/* vec_abss */
+#define __builtin_altivec_abss_v16qi vec_abss
+#define __builtin_altivec_abss_v8hi vec_abss
+#define __builtin_altivec_abss_v4si vec_abss
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_abss(vector signed char __a) {
+  return __builtin_altivec_vmaxsb(
+      __a, __builtin_altivec_vsubsbs((vector signed char)(0), __a));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_abss(vector signed short __a) {
+  return __builtin_altivec_vmaxsh(
+      __a, __builtin_altivec_vsubshs((vector signed short)(0), __a));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_abss(vector signed int __a) {
+  return __builtin_altivec_vmaxsw(
+      __a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
+}
+
+/* vec_absd */
+#if defined(__POWER9_VECTOR__)
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_absd(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vabsdub(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_absd(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vabsduh(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_absd(vector unsigned int __a,  vector unsigned int __b) {
+  return __builtin_altivec_vabsduw(__a, __b);
+}
+
+#endif /* End __POWER9_VECTOR__ */
+
+/* vec_add */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector signed char __b) {
+  return __a + __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_add(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a + __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector bool char __b) {
+  return __a + (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector unsigned char __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_add(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a + __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector bool char __b) {
+  return __a + (vector unsigned char)__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_add(vector short __a,
+                                                    vector short __b) {
+  return __a + __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_add(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a + __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_add(vector short __a,
+                                                    vector bool short __b) {
+  return __a + (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector unsigned short __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_add(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a + __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector bool short __b) {
+  return __a + (vector unsigned short)__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_add(vector int __a,
+                                                  vector int __b) {
+  return __a + __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_add(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a + __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_add(vector int __a,
+                                                  vector bool int __b) {
+  return __a + (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector unsigned int __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_add(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a + __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector bool int __b) {
+  return __a + (vector unsigned int)__b;
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_add(vector signed long long __a, vector signed long long __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_add(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a + __b;
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_add(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_add(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a + __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+static __inline__ vector float __ATTRS_o_ai vec_add(vector float __a,
+                                                    vector float __b) {
+  return __a + __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_add(vector double __a,
+                                                     vector double __b) {
+  return __a + __b;
+}
+#endif // __VSX__
+
+/* vec_adde */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_adde(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_adde(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+#endif
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_adde(vector signed int __a, vector signed int __b,
+         vector signed int __c) {
+  vector signed int __mask = {1, 1, 1, 1};
+  vector signed int __carry = __c & __mask;
+  return vec_add(vec_add(__a, __b), __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adde(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  vector unsigned int __mask = {1, 1, 1, 1};
+  vector unsigned int __carry = __c & __mask;
+  return vec_add(vec_add(__a, __b), __carry);
+}
+
+/* vec_addec */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_addec(vector signed __int128 __a, vector signed __int128 __b,
+          vector signed __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_addec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+          vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_addec(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+
+  signed int __result[4];
+  for (int i = 0; i < 4; i++) {
+    unsigned int __tempa = (unsigned int) __a[i];
+    unsigned int __tempb = (unsigned int) __b[i];
+    unsigned int __tempc = (unsigned int) __c[i];
+    __tempc = __tempc & 0x00000001;
+    unsigned long long __longa = (unsigned long long) __tempa;
+    unsigned long long __longb = (unsigned long long) __tempb;
+    unsigned long long __longc = (unsigned long long) __tempc;
+    unsigned long long __sum = __longa + __longb + __longc;
+    unsigned long long __res = (__sum >> 32) & 0x01;
+    unsigned long long __tempres = (unsigned int) __res;
+    __result[i] = (signed int) __tempres;
+  }
+
+  vector signed int ret = { __result[0], __result[1], __result[2], __result[3] };
+  return ret;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_addec(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+
+  unsigned int __result[4];
+  for (int i = 0; i < 4; i++) {
+    unsigned int __tempc = __c[i] & 1;
+    unsigned long long __longa = (unsigned long long) __a[i];
+    unsigned long long __longb = (unsigned long long) __b[i];
+    unsigned long long __longc = (unsigned long long) __tempc;
+    unsigned long long __sum = __longa + __longb + __longc;
+    unsigned long long __res = (__sum >> 32) & 0x01;
+    unsigned long long __tempres = (unsigned int) __res;
+    __result[i] = (signed int) __tempres;
+  }
+
+  vector unsigned int ret = { __result[0], __result[1], __result[2], __result[3] };
+  return ret;
+}
+
+#endif
+
+/* vec_vaddubm */
+
+#define __builtin_altivec_vaddubm vec_vaddubm
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector signed char __b) {
+  return __a + __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a + __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector bool char __b) {
+  return __a + (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector unsigned char __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a + __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector bool char __b) {
+  return __a + (vector unsigned char)__b;
+}
+
+/* vec_vadduhm */
+
+#define __builtin_altivec_vadduhm vec_vadduhm
+
+static __inline__ vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                                        vector short __b) {
+  return __a + __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vadduhm(vector bool short __a,
+                                                        vector short __b) {
+  return (vector short)__a + __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                                        vector bool short __b) {
+  return __a + (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector unsigned short __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a + __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector bool short __b) {
+  return __a + (vector unsigned short)__b;
+}
+
+/* vec_vadduwm */
+
+#define __builtin_altivec_vadduwm vec_vadduwm
+
+static __inline__ vector int __ATTRS_o_ai vec_vadduwm(vector int __a,
+                                                      vector int __b) {
+  return __a + __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vadduwm(vector bool int __a,
+                                                      vector int __b) {
+  return (vector int)__a + __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vadduwm(vector int __a,
+                                                      vector bool int __b) {
+  return __a + (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector unsigned int __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a + __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector bool int __b) {
+  return __a + (vector unsigned int)__b;
+}
+
+/* vec_vaddfp */
+
+#define __builtin_altivec_vaddfp vec_vaddfp
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vaddfp(vector float __a, vector float __b) {
+  return __a + __b;
+}
+
+/* vec_addc */
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_addc(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_altivec_vaddcuw((vector unsigned int)__a,
+                                                      (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_addc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_addc(vector signed __int128 __a, vector signed __int128 __b) {
+  return (vector signed __int128)__builtin_altivec_vaddcuq(
+      (vector unsigned __int128)__a, (vector unsigned __int128)__b);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_addc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_vaddcuw */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vaddcuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_adds */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_adds(vector short __a,
+                                                     vector short __b) {
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_adds(vector bool short __a,
+                                                     vector short __b) {
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_adds(vector short __a,
+                                                     vector bool short __b) {
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_adds(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_adds(vector int __a,
+                                                   vector int __b) {
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_adds(vector bool int __a,
+                                                   vector int __b) {
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_adds(vector int __a,
+                                                   vector bool int __b) {
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adds(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vaddsbs */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vaddubs */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vaddshs */
+
+static __inline__ vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                                        vector short __b) {
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vaddshs(vector bool short __a,
+                                                        vector short __b) {
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                                        vector bool short __b) {
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+/* vec_vadduhs */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vaddsws */
+
+static __inline__ vector int __ATTRS_o_ai vec_vaddsws(vector int __a,
+                                                      vector int __b) {
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vaddsws(vector bool int __a,
+                                                      vector int __b) {
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vaddsws(vector int __a,
+                                                      vector bool int __b) {
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+/* vec_vadduws */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vadduqm */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vadduqm(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vadduqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a + __b;
+}
+
+/* vec_vaddeuqm */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+/* vec_vaddcuq */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vaddcuq(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vaddcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+/* vec_vaddecuq */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vaddecuq(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vaddecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_and */
+
+#define __builtin_altivec_vand vec_and
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector signed char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_and(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a & __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector bool char __b) {
+  return __a & (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector unsigned char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_and(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a & __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector bool char __b) {
+  return __a & (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_and(vector bool char __a,
+                                                        vector bool char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_and(vector short __a,
+                                                    vector short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_and(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_and(vector short __a,
+                                                    vector bool short __b) {
+  return __a & (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector unsigned short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_and(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a & __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector bool short __b) {
+  return __a & (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_and(vector bool short __a, vector bool short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_and(vector int __a,
+                                                  vector int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_and(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_and(vector int __a,
+                                                  vector bool int __b) {
+  return __a & (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector unsigned int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_and(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a & __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector bool int __b) {
+  return __a & (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_and(vector bool int __a,
+                                                       vector bool int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_and(vector float __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_and(vector bool int __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_and(vector float __a,
+                                                    vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_and(vector bool long long __a,
+                                                     vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_and(vector double __a, vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_and(vector double __a,
+                                                     vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_and(vector signed long long __a, vector signed long long __b) {
+  return __a & __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_and(vector signed long long __a, vector bool long long __b) {
+  return __a & (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_and(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_and(vector unsigned long long __a, vector bool long long __b) {
+  return __a & (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector bool long long __b) {
+  return __a & __b;
+}
+#endif
+
+/* vec_vand */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector signed char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a & __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector bool char __b) {
+  return __a & (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector unsigned char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a & __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector bool char __b) {
+  return __a & (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vand(vector bool char __a,
+                                                         vector bool char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vand(vector short __a,
+                                                     vector short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vand(vector bool short __a,
+                                                     vector short __b) {
+  return (vector short)__a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vand(vector short __a,
+                                                     vector bool short __b) {
+  return __a & (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector unsigned short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a & __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector bool short __b) {
+  return __a & (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector bool short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vand(vector int __a,
+                                                   vector int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                   vector int __b) {
+  return (vector int)__a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vand(vector int __a,
+                                                   vector bool int __b) {
+  return __a & (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector unsigned int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a & __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector bool int __b) {
+  return __a & (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                        vector bool int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vand(vector float __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vand(vector float __a,
+                                                     vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vand(vector signed long long __a, vector signed long long __b) {
+  return __a & __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vand(vector signed long long __a, vector bool long long __b) {
+  return __a & (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vand(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vand(vector unsigned long long __a, vector bool long long __b) {
+  return __a & (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector bool long long __b) {
+  return __a & __b;
+}
+#endif
+
+/* vec_andc */
+
+#define __builtin_altivec_vandc vec_andc
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a & ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector bool char __b) {
+  return __a & ~(vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a & ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector bool char __b) {
+  return __a & ~(vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_andc(vector bool char __a,
+                                                         vector bool char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_andc(vector short __a,
+                                                     vector short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_andc(vector bool short __a,
+                                                     vector short __b) {
+  return (vector short)__a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_andc(vector short __a,
+                                                     vector bool short __b) {
+  return __a & ~(vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a & ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector bool short __b) {
+  return __a & ~(vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_andc(vector int __a,
+                                                   vector int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                   vector int __b) {
+  return (vector int)__a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_andc(vector int __a,
+                                                   vector bool int __b) {
+  return __a & ~(vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a & ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector bool int __b) {
+  return __a & ~(vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                        vector bool int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_andc(vector float __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_andc(vector float __a,
+                                                     vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_andc(vector bool long long __a,
+                                                      vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_andc(vector double __a, vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_andc(vector double __a,
+                                                      vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_andc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_andc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~(vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_andc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~(vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+#endif
+
+/* vec_vandc */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a & ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector bool char __b) {
+  return __a & ~(vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a & ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector bool char __b) {
+  return __a & ~(vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vandc(vector short __a,
+                                                      vector short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vandc(vector bool short __a,
+                                                      vector short __b) {
+  return (vector short)__a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vandc(vector short __a,
+                                                      vector bool short __b) {
+  return __a & ~(vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a & ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector bool short __b) {
+  return __a & ~(vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vandc(vector int __a,
+                                                    vector int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                    vector int __b) {
+  return (vector int)__a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vandc(vector int __a,
+                                                    vector bool int __b) {
+  return __a & ~(vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a & ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector bool int __b) {
+  return __a & ~(vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                         vector bool int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vandc(vector float __a,
+                                                      vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                      vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vandc(vector float __a,
+                                                      vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vandc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vandc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~(vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~(vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+#endif
+
+/* vec_avg */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_avg(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_avg(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_avg(vector short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_avg(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_avg(vector int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_avg(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_vavgsb */
+
+static __inline__ vector signed char __attribute__((__always_inline__))
+vec_vavgsb(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+/* vec_vavgub */
+
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+vec_vavgub(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+/* vec_vavgsh */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_vavgsh(vector short __a, vector short __b) {
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+/* vec_vavguh */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_vavguh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+/* vec_vavgsw */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vavgsw(vector int __a, vector int __b) {
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+/* vec_vavguw */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vavguw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_ceil */
+
+static __inline__ vector float __ATTRS_o_ai vec_ceil(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspip(__a);
+#else
+  return __builtin_altivec_vrfip(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_ceil(vector double __a) {
+  return __builtin_vsx_xvrdpip(__a);
+}
+#endif
+
+/* vec_vrfip */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrfip(vector float __a) {
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_cmpb */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_cmpb(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_vcmpbfp */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vcmpbfp(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_cmpeq */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a,
+                                                           vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a,
+                                                         vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector bool int __a,
+                                                         vector bool int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(__a, __b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(
+      (vector long long)__a, (vector long long)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(
+      (vector long long)__a, (vector long long)__b);
+}
+
+#endif
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
+                                                         vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpeqsp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpeqfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpeqdp(__a, __b);
+}
+#endif
+
+#ifdef __POWER9_VECTOR__
+/* vec_cmpne */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector bool char __a, vector bool char __b) {
+  return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+                                                     (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+                                                     (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+                                                     (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector bool short __a, vector bool short __b) {
+  return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+                                                      (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector signed short __a, vector signed short __b) {
+  return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+                                                      (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+                                                      (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector bool int __a, vector bool int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector signed int __a, vector signed int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector double __a, vector double __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+/* vec_cmpnez */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector signed short __a, vector signed short __b) {
+  return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector signed int __a, vector signed int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vctzlsbb(__a);
+#else
+  return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vctzlsbb(__a);
+#else
+  return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclzlsbb(__a);
+#else
+  return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclzlsbb(__a);
+#else
+  return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned int __a) {
+  return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector signed int __a) {
+  return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned __int128 __a) {
+  return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector signed __int128 __a) {
+  return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned long long __a) {
+  return __builtin_altivec_vprtybd(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector signed long long __a) {
+  return __builtin_altivec_vprtybd(__a);
+}
+
+#endif
+
+/* vec_cmpgt */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpgt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_cmpgt(vector short __a,
+                                                           vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpgt(vector int __a,
+                                                         vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpgtsd(__a, __b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpgtud(__a, __b);
+}
+#endif
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpgt(vector float __a,
+                                                         vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgtsp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgtdp(__a, __b);
+}
+#endif
+
+/* vec_cmpge */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpge(vector signed char __a, vector signed char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpge(vector signed short __a, vector signed short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpge(vector signed int __a, vector signed int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpge(vector float __a,
+                                                         vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgesp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgedp(__a, __b);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+#endif
+
+/* vec_vcmpgefp */
+
+static __inline__ vector bool int __attribute__((__always_inline__))
+vec_vcmpgefp(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_vcmpgtsb */
+
+static __inline__ vector bool char __attribute__((__always_inline__))
+vec_vcmpgtsb(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+/* vec_vcmpgtub */
+
+static __inline__ vector bool char __attribute__((__always_inline__))
+vec_vcmpgtub(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+/* vec_vcmpgtsh */
+
+static __inline__ vector bool short __attribute__((__always_inline__))
+vec_vcmpgtsh(vector short __a, vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+/* vec_vcmpgtuh */
+
+static __inline__ vector bool short __attribute__((__always_inline__))
+vec_vcmpgtuh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+/* vec_vcmpgtsw */
+
+static __inline__ vector bool int __attribute__((__always_inline__))
+vec_vcmpgtsw(vector int __a, vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+/* vec_vcmpgtuw */
+
+static __inline__ vector bool int __attribute__((__always_inline__))
+vec_vcmpgtuw(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+/* vec_vcmpgtfp */
+
+static __inline__ vector bool int __attribute__((__always_inline__))
+vec_vcmpgtfp(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_cmple */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmple(vector signed char __a, vector signed char __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmple(vector signed short __a, vector signed short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmple(vector signed int __a, vector signed int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmple(vector float __a,
+                                                         vector float __b) {
+  return vec_cmpge(__b, __a);
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmple(vector double __a, vector double __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
+/* vec_cmplt */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmplt(vector signed char __a, vector signed char __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmplt(vector unsigned char __a, vector unsigned char __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_cmplt(vector short __a,
+                                                           vector short __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmplt(vector unsigned short __a, vector unsigned short __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmplt(vector int __a,
+                                                         vector int __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmplt(vector unsigned int __a, vector unsigned int __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmplt(vector float __a,
+                                                         vector float __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmplt(vector double __a, vector double __b) {
+  return vec_cmpgt(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+/* vec_popcnt */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_popcnt(vector signed char __a) {
+  return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_popcnt(vector unsigned char __a) {
+  return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_popcnt(vector signed short __a) {
+  return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_popcnt(vector unsigned short __a) {
+  return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_popcnt(vector signed int __a) {
+  return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_popcnt(vector unsigned int __a) {
+  return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_popcnt(vector signed long long __a) {
+  return __builtin_altivec_vpopcntd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_popcnt(vector unsigned long long __a) {
+  return __builtin_altivec_vpopcntd(__a);
+}
+
+/* vec_cntlz */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_cntlz(vector signed char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_cntlz(vector unsigned char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_cntlz(vector signed short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_cntlz(vector unsigned short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_cntlz(vector signed int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_cntlz(vector unsigned int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cntlz(vector signed long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+#endif
+
+#ifdef __POWER9_VECTOR__
+
+/* vec_cnttz */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_cnttz(vector signed char __a) {
+  return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_cnttz(vector unsigned char __a) {
+  return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_cnttz(vector signed short __a) {
+  return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_cnttz(vector unsigned short __a) {
+  return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_cnttz(vector signed int __a) {
+  return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_cnttz(vector unsigned int __a) {
+  return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cnttz(vector signed long long __a) {
+  return __builtin_altivec_vctzd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cnttz(vector unsigned long long __a) {
+  return __builtin_altivec_vctzd(__a);
+}
+
+/* vec_first_match_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed char __a, vector signed char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned char __a, vector unsigned char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed short __a, vector signed short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed int __a, vector signed int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+/* vec_first_match_or_eos_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed char __a, vector signed char __b) {
+  /* Compare the result of the comparison of two vectors with either and OR the
+     result. Either the elements are equal or one will equal the comparison
+     result if either is zero.
+  */
+  vector bool char __tmp1 = vec_cmpeq(__a, __b);
+  vector bool char __tmp2 = __tmp1 |
+                            vec_cmpeq((vector signed char)__tmp1, __a) |
+                            vec_cmpeq((vector signed char)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned char __a,
+                             vector unsigned char __b) {
+  vector bool char __tmp1 = vec_cmpeq(__a, __b);
+  vector bool char __tmp2 = __tmp1 |
+                            vec_cmpeq((vector unsigned char)__tmp1, __a) |
+                            vec_cmpeq((vector unsigned char)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed short __a, vector signed short __b) {
+  vector bool short __tmp1 = vec_cmpeq(__a, __b);
+  vector bool short __tmp2 = __tmp1 |
+                             vec_cmpeq((vector signed short)__tmp1, __a) |
+                             vec_cmpeq((vector signed short)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned short __a,
+                             vector unsigned short __b) {
+  vector bool short __tmp1 = vec_cmpeq(__a, __b);
+  vector bool short __tmp2 = __tmp1 |
+                             vec_cmpeq((vector unsigned short)__tmp1, __a) |
+                             vec_cmpeq((vector unsigned short)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed int __a, vector signed int __b) {
+  vector bool int __tmp1 = vec_cmpeq(__a, __b);
+  vector bool int __tmp2 = __tmp1 | vec_cmpeq((vector signed int)__tmp1, __a) |
+                           vec_cmpeq((vector signed int)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned int __a, vector unsigned int __b) {
+  vector bool int __tmp1 = vec_cmpeq(__a, __b);
+  vector bool int __tmp2 = __tmp1 |
+                           vec_cmpeq((vector unsigned int)__tmp1, __a) |
+                           vec_cmpeq((vector unsigned int)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)__tmp2);
+#else
+    vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed char __a, vector signed char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned char __a, vector unsigned char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed short __a, vector signed short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed int __a, vector signed int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_or_eos_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed char __a,
+                                vector signed char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned char __a,
+                                vector unsigned char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed short __a,
+                                vector signed short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned short __a,
+                                vector unsigned short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed int __a, vector signed int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned int __a,
+                                vector unsigned int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ vector double  __ATTRS_o_ai
+vec_insert_exp(vector double __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp((vector unsigned long long)__a,__b);
+}
+
+static __inline__ vector double  __ATTRS_o_ai
+vec_insert_exp(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp(__a,__b);
+}
+
+static __inline__ vector float  __ATTRS_o_ai
+vec_insert_exp(vector float __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp((vector unsigned int)__a,__b);
+}
+
+static __inline__ vector float  __ATTRS_o_ai
+vec_insert_exp(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp(__a,__b);
+}
+
+#if defined(__powerpc64__)
+static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a,
+                                                             size_t __b) {
+  return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_len(unsigned char *__a, size_t __b) {
+  return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a,
+                                                              size_t __b) {
+  return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_len(unsigned short *__a, size_t __b) {
+  return (vector unsigned short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a,
+                                                            size_t __b) {
+  return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a,
+                                                              size_t __b) {
+  return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) {
+  return (vector float)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_len(signed __int128 *__a, size_t __b) {
+  return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_len(unsigned __int128 *__a, size_t __b) {
+  return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_len(signed long long *__a, size_t __b) {
+  return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_len(unsigned long long *__a, size_t __b) {
+  return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a,
+                                                        size_t __b) {
+  return (vector double)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xl_len_r(unsigned char *__a,
+                                                          size_t __b) {
+  vector unsigned char __res =
+      (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __mask =
+      (vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL);
+  __res = (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__res, (vector int)__res, __mask);
+#endif
+  return __res;
+}
+
+// vec_xst_len
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned char __a,
+                                                unsigned char *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed char __a,
+                                                signed char *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed short __a,
+                                                signed short *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned short __a,
+                                                unsigned short *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed int __a,
+                                                signed int *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned int __a,
+                                                unsigned int *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector float __a, float *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed __int128 __a,
+                                                signed __int128 *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned __int128 __a,
+                                                unsigned __int128 *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed long long __a,
+                                                signed long long *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned long long __a,
+                                                unsigned long long *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector double __a, double *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a,
+                                                  unsigned char *__b,
+                                                  size_t __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __mask =
+      (vector unsigned char)__builtin_altivec_lvsl(16 - __c, (int *)NULL);
+  vector unsigned char __res =
+      __builtin_altivec_vperm_4si((vector int)__a, (vector int)__a, __mask);
+  return __builtin_vsx_stxvll((vector int)__res, __b, (__c << 56));
+#else
+  return __builtin_vsx_stxvll((vector int)__a, __b, (__c << 56));
+#endif
+}
+#endif
+#endif
+
+/* vec_cpsgn */
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai vec_cpsgn(vector float __a,
+                                                      vector float __b) {
+  return __builtin_vsx_xvcpsgnsp(__a, __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
+                                                       vector double __b) {
+  return __builtin_vsx_xvcpsgndp(__a, __b);
+}
+#endif
+
+/* vec_ctf */
+
+static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a,
+                                                    int __b) {
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai
+vec_ctf(vector unsigned long long __a, int __b) {
+  vector double __ret = __builtin_convertvector(__a, vector double);
+  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_ctf(vector signed long long __a, int __b) {
+  vector double __ret = __builtin_convertvector(__a, vector double);
+  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __ret;
+}
+#endif
+
+/* vec_vcfsx */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vcfsx(vector int __a, int __b) {
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+/* vec_vcfux */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vcfux(vector unsigned int __a, int __b) {
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_cts */
+
+static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cts(vector double __a, int __b) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector signed long long);
+}
+#endif
+
+/* vec_vctsxs */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vctsxs(vector float __a, int __b) {
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_ctu */
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a,
+                                                           int __b) {
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+#ifdef __VSX__
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_ctu(vector double __a, int __b) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+#endif
+
+/* vec_vctuxs */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vctuxs(vector float __a, int __b) {
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_signed */
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sld(vector signed int, vector signed int, unsigned const int __c);
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signed(vector float __a) {
+  return __builtin_convertvector(__a, vector signed int);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_signed(vector double __a) {
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_signed2(vector double __a, vector double __b) {
+  return (vector signed int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvdpsxws(__a);
+#endif
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvdpsxws(__a);
+#else
+  vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_unsigned */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sld(vector unsigned int, vector unsigned int, unsigned const int __c);
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsigned(vector float __a) {
+  return __builtin_convertvector(__a, vector unsigned int);
+}
+
+#ifdef __VSX__
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_unsigned(vector double __a) {
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_unsigned2(vector double __a, vector double __b) {
+  return (vector unsigned int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvdpuxws(__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvdpuxws(__a);
+#else
+  vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_float */
+
+static __inline__ vector float __ATTRS_o_ai
+vec_sld(vector float, vector float, unsigned const int __c);
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector signed int __a) {
+  return __builtin_convertvector(__a, vector float);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector unsigned int __a) {
+  return __builtin_convertvector(__a, vector float);
+}
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector signed long long __a, vector signed long long __b) {
+  return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector double __a, vector double __b) {
+  return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvsxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvuxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvdpsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvsxdsp(__a);
+#else
+  vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvuxdsp(__a);
+#else
+  vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvdpsp(__a);
+#else
+  vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_double */
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai
+vec_double(vector signed long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_double(vector unsigned long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#else
+  return __builtin_vsx_xvcvsxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#else
+  return __builtin_vsx_xvcvuxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#else
+  return __builtin_vsx_xvcvspdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector signed int __a) {
+  vector double __ret = {__a[0], __a[1]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector unsigned int __a) {
+  vector double __ret = {__a[0], __a[1]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector float __a) {
+  vector double __ret = {__a[0], __a[1]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector signed int __a) {
+  vector double __ret = {__a[2], __a[3]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector unsigned int __a) {
+  vector double __ret = {__a[2], __a[3]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector float __a) {
+  vector double __ret = {__a[2], __a[3]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvsxwdp(__a);
+#else
+  return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvuxwdp(__a);
+#else
+  return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvspdp(__a);
+#else
+  return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#endif
+}
+#endif
+
+/* vec_div */
+
+/* Integer vector divides (vectors are scalarized, elements divided
+   and the vectors reassembled).
+*/
+static __inline__ vector signed char __ATTRS_o_ai
+vec_div(vector signed char __a, vector signed char __b) {
+  return __a / __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_div(vector unsigned char __a, vector unsigned char __b) {
+  return __a / __b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_div(vector signed short __a, vector signed short __b) {
+  return __a / __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_div(vector unsigned short __a, vector unsigned short __b) {
+  return __a / __b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_div(vector signed int __a, vector signed int __b) {
+  return __a / __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_div(vector unsigned int __a, vector unsigned int __b) {
+  return __a / __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_div(vector signed long long __a, vector signed long long __b) {
+  return __a / __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_div(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a / __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_div(vector float __a,
+                                                    vector float __b) {
+  return __a / __b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_div(vector double __a,
+                                                     vector double __b) {
+  return __a / __b;
+}
+#endif
+
+/* vec_dss */
+
+static __inline__ void __attribute__((__always_inline__)) vec_dss(int __a) {
+  __builtin_altivec_dss(__a);
+}
+
+/* vec_dssall */
+
+static __inline__ void __attribute__((__always_inline__)) vec_dssall(void) {
+  __builtin_altivec_dssall();
+}
+
+/* vec_dst */
+#define vec_dst(__PTR, __CW, __STR) \
+  __extension__(                    \
+      { __builtin_altivec_dst((const void *)(__PTR), (__CW), (__STR)); })
+
+/* vec_dstst */
+#define vec_dstst(__PTR, __CW, __STR) \
+  __extension__(                      \
+      { __builtin_altivec_dstst((const void *)(__PTR), (__CW), (__STR)); })
+
+/* vec_dststt */
+#define vec_dststt(__PTR, __CW, __STR) \
+  __extension__(                       \
+      { __builtin_altivec_dststt((const void *)(__PTR), (__CW), (__STR)); })
+
+/* vec_dstt */
+#define vec_dstt(__PTR, __CW, __STR) \
+  __extension__(                     \
+      { __builtin_altivec_dstt((const void *)(__PTR), (__CW), (__STR)); })
+
+/* vec_eqv */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_eqv(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                  (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_eqv(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                    (vector unsigned int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_eqv(vector bool char __a,
+                                                        vector bool char __b) {
+  return (vector bool char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                (vector unsigned int)__b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_eqv(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                   (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_eqv(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                     (vector unsigned int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_eqv(vector bool short __a, vector bool short __b) {
+  return (vector bool short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_eqv(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_eqv(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_vsx_xxleqv(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_eqv(vector bool int __a,
+                                                       vector bool int __b) {
+  return (vector bool int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                               (vector unsigned int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_eqv(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_vsx_xxleqv(
+      (vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_eqv(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_vsx_xxleqv(
+      (vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_eqv(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                     (vector unsigned int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_eqv(vector float __a,
+                                                    vector float __b) {
+  return (vector float)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                            (vector unsigned int)__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_eqv(vector double __a,
+                                                     vector double __b) {
+  return (vector double)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                             (vector unsigned int)__b);
+}
+#endif
+
+/* vec_expte */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_expte(vector float __a) {
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_vexptefp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vexptefp(vector float __a) {
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_floor */
+
+static __inline__ vector float __ATTRS_o_ai vec_floor(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspim(__a);
+#else
+  return __builtin_altivec_vrfim(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_floor(vector double __a) {
+  return __builtin_vsx_xvrdpim(__a);
+}
+#endif
+
+/* vec_vrfim */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrfim(vector float __a) {
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_ld */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ld(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ld(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_ld(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_ld(int __a,
+                                                   const vector short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_ld(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_ld(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_ld(int __a,
+                                                   const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_ld(int __a,
+                                                 const vector int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_ld(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_ld(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ld(int __a,
+                                                   const vector float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ld(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lvx */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvx(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvx(int __a,
+                                                    const vector short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvx(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvx(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvx(int __a,
+                                                    const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvx(int __a,
+                                                  const vector int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvx(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvx(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvx(int __a,
+                                                    const vector float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvx(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lde */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lde(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lde(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lde(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lde(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lde(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lde(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lde(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_lvebx */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvebx(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvebx(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+/* vec_lvehx */
+
+static __inline__ vector short __ATTRS_o_ai vec_lvehx(int __a,
+                                                      const short *__b) {
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvehx(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+/* vec_lvewx */
+
+static __inline__ vector int __ATTRS_o_ai vec_lvewx(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvewx(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvewx(int __a,
+                                                      const float *__b) {
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_ldl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_ldl(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_ldl(int __a,
+                                                    const vector short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_ldl(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_ldl(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_ldl(int __a,
+                                                    const vector pixel *__b) {
+  return (vector pixel short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_ldl(int __a,
+                                                  const vector int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_ldl(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_ldl(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ldl(int __a,
+                                                    const vector float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ldl(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_lvxl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const vector short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvxl(int __a,
+                                                   const vector int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvxl(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const vector float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_loge */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_loge(vector float __a) {
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_vlogefp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vlogefp(vector float __a) {
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_lvsl */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const signed char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const signed char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                             const short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                             const int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const float *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                             const float *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+/* vec_lvsr */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const signed char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const signed char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                             const short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                             const int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const float *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                             const float *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+/* vec_madd */
+static __inline__ vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector signed short, vector signed short);
+static __inline__ vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector unsigned short, vector unsigned short);
+static __inline__ vector signed short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector signed short, vector signed short);
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector unsigned short, vector unsigned short);
+
+static __inline__ vector signed short __ATTRS_o_ai vec_madd(
+    vector signed short __a, vector signed short __b, vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_madd(vector signed short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector signed short __b,
+         vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_madd(vector float __a,
+                                                     vector float __b,
+                                                     vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaddasp(__a, __b, __c);
+#else
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_madd(vector double __a,
+                                                      vector double __b,
+                                                      vector double __c) {
+  return __builtin_vsx_xvmaddadp(__a, __b, __c);
+}
+#endif
+
+/* vec_vmaddfp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vmaddfp(vector float __a, vector float __b, vector float __c) {
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_madds */
+
+static __inline__ vector signed short __attribute__((__always_inline__))
+vec_madds(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_vmhaddshs */
+static __inline__ vector signed short __attribute__((__always_inline__))
+vec_vmhaddshs(vector signed short __a, vector signed short __b,
+              vector signed short __c) {
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_msub */
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai vec_msub(vector float __a,
+                                                     vector float __b,
+                                                     vector float __c) {
+  return __builtin_vsx_xvmsubasp(__a, __b, __c);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_msub(vector double __a,
+                                                      vector double __b,
+                                                      vector double __c) {
+  return __builtin_vsx_xvmsubadp(__a, __b, __c);
+}
+#endif
+
+/* vec_max */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_max(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_max(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_max(vector short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_max(vector bool short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_max(vector short __a,
+                                                    vector bool short __b) {
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_max(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_max(vector int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_max(vector bool int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_max(vector int __a,
+                                                  vector bool int __b) {
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_max(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_max(vector signed long long __a, vector signed long long __b) {
+  return __builtin_altivec_vmaxsd(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_max(vector bool long long __a, vector signed long long __b) {
+  return __builtin_altivec_vmaxsd((vector signed long long)__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_max(vector signed long long __a, vector bool long long __b) {
+  return __builtin_altivec_vmaxsd(__a, (vector signed long long)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vmaxud(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_max(vector bool long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vmaxud((vector unsigned long long)__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector bool long long __b) {
+  return __builtin_altivec_vmaxud(__a, (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_max(vector float __a,
+                                                    vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_max(vector double __a,
+                                                     vector double __b) {
+  return __builtin_vsx_xvmaxdp(__a, __b);
+}
+#endif
+
+/* vec_vmaxsb */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+/* vec_vmaxub */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vmaxsh */
+
+static __inline__ vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                                       vector short __b) {
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vmaxsh(vector bool short __a,
+                                                       vector short __b) {
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                                       vector bool short __b) {
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+/* vec_vmaxuh */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vmaxsw */
+
+static __inline__ vector int __ATTRS_o_ai vec_vmaxsw(vector int __a,
+                                                     vector int __b) {
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vmaxsw(vector bool int __a,
+                                                     vector int __b) {
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vmaxsw(vector int __a,
+                                                     vector bool int __b) {
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+/* vec_vmaxuw */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vmaxfp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vmaxfp(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+/* vec_mergeh */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_mergeh(vector signed char __a, vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_mergeh(vector unsigned char __a, vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_mergeh(vector bool char __a, vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_mergeh(vector short __a,
+                                                       vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_mergeh(vector bool short __a, vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_mergeh(vector pixel __a,
+                                                       vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_mergeh(vector int __a,
+                                                     vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergeh(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_mergeh(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_mergeh(vector float __a,
+                                                       vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_mergeh(vector double __a,
+                                                        vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeh(vector double __a, vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+#endif
+
+/* vec_vmrghb */
+
+#define __builtin_altivec_vmrghb vec_vmrghb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmrghb(vector signed char __a, vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmrghb(vector unsigned char __a, vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vmrghb(vector bool char __a, vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+/* vec_vmrghh */
+
+#define __builtin_altivec_vmrghh vec_vmrghh
+
+static __inline__ vector short __ATTRS_o_ai vec_vmrghh(vector short __a,
+                                                       vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmrghh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vmrghh(vector bool short __a, vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vmrghh(vector pixel __a,
+                                                       vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+/* vec_vmrghw */
+
+#define __builtin_altivec_vmrghw vec_vmrghw
+
+static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a,
+                                                     vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmrghw(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vmrghw(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vmrghw(vector float __a,
+                                                       vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_mergel */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_mergel(vector signed char __a, vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_mergel(vector unsigned char __a, vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_mergel(vector bool char __a, vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_mergel(vector short __a,
+                                                       vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mergel(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_mergel(vector bool short __a, vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_mergel(vector pixel __a,
+                                                       vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_mergel(vector int __a,
+                                                     vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergel(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_mergel(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_mergel(vector float __a,
+                                                       vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector double __ATTRS_o_ai vec_mergel(vector double __a,
+                                                        vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector double __ATTRS_o_ai
+vec_mergel(vector double __a, vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector double __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+#endif
+
+/* vec_vmrglb */
+
+#define __builtin_altivec_vmrglb vec_vmrglb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmrglb(vector signed char __a, vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmrglb(vector unsigned char __a, vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vmrglb(vector bool char __a, vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+/* vec_vmrglh */
+
+#define __builtin_altivec_vmrglh vec_vmrglh
+
+static __inline__ vector short __ATTRS_o_ai vec_vmrglh(vector short __a,
+                                                       vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmrglh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vmrglh(vector bool short __a, vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vmrglh(vector pixel __a,
+                                                       vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+/* vec_vmrglw */
+
+#define __builtin_altivec_vmrglw vec_vmrglw
+
+static __inline__ vector int __ATTRS_o_ai vec_vmrglw(vector int __a,
+                                                     vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmrglw(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vmrglw(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vmrglw(vector float __a,
+                                                       vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#ifdef __POWER8_VECTOR__
+/* vec_mergee */
+
+static __inline__ vector bool int __ATTRS_o_ai vec_mergee(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_mergee(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergee(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergee(vector bool long long __a, vector bool long long __b) {
+  return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergee(vector signed long long __a, vector signed long long __b) {
+  return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergee(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergee(vector float __a, vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergee(vector double __a, vector double __b) {
+  return vec_mergeh(__a, __b);
+}
+
+/* vec_mergeo */
+
+static __inline__ vector bool int __ATTRS_o_ai vec_mergeo(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_mergeo(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergeo(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergeo(vector bool long long __a, vector bool long long __b) {
+  return vec_mergel(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeo(vector signed long long __a, vector signed long long __b) {
+  return vec_mergel(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeo(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_mergel(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergeo(vector float __a, vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeo(vector double __a, vector double __b) {
+  return vec_mergel(__a, __b);
+}
+
+#endif
+
+/* vec_mfvscr */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_mfvscr(void) {
+  return __builtin_altivec_mfvscr();
+}
+
+/* vec_min */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_min(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_min(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_min(vector short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_min(vector bool short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_min(vector short __a,
+                                                    vector bool short __b) {
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_min(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_min(vector int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_min(vector bool int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_min(vector int __a,
+                                                  vector bool int __b) {
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_min(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_min(vector signed long long __a, vector signed long long __b) {
+  return __builtin_altivec_vminsd(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_min(vector bool long long __a, vector signed long long __b) {
+  return __builtin_altivec_vminsd((vector signed long long)__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_min(vector signed long long __a, vector bool long long __b) {
+  return __builtin_altivec_vminsd(__a, (vector signed long long)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vminud(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_min(vector bool long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vminud((vector unsigned long long)__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector bool long long __b) {
+  return __builtin_altivec_vminud(__a, (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_min(vector float __a,
+                                                    vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_min(vector double __a,
+                                                     vector double __b) {
+  return __builtin_vsx_xvmindp(__a, __b);
+}
+#endif
+
+/* vec_vminsb */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vminsb(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+/* vec_vminub */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vminub(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vminsh */
+
+static __inline__ vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                                       vector short __b) {
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vminsh(vector bool short __a,
+                                                       vector short __b) {
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                                       vector bool short __b) {
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+/* vec_vminuh */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vminsw */
+
+static __inline__ vector int __ATTRS_o_ai vec_vminsw(vector int __a,
+                                                     vector int __b) {
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vminsw(vector bool int __a,
+                                                     vector int __b) {
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vminsw(vector int __a,
+                                                     vector bool int __b) {
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+/* vec_vminuw */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vminfp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vminfp(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+/* vec_mladd */
+
+#define __builtin_altivec_vmladduhm vec_mladd
+
+static __inline__ vector short __ATTRS_o_ai vec_mladd(vector short __a,
+                                                      vector short __b,
+                                                      vector short __c) {
+  return __a * __b + __c;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_mladd(
+    vector short __a, vector unsigned short __b, vector unsigned short __c) {
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_mladd(vector unsigned short __a,
+                                                      vector short __b,
+                                                      vector short __c) {
+  return (vector short)__a * __b + __c;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+/* vec_vmladduhm */
+
+static __inline__ vector short __ATTRS_o_ai vec_vmladduhm(vector short __a,
+                                                          vector short __b,
+                                                          vector short __c) {
+  return __a * __b + __c;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vmladduhm(
+    vector short __a, vector unsigned short __b, vector unsigned short __c) {
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector short __b, vector short __c) {
+  return (vector short)__a * __b + __c;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+/* vec_mradds */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_mradds(vector short __a, vector short __b, vector short __c) {
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_vmhraddshs */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_vmhraddshs(vector short __a, vector short __b, vector short __c) {
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_msum */
+
+static __inline__ vector int __ATTRS_o_ai vec_msum(vector signed char __a,
+                                                   vector unsigned char __b,
+                                                   vector int __c) {
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned int __c) {
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_msum(vector short __a,
+                                                   vector short __b,
+                                                   vector int __c) {
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_vmsummbm */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmsummbm(vector signed char __a, vector unsigned char __b, vector int __c) {
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+/* vec_vmsumubm */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmsumubm(vector unsigned char __a, vector unsigned char __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+/* vec_vmsumshm */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmsumshm(vector short __a, vector short __b, vector int __c) {
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+/* vec_vmsumuhm */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhm(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_msums */
+
+static __inline__ vector int __ATTRS_o_ai vec_msums(vector short __a,
+                                                    vector short __b,
+                                                    vector int __c) {
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_msums(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_vmsumshs */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmsumshs(vector short __a, vector short __b, vector int __c) {
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+/* vec_vmsumuhs */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhs(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_mtvscr */
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector signed char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector unsigned char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector bool char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector unsigned short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector bool short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector pixel __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector unsigned int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector bool int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector float __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+/* vec_mul */
+
+/* Integer vector multiplication will involve multiplication of the odd/even
+   elements separately, then truncating the results and moving to the
+   result vector.
+*/
+static __inline__ vector signed char __ATTRS_o_ai
+vec_mul(vector signed char __a, vector signed char __b) {
+  return __a * __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_mul(vector unsigned char __a, vector unsigned char __b) {
+  return __a * __b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_mul(vector signed short __a, vector signed short __b) {
+  return __a * __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mul(vector unsigned short __a, vector unsigned short __b) {
+  return __a * __b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_mul(vector signed int __a, vector signed int __b) {
+  return __a * __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mul(vector unsigned int __a, vector unsigned int __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mul(vector signed long long __a, vector signed long long __b) {
+  return __a * __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mul(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a * __b;
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_mul(vector float __a,
+                                                    vector float __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_mul(vector double __a,
+                                                     vector double __b) {
+  return __a * __b;
+}
+#endif
+
+/* The vmulos* and vmules* instructions have a big endian bias, so
+   we must reverse the meaning of "even" and "odd" for little endian.  */
+
+/* vec_mule */
+
+static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a,
+                                                     vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mule(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_mule(vector short __a,
+                                                   vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mule(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mule(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosw(__a, __b);
+#else
+  return __builtin_altivec_vmulesw(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mule(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouw(__a, __b);
+#else
+  return __builtin_altivec_vmuleuw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulesb */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_vmulesb(vector signed char __a, vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+/* vec_vmuleub */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_vmuleub(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+/* vec_vmulesh */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmulesh(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+/* vec_vmuleuh */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmuleuh(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+/* vec_mulo */
+
+static __inline__ vector short __ATTRS_o_ai vec_mulo(vector signed char __a,
+                                                     vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mulo(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_mulo(vector short __a,
+                                                   vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mulo(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mulo(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesw(__a, __b);
+#else
+  return __builtin_altivec_vmulosw(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mulo(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuw(__a, __b);
+#else
+  return __builtin_altivec_vmulouw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulosb */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_vmulosb(vector signed char __a, vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+/* vec_vmuloub */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_vmuloub(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+/* vec_vmulosh */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmulosh(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+/* vec_vmulouh */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmulouh(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+/*  vec_nand */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nand(vector signed char __a, vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nand(vector signed char __a, vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nand(vector bool char __a, vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nand(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nand(vector unsigned char __a, vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nand(vector bool char __a, vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                                         vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nand(vector signed short __a, vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nand(vector signed short __a, vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nand(vector bool short __a, vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_nand(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_nand(vector unsigned short __a, vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_nand(vector bool short __a, vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_nand(vector signed int __a, vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
+                                                          vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_nand(vector bool int __a, vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nand(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nand(vector unsigned int __a, vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nand(vector bool int __a, vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                                        vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_nand(vector float __a, vector float __b) {
+  return (vector float)(~((vector unsigned int)__a &
+                          (vector unsigned int)__b));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_nand(vector double __a, vector double __b) {
+  return (vector double)(~((vector unsigned long long)__a &
+                           (vector unsigned long long)__b));
+}
+
+#endif
+
+/* vec_nmadd */
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai vec_nmadd(vector float __a,
+                                                      vector float __b,
+                                                      vector float __c) {
+  return __builtin_vsx_xvnmaddasp(__a, __b, __c);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_nmadd(vector double __a,
+                                                       vector double __b,
+                                                       vector double __c) {
+  return __builtin_vsx_xvnmaddadp(__a, __b, __c);
+}
+#endif
+
+/* vec_nmsub */
+
+static __inline__ vector float __ATTRS_o_ai vec_nmsub(vector float __a,
+                                                      vector float __b,
+                                                      vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvnmsubasp(__a, __b, __c);
+#else
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_nmsub(vector double __a,
+                                                       vector double __b,
+                                                       vector double __c) {
+  return __builtin_vsx_xvnmsubadp(__a, __b, __c);
+}
+#endif
+
+/* vec_vnmsubfp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vnmsubfp(vector float __a, vector float __b, vector float __c) {
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_nor */
+
+#define __builtin_altivec_vnor vec_nor
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nor(vector signed char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nor(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_nor(vector bool char __a,
+                                                        vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_nor(vector short __a,
+                                                    vector short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_nor(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_nor(vector bool short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_nor(vector int __a,
+                                                  vector int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nor(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_nor(vector bool int __a,
+                                                       vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_nor(vector float __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_nor(vector double __a,
+                                                     vector double __b) {
+  vector unsigned long long __res =
+      ~((vector unsigned long long)__a | (vector unsigned long long)__b);
+  return (vector double)__res;
+}
+#endif
+
+/* vec_vnor */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vnor(vector signed char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vnor(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vnor(vector bool char __a,
+                                                         vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vnor(vector short __a,
+                                                     vector short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vnor(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vnor(vector bool short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vnor(vector int __a,
+                                                   vector int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vnor(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vnor(vector bool int __a,
+                                                        vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vnor(vector float __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_nor(vector signed long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_nor(vector bool long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+#endif
+
+/* vec_or */
+
+#define __builtin_altivec_vor vec_or
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_or(vector signed char __a, vector signed char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_or(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a | __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a,
+                                                         vector bool char __b) {
+  return __a | (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector unsigned char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_or(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a | __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector bool char __b) {
+  return __a | (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_or(vector bool char __a,
+                                                       vector bool char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_or(vector short __a,
+                                                   vector short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_or(vector bool short __a,
+                                                   vector short __b) {
+  return (vector short)__a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_or(vector short __a,
+                                                   vector bool short __b) {
+  return __a | (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector unsigned short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_or(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a | __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector bool short __b) {
+  return __a | (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_or(vector bool short __a,
+                                                        vector bool short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_or(vector int __a,
+                                                 vector int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_or(vector bool int __a,
+                                                 vector int __b) {
+  return (vector int)__a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_or(vector int __a,
+                                                 vector bool int __b) {
+  return __a | (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector unsigned int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_or(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a | __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector bool int __b) {
+  return __a | (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_or(vector bool int __a,
+                                                      vector bool int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_or(vector float __a,
+                                                   vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_or(vector bool int __a,
+                                                   vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_or(vector float __a,
+                                                   vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_or(vector bool long long __a,
+                                                    vector double __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a,
+                                                    vector bool long long __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a,
+                                                    vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a | (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_or(vector signed long long __a, vector signed long long __b) {
+  return __a | __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a | __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_or(vector signed long long __a, vector bool long long __b) {
+  return __a | (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_or(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a | __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_or(vector unsigned long long __a, vector bool long long __b) {
+  return __a | (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector bool long long __b) {
+  return __a | __b;
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_orc(vector signed char __a, vector signed char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_orc(vector signed char __a, vector bool char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_orc(vector bool char __a, vector signed char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_orc(vector unsigned char __a, vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_orc(vector unsigned char __a, vector bool char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_orc(vector bool char __a, vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                                        vector bool char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_orc(vector signed short __a, vector signed short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_orc(vector signed short __a, vector bool short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector signed short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_orc(vector unsigned short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_orc(vector unsigned short __a, vector bool short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector bool short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_orc(vector signed int __a, vector signed int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
+                                                         vector bool int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_orc(vector bool int __a, vector signed int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_orc(vector unsigned int __a, vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_orc(vector unsigned int __a, vector bool int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_orc(vector bool int __a, vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                                       vector bool int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector bool int __a, vector float __b) {
+ return (vector float)(__a | ~(vector unsigned int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector float __a, vector bool int __b) {
+  return (vector float)((vector unsigned int)__a | ~__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_orc(vector signed long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_orc(vector signed long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a | ~__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector double __b) {
+  return (vector double)(__a | ~(vector unsigned long long)__b);
+}
+#endif
+
+/* vec_vor */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector signed char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a | __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector bool char __b) {
+  return __a | (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector unsigned char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a | __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector bool char __b) {
+  return __a | (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vor(vector bool char __a,
+                                                        vector bool char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vor(vector short __a,
+                                                    vector short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vor(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vor(vector short __a,
+                                                    vector bool short __b) {
+  return __a | (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector unsigned short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a | __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector bool short __b) {
+  return __a | (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector bool short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vor(vector int __a,
+                                                  vector int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vor(vector int __a,
+                                                  vector bool int __b) {
+  return __a | (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector unsigned int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a | __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector bool int __b) {
+  return __a | (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                       vector bool int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vor(vector float __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vor(vector float __a,
+                                                    vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vor(vector signed long long __a, vector signed long long __b) {
+  return __a | __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a | __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vor(vector signed long long __a, vector bool long long __b) {
+  return __a | (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a | __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vor(vector unsigned long long __a, vector bool long long __b) {
+  return __a | (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector bool long long __b) {
+  return __a | __b;
+}
+#endif
+
+/* vec_pack */
+
+/* The various vector pack instructions have a big-endian bias, so for
+   little endian we must handle reversed element numbering.  */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_pack(vector signed short __a, vector signed short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_pack(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_pack(vector bool short __a, vector bool short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_pack(vector int __a,
+                                                     vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_pack(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_pack(vector bool int __a,
+                                                          vector bool int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector signed int __ATTRS_o_ai
+vec_pack(vector signed long long __a, vector signed long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_pack(vector bool long long __a, vector bool long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_pack(vector double __a, vector double __b) {
+  return (vector float) (__a[0], __a[1], __b[0], __b[1]);
+}
+#endif
+
+#ifdef __POWER9_VECTOR__
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_pack_to_short_fp32(vector float __a, vector float __b) {
+  vector float __resa = __builtin_vsx_xvcvsphp(__a);
+  vector float __resb = __builtin_vsx_xvcvsphp(__b);
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_mergee(__resa, __resb);
+#else
+  return (vector unsigned short)vec_mergeo(__resa, __resb);
+#endif
+}
+
+#endif
+/* vec_vpkuhum */
+
+#define __builtin_altivec_vpkuhum vec_vpkuhum
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vpkuhum(vector signed short __a, vector signed short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vpkuhum(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vpkuhum(vector bool short __a, vector bool short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+/* vec_vpkuwum */
+
+#define __builtin_altivec_vpkuwum vec_vpkuwum
+
+static __inline__ vector short __ATTRS_o_ai vec_vpkuwum(vector int __a,
+                                                        vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vpkuwum(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vpkuwum(vector bool int __a, vector bool int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_vpkudum */
+
+#ifdef __POWER8_VECTOR__
+#define __builtin_altivec_vpkudum vec_vpkudum
+
+static __inline__ vector int __ATTRS_o_ai vec_vpkudum(vector long long __a,
+                                                      vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vpkudum(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vpkudum(vector bool long long __a, vector bool long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)vec_perm(
+      (vector long long)__a, (vector long long)__b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector bool int)vec_perm(
+      (vector long long)__a, (vector long long)__b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+#endif
+
+/* vec_packpx */
+
+static __inline__ vector pixel __attribute__((__always_inline__))
+vec_packpx(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_vpkpx */
+
+static __inline__ vector pixel __attribute__((__always_inline__))
+vec_vpkpx(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_packs */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a,
+                                                            vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_packs(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_packs(vector int __a,
+                                                             vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_packs(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector int __ATTRS_o_ai vec_packs(vector long long __a,
+                                                    vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdss(__b, __a);
+#else
+  return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkshss */
+
+static __inline__ vector signed char __attribute__((__always_inline__))
+vec_vpkshss(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+/* vec_vpksdss */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector int __ATTRS_o_ai vec_vpksdss(vector long long __a,
+                                                      vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdss(__b, __a);
+#else
+  return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkuhus */
+
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+vec_vpkuhus(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkudus */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vpkudus(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkswss */
+
+static __inline__ vector signed short __attribute__((__always_inline__))
+vec_vpkswss(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+/* vec_vpkuwus */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_vpkuwus(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_packsu */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_packsu(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_packsu(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_packsu(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_packsu(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_packsu(vector long long __a, vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdus(__b, __a);
+#else
+  return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkshus */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswus */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpksdus */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vpksdus(vector long long __a, vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdus(__b, __a);
+#else
+  return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_perm */
+
+// The vperm instruction is defined architecturally with a big-endian bias.
+// For little endian, we swap the input operands and invert the permute
+// control vector.  Only the rightmost 5 bits matter, so we could use
+// a vector of all 31s instead of all 255s to perform the inversion.
+// However, when the PCV is not a constant, using 255 has an advantage
+// in that the vec_xor can be recognized as a vec_nor (and for P8 and
+// later, possibly a vec_nand).
+
+static __inline__ vector signed char __ATTRS_o_ai vec_perm(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed char)__builtin_altivec_vperm_4si((vector int)__b,
+                                                         (vector int)__a, __d);
+#else
+  return (vector signed char)__builtin_altivec_vperm_4si((vector int)__a,
+                                                         (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool char)__builtin_altivec_vperm_4si((vector int)__b,
+                                                       (vector int)__a, __d);
+#else
+  return (vector bool char)__builtin_altivec_vperm_4si((vector int)__a,
+                                                       (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                                     vector signed short __b,
+                                                     vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__b,
+                                                          (vector int)__a, __d);
+#else
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__a,
+                                                          (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned short)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned short)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_perm(
+    vector bool short __a, vector bool short __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool short)__builtin_altivec_vperm_4si((vector int)__b,
+                                                        (vector int)__a, __d);
+#else
+  return (vector bool short)__builtin_altivec_vperm_4si((vector int)__a,
+                                                        (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_perm(vector pixel __a,
+                                                     vector pixel __b,
+                                                     vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector pixel)__builtin_altivec_vperm_4si((vector int)__b,
+                                                   (vector int)__a, __d);
+#else
+  return (vector pixel)__builtin_altivec_vperm_4si((vector int)__a,
+                                                   (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                                   vector signed int __b,
+                                                   vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed int)__builtin_altivec_vperm_4si(__b, __a, __d);
+#else
+  return (vector signed int)__builtin_altivec_vperm_4si(__a, __b, __c);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_perm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned int)__builtin_altivec_vperm_4si((vector int)__b,
+                                                          (vector int)__a, __d);
+#else
+  return (vector unsigned int)__builtin_altivec_vperm_4si((vector int)__a,
+                                                          (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool int)__builtin_altivec_vperm_4si((vector int)__b,
+                                                      (vector int)__a, __d);
+#else
+  return (vector bool int)__builtin_altivec_vperm_4si((vector int)__a,
+                                                      (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_perm(vector float __a,
+                                                     vector float __b,
+                                                     vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector float)__builtin_altivec_vperm_4si((vector int)__b,
+                                                   (vector int)__a, __d);
+#else
+  return (vector float)__builtin_altivec_vperm_4si((vector int)__a,
+                                                   (vector int)__b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector long long __ATTRS_o_ai
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_perm(vector double __a, vector double __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector double)__builtin_altivec_vperm_4si((vector int)__b,
+                                                    (vector int)__a, __d);
+#else
+  return (vector double)__builtin_altivec_vperm_4si((vector int)__a,
+                                                    (vector int)__b, __c);
+#endif
+}
+#endif
+
+/* vec_vperm */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_vperm(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vperm(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vperm(
+    vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vperm(vector short __a, vector short __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vperm(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_vperm(
+    vector bool short __a, vector bool short __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai
+vec_vperm(vector pixel __a, vector pixel __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vperm(vector int __a,
+                                                    vector int __b,
+                                                    vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vperm(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vperm(vector bool int __a, vector bool int __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_vperm(vector float __a, vector float __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+#ifdef __VSX__
+static __inline__ vector long long __ATTRS_o_ai vec_vperm(
+    vector long long __a, vector long long __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vperm(vector unsigned long long __a, vector unsigned long long __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_vperm(vector double __a, vector double __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+#endif
+
+/* vec_re */
+
+static __inline__ vector float __ATTRS_o_ai vec_re(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvresp(__a);
+#else
+  return __builtin_altivec_vrefp(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_re(vector double __a) {
+  return __builtin_vsx_xvredp(__a);
+}
+#endif
+
+/* vec_vrefp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrefp(vector float __a) {
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_rl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_rl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_rl(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_rl(vector short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_rl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_rl(vector int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_rl(vector signed long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vrld(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vrld(__a, __b);
+}
+#endif
+
+/* vec_rlmi */
+#ifdef __POWER9_VECTOR__
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlmi(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  return __builtin_altivec_vrlwmi(__a, __c, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlmi(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned long long __c) {
+  return __builtin_altivec_vrldmi(__a, __c, __b);
+}
+
+/* vec_rlnm */
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlnm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  vector unsigned int OneByte = { 0x8, 0x8, 0x8, 0x8 };
+  return __builtin_altivec_vrlwnm(__a, ((__c << OneByte) | __b));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlnm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned long long __c) {
+  vector unsigned long long OneByte = { 0x8, 0x8 };
+  return __builtin_altivec_vrldnm(__a, ((__c << OneByte) | __b));
+}
+#endif
+
+/* vec_vrlb */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vrlb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vrlb(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+/* vec_vrlh */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vrlh(vector short __a, vector unsigned short __b) {
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vrlh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+/* vec_vrlw */
+
+static __inline__ vector int __ATTRS_o_ai vec_vrlw(vector int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vrlw(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+/* vec_round */
+
+static __inline__ vector float __ATTRS_o_ai vec_round(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspi(__a);
+#else
+  return __builtin_altivec_vrfin(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_round(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+
+/* vec_rint */
+
+static __inline__ vector float __ATTRS_o_ai vec_rint(vector float __a) {
+  return __builtin_vsx_xvrspic(__a);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_rint(vector double __a) {
+  return __builtin_vsx_xvrdpic(__a);
+}
+
+/* vec_nearbyint */
+
+static __inline__ vector float __ATTRS_o_ai vec_nearbyint(vector float __a) {
+  return __builtin_vsx_xvrspi(__a);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_nearbyint(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+#endif
+
+/* vec_vrfin */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrfin(vector float __a) {
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_sqrt */
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai vec_sqrt(vector float __a) {
+  return __builtin_vsx_xvsqrtsp(__a);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_sqrt(vector double __a) {
+  return __builtin_vsx_xvsqrtdp(__a);
+}
+#endif
+
+/* vec_rsqrte */
+
+static __inline__ vector float __ATTRS_o_ai vec_rsqrte(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrsqrtesp(__a);
+#else
+  return __builtin_altivec_vrsqrtefp(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_rsqrte(vector double __a) {
+  return __builtin_vsx_xvrsqrtedp(__a);
+}
+#endif
+
+/* vec_vrsqrtefp */
+
+static __inline__ __vector float __attribute__((__always_inline__))
+vec_vrsqrtefp(vector float __a) {
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_sel */
+
+#define __builtin_altivec_vsel_4si vec_sel
+
+static __inline__ vector signed char __ATTRS_o_ai vec_sel(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector unsigned char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai vec_sel(
+    vector unsigned char __a, vector unsigned char __b, vector bool char __c) {
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_sel(vector bool char __a,
+                                                        vector bool char __b,
+                                                        vector bool char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sel(vector short __a,
+                                                    vector short __b,
+                                                    vector unsigned short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sel(vector short __a,
+                                                    vector short __b,
+                                                    vector bool short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector unsigned short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector bool short __c) {
+  return (__a & ~(vector unsigned short)__c) |
+         (__b & (vector unsigned short)__c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_sel(
+    vector bool short __a, vector bool short __b, vector unsigned short __c) {
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sel(vector int __a,
+                                                  vector int __b,
+                                                  vector unsigned int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sel(vector int __a,
+                                                  vector int __b,
+                                                  vector bool int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sel(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_sel(vector bool int __a,
+                                                       vector bool int __b,
+                                                       vector bool int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sel(vector float __a,
+                                                    vector float __b,
+                                                    vector unsigned int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sel(vector float __a,
+                                                    vector float __b,
+                                                    vector bool int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai
+vec_sel(vector double __a, vector double __b, vector bool long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                           ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                           ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+#endif
+
+/* vec_vsel */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_vsel(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsel(vector signed char __a, vector signed char __b, vector bool char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsel(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai vec_vsel(
+    vector unsigned char __a, vector unsigned char __b, vector bool char __c) {
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vsel(vector bool char __a,
+                                                         vector bool char __b,
+                                                         vector bool char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vsel(vector short __a, vector short __b, vector unsigned short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsel(vector short __a,
+                                                     vector short __b,
+                                                     vector bool short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a, vector unsigned short __b,
+         vector bool short __c) {
+  return (__a & ~(vector unsigned short)__c) |
+         (__b & (vector unsigned short)__c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_vsel(
+    vector bool short __a, vector bool short __b, vector unsigned short __c) {
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsel(vector bool short __a, vector bool short __b, vector bool short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsel(vector int __a,
+                                                   vector int __b,
+                                                   vector unsigned int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsel(vector int __a,
+                                                   vector int __b,
+                                                   vector bool int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_vsel(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_vsel(
+    vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vsel(vector bool int __a,
+                                                        vector bool int __b,
+                                                        vector bool int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
+                                                     vector float __b,
+                                                     vector unsigned int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
+                                                     vector float __b,
+                                                     vector bool int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_sl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sl(vector signed char __a, vector unsigned char __b) {
+  return __a << (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sl(vector unsigned char __a, vector unsigned char __b) {
+  return __a << __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
+                                                   vector unsigned short __b) {
+  return __a << (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sl(vector unsigned short __a, vector unsigned short __b) {
+  return __a << __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
+                                                 vector unsigned int __b) {
+  return __a << (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sl(vector unsigned int __a, vector unsigned int __b) {
+  return __a << __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sl(vector signed long long __a, vector unsigned long long __b) {
+  return __a << (vector long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a << __b;
+}
+#endif
+
+/* vec_vslb */
+
+#define __builtin_altivec_vslb vec_vslb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vslb(vector signed char __a, vector unsigned char __b) {
+  return vec_sl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vslb(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslh */
+
+#define __builtin_altivec_vslh vec_vslh
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vslh(vector short __a, vector unsigned short __b) {
+  return vec_sl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vslh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslw */
+
+#define __builtin_altivec_vslw vec_vslw
+
+static __inline__ vector int __ATTRS_o_ai vec_vslw(vector int __a,
+                                                   vector unsigned int __b) {
+  return vec_sl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vslw(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_sld */
+
+#define __builtin_altivec_vsldoi_4si vec_sld
+
+static __inline__ vector signed char __ATTRS_o_ai vec_sld(
+    vector signed char __a, vector signed char __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sld(vector unsigned char __a, vector unsigned char __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sld(vector bool char __a, vector bool char __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_sld(
+    vector signed short __a, vector signed short __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sld(vector unsigned short __a, vector unsigned short __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sld(vector bool short __a, vector bool short __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sld(vector pixel __a,
+                                                    vector pixel __b,
+                                                    unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sld(vector signed int __a, vector signed int __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sld(
+    vector unsigned int __a, vector unsigned int __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_sld(vector bool int __a,
+                                                       vector bool int __b,
+                                                       unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sld(vector float __a,
+                                                    vector float __b,
+                                                    unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_sld(vector bool long long __a, vector bool long long __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sld(vector signed long long __a, vector signed long long __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sld(vector unsigned long long __a, vector unsigned long long __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_sld(vector double __a,
+                                                     vector double __b,
+                                                     unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+#endif
+
+/* vec_sldw */
+static __inline__ vector signed char __ATTRS_o_ai vec_sldw(
+    vector signed char __a, vector signed char __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sldw(vector unsigned char __a, vector unsigned char __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_sldw(
+    vector signed short __a, vector signed short __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sldw(vector unsigned short __a, vector unsigned short __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sldw(vector signed int __a, vector signed int __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sldw(
+    vector unsigned int __a, vector unsigned int __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sldw(vector signed long long __a, vector signed long long __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+#endif
+
+#ifdef __POWER9_VECTOR__
+/* vec_slv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slv(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vslv(__a, __b);
+}
+
+/* vec_srv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srv(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsrv(__a, __b);
+}
+#endif
+
+/* vec_vsldoi */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsldoi(vector signed char __a, vector signed char __b, unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai vec_vsldoi(
+    vector unsigned char __a, vector unsigned char __b, unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsldoi(vector short __a,
+                                                       vector short __b,
+                                                       unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai vec_vsldoi(
+    vector unsigned short __a, vector unsigned short __b, unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsldoi(vector pixel __a,
+                                                       vector pixel __b,
+                                                       unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsldoi(vector int __a,
+                                                     vector int __b,
+                                                     unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_vsldoi(
+    vector unsigned int __a, vector unsigned int __b, unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsldoi(vector float __a,
+                                                       vector float __b,
+                                                       unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+/* vec_sll */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                                    vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                                    vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                                  vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                                  vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vsl((vector int)__a,
+                                                        (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsl((vector int)__a,
+                                                          (vector int)__b);
+}
+#endif
+
+/* vec_vsl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                                    vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                                    vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                                  vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                                  vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_slo */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                                    vector signed char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                                    vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_slo(vector int __a,
+                                                  vector signed char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_slo(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                                    vector signed char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                                    vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector signed char __b) {
+  return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector signed char __b) {
+  return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+                                                           (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+                                                           (vector int)__b);
+}
+#endif
+
+/* vec_vslo */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                                     vector signed char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                                     vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                                     vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                                     vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vslo(vector int __a,
+                                                   vector signed char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vslo(vector int __a,
+                                                   vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                                     vector signed char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                                     vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_splat */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_splat(vector signed char __a, unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_splat(vector unsigned char __a, unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_splat(vector bool char __a, unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_splat(vector signed short __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_splat(vector unsigned short __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_splat(vector bool short __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_splat(vector pixel __a,
+                                                      unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_splat(vector signed int __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_splat(vector unsigned int __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_splat(vector bool int __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_splat(vector float __a,
+                                                      unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_splat(vector double __a,
+                                                       unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
+}
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_splat(vector bool long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_splat(vector signed long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_splat(vector unsigned long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
+}
+#endif
+
+/* vec_vspltb */
+
+#define __builtin_altivec_vspltb vec_vspltb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vspltb(vector signed char __a, unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vspltb(vector unsigned char __a, unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vspltb(vector bool char __a,
+                                                           unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+/* vec_vsplth */
+
+#define __builtin_altivec_vsplth vec_vsplth
+
+static __inline__ vector short __ATTRS_o_ai vec_vsplth(vector short __a,
+                                                       unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsplth(vector unsigned short __a, unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsplth(vector bool short __a, unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsplth(vector pixel __a,
+                                                       unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+/* vec_vspltw */
+
+#define __builtin_altivec_vspltw vec_vspltw
+
+static __inline__ vector int __ATTRS_o_ai vec_vspltw(vector int __a,
+                                                     unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vspltw(vector unsigned int __a, unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vspltw(vector bool int __a,
+                                                          unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vspltw(vector float __a,
+                                                       unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+/* vec_splat_s8 */
+
+#define __builtin_altivec_vspltisb vec_splat_s8
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector signed char __ATTRS_o_ai
+vec_splat_s8(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+/* vec_vspltisb */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vspltisb(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+/* vec_splat_s16 */
+
+#define __builtin_altivec_vspltish vec_splat_s16
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a) {
+  return (vector short)(__a);
+}
+
+/* vec_vspltish */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector short __ATTRS_o_ai vec_vspltish(signed char __a) {
+  return (vector short)(__a);
+}
+
+/* vec_splat_s32 */
+
+#define __builtin_altivec_vspltisw vec_splat_s32
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a) {
+  return (vector int)(__a);
+}
+
+/* vec_vspltisw */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector int __ATTRS_o_ai vec_vspltisw(signed char __a) {
+  return (vector int)(__a);
+}
+
+/* vec_splat_u8 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_splat_u8(unsigned char __a) {
+  return (vector unsigned char)(__a);
+}
+
+/* vec_splat_u16 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_splat_u16(signed char __a) {
+  return (vector unsigned short)(__a);
+}
+
+/* vec_splat_u32 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_splat_u32(signed char __a) {
+  return (vector unsigned int)(__a);
+}
+
+/* vec_sr */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sr(vector signed char __a, vector unsigned char __b) {
+  vector unsigned char __res = (vector unsigned char)__a >> __b;
+  return (vector signed char)__res;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sr(vector unsigned char __a, vector unsigned char __b) {
+  return __a >> __b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_sr(vector signed short __a, vector unsigned short __b) {
+  vector unsigned short __res = (vector unsigned short)__a >> __b;
+  return (vector signed short)__res;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sr(vector unsigned short __a, vector unsigned short __b) {
+  return __a >> __b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sr(vector signed int __a, vector unsigned int __b) {
+  vector unsigned int __res = (vector unsigned int)__a >> __b;
+  return (vector signed int)__res;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sr(vector unsigned int __a, vector unsigned int __b) {
+  return __a >> __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sr(vector signed long long __a, vector unsigned long long __b) {
+  vector unsigned long long __res = (vector unsigned long long)__a >> __b;
+  return (vector signed long long)__res;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sr(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a >> __b;
+}
+#endif
+
+/* vec_vsrb */
+
+#define __builtin_altivec_vsrb vec_vsrb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsrb(vector signed char __a, vector unsigned char __b) {
+  return __a >> (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsrb(vector unsigned char __a, vector unsigned char __b) {
+  return __a >> __b;
+}
+
+/* vec_vsrh */
+
+#define __builtin_altivec_vsrh vec_vsrh
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vsrh(vector short __a, vector unsigned short __b) {
+  return __a >> (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsrh(vector unsigned short __a, vector unsigned short __b) {
+  return __a >> __b;
+}
+
+/* vec_vsrw */
+
+#define __builtin_altivec_vsrw vec_vsrw
+
+static __inline__ vector int __ATTRS_o_ai vec_vsrw(vector int __a,
+                                                   vector unsigned int __b) {
+  return __a >> (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsrw(vector unsigned int __a, vector unsigned int __b) {
+  return __a >> __b;
+}
+
+/* vec_sra */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sra(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sra(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sra(vector short __a,
+                                                    vector unsigned short __b) {
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sra(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sra(vector int __a,
+                                                  vector unsigned int __b) {
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sra(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sra(vector signed long long __a, vector unsigned long long __b) {
+  return __a >> __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)((vector signed long long)__a >> __b);
+}
+#endif
+
+/* vec_vsrab */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsrab(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsrab(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+/* vec_vsrah */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vsrah(vector short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsrah(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+/* vec_vsraw */
+
+static __inline__ vector int __ATTRS_o_ai vec_vsraw(vector int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsraw(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+/* vec_srl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                                    vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                                    vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                                  vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                                  vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vsr((vector int)__a,
+                                                        (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsr((vector int)__a,
+                                                          (vector int)__b);
+}
+#endif
+
+/* vec_vsr */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                                    vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                                    vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                                  vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                                  vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_sro */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                                    vector signed char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                                    vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sro(vector int __a,
+                                                  vector signed char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sro(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                                    vector signed char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                                    vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector signed char __b) {
+  return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector signed char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+                                                           (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+                                                           (vector int)__b);
+}
+#endif
+
+/* vec_vsro */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                                     vector signed char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                                     vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                                     vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                                     vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsro(vector int __a,
+                                                   vector signed char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsro(vector int __a,
+                                                   vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                                     vector signed char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                                     vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_st */
+
+static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                           vector signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                           signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                           vector unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                           unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                           signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                           unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                           vector bool char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector short __a, int __b,
+                                           vector short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector short __a, int __b,
+                                           short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                           vector unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                           unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                           short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                           unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                           vector bool short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                           short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                           unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                           vector pixel *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector int __a, int __b,
+                                           vector int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                           vector unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                           unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                           int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                           unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                           vector bool int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector float __a, int __b,
+                                           vector float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector float __a, int __b,
+                                           float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_stvx */
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                             vector signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                             signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                             vector unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                             unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                             signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                             unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                             vector bool char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector short __a, int __b,
+                                             vector short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector short __a, int __b,
+                                             short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                             vector unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                             unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                             short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                             unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                             vector bool short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                             short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                             unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                             vector pixel *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector int __a, int __b,
+                                             vector int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector int __a, int __b,
+                                             int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                             vector unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                             unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                             int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                             unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                             vector bool int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector float __a, int __b,
+                                             vector float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector float __a, int __b,
+                                             float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_ste */
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector signed char __a, int __b,
+                                            signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector unsigned char __a, int __b,
+                                            unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                            signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                            unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector short __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector unsigned short __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool short __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool short __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector pixel __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector pixel __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector unsigned int __a, int __b,
+                                            unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool int __a, int __b,
+                                            int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool int __a, int __b,
+                                            unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector float __a, int __b,
+                                            float *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stvebx */
+
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector signed char __a, int __b,
+                                               signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                               signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                               unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+/* vec_stvehx */
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector short __a, int __b,
+                                               short *__c) {
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                               short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                               unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b,
+                                               short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b,
+                                               unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+/* vec_stvewx */
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector int __a, int __b,
+                                               int *__c) {
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b,
+                                               int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector float __a, int __b,
+                                               float *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stl */
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                            vector signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                            signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                            vector unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                            unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                            signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                            unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                            vector bool char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector short __a, int __b,
+                                            vector short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector short __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                            vector unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                            vector bool short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                            vector pixel *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector int __a, int __b,
+                                            vector int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                            vector unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                            unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                            int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                            unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                            vector bool int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector float __a, int __b,
+                                            vector float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector float __a, int __b,
+                                            float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_stvxl */
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                              vector signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                              signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                              vector unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                              unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                              signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                              unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                              vector bool char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector short __a, int __b,
+                                              vector short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector short __a, int __b,
+                                              short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned short __a,
+                                              int __b,
+                                              vector unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned short __a,
+                                              int __b, unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                              short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                              unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                              vector bool short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                              short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                              unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                              vector pixel *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector int __a, int __b,
+                                              vector int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector int __a, int __b,
+                                              int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                              vector unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                              unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                              int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                              unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                              vector bool int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector float __a, int __b,
+                                              vector float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector float __a, int __b,
+                                              float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_sub */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector signed char __b) {
+  return __a - __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a - __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector bool char __b) {
+  return __a - (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector unsigned char __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a - __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector bool char __b) {
+  return __a - (vector unsigned char)__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sub(vector short __a,
+                                                    vector short __b) {
+  return __a - __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sub(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a - __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sub(vector short __a,
+                                                    vector bool short __b) {
+  return __a - (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector unsigned short __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sub(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a - __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector bool short __b) {
+  return __a - (vector unsigned short)__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sub(vector int __a,
+                                                  vector int __b) {
+  return __a - __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sub(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a - __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sub(vector int __a,
+                                                  vector bool int __b) {
+  return __a - (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector unsigned int __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sub(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a - __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector bool int __b) {
+  return __a - (vector unsigned int)__b;
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_sub(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_sub(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a - __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sub(vector signed long long __a, vector signed long long __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sub(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a - __b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_sub(vector double __a,
+                                                     vector double __b) {
+  return __a - __b;
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_sub(vector float __a,
+                                                    vector float __b) {
+  return __a - __b;
+}
+
+/* vec_vsububm */
+
+#define __builtin_altivec_vsububm vec_vsububm
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector signed char __b) {
+  return __a - __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a - __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector bool char __b) {
+  return __a - (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector unsigned char __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a - __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector bool char __b) {
+  return __a - (vector unsigned char)__b;
+}
+
+/* vec_vsubuhm */
+
+#define __builtin_altivec_vsubuhm vec_vsubuhm
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                                        vector short __b) {
+  return __a - __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubuhm(vector bool short __a,
+                                                        vector short __b) {
+  return (vector short)__a - __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                                        vector bool short __b) {
+  return __a - (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector unsigned short __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a - __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector bool short __b) {
+  return __a - (vector unsigned short)__b;
+}
+
+/* vec_vsubuwm */
+
+#define __builtin_altivec_vsubuwm vec_vsubuwm
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubuwm(vector int __a,
+                                                      vector int __b) {
+  return __a - __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubuwm(vector bool int __a,
+                                                      vector int __b) {
+  return (vector int)__a - __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubuwm(vector int __a,
+                                                      vector bool int __b) {
+  return __a - (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector unsigned int __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a - __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector bool int __b) {
+  return __a - (vector unsigned int)__b;
+}
+
+/* vec_vsubfp */
+
+#define __builtin_altivec_vsubfp vec_vsubfp
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vsubfp(vector float __a, vector float __b) {
+  return __a - __b;
+}
+
+/* vec_subc */
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subc(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_altivec_vsubcuw((vector unsigned int)__a,
+                                                      (vector unsigned int) __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_subc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_subc(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_vsubcuw */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vsubcuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_subs */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_subs(vector short __a,
+                                                     vector short __b) {
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_subs(vector bool short __a,
+                                                     vector short __b) {
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_subs(vector short __a,
+                                                     vector bool short __b) {
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_subs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_subs(vector int __a,
+                                                   vector int __b) {
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_subs(vector bool int __a,
+                                                   vector int __b) {
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_subs(vector int __a,
+                                                   vector bool int __b) {
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subs(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vsubsbs */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vsububs */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vsubshs */
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                                        vector short __b) {
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubshs(vector bool short __a,
+                                                        vector short __b) {
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                                        vector bool short __b) {
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+/* vec_vsubuhs */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vsubsws */
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubsws(vector int __a,
+                                                      vector int __b) {
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubsws(vector bool int __a,
+                                                      vector int __b) {
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubsws(vector int __a,
+                                                      vector bool int __b) {
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+/* vec_vsubuws */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vsubuqm */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vsubuqm(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a - __b;
+}
+
+/* vec_vsubeuqm */
+
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_sube(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_sube(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+/* vec_vsubcuq */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vsubcuq(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vsubcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+/* vec_vsubecuq */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vsubecuq(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subec(vector signed int __a, vector signed int __b,
+             vector signed int __c) {
+  return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subec(vector unsigned int __a, vector unsigned int __b,
+             vector unsigned int __c) {
+  return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_subec(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_subec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sube(vector signed int __a, vector signed int __b,
+         vector signed int __c) {
+  vector signed int __mask = {1, 1, 1, 1};
+  vector signed int __carry = __c & __mask;
+  return vec_adde(__a, ~__b, __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sube(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  vector unsigned int __mask = {1, 1, 1, 1};
+  vector unsigned int __carry = __c & __mask;
+  return vec_adde(__a, ~__b, __carry);
+}
+/* vec_sum4s */
+
+static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a,
+                                                    vector int __b) {
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sum4s(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed short __a,
+                                                    vector int __b) {
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_vsum4sbs */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vsum4sbs(vector signed char __a, vector int __b) {
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+/* vec_vsum4ubs */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vsum4ubs(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+/* vec_vsum4shs */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vsum4shs(vector signed short __a, vector int __b) {
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_sum2s */
+
+/* The vsum2sws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian elements
+   1 and 3 (little-endian element 0 and 2).  For ease of porting the
+   programmer wants elements 1 and 3 in both cases, so for little
+   endian we must perform some permutes.  */
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_sum2s(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)vec_perm(
+      __b, __b, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)vec_perm(
+      __c, __c, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_vsum2sws */
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_vsum2sws(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)vec_perm(
+      __b, __b, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)vec_perm(
+      __c, __c, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_sums */
+
+/* The vsumsws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian element 3
+   (little-endian element 0).  For ease of porting the programmer
+   wants element 3 in both cases, so for little endian we must perform
+   some permutes.  */
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_sums(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_vsumsws */
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_vsumsws(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_trunc */
+
+static __inline__ vector float __ATTRS_o_ai vec_trunc(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspiz(__a);
+#else
+  return __builtin_altivec_vrfiz(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_trunc(vector double __a) {
+  return __builtin_vsx_xvrdpiz(__a);
+}
+#endif
+
+/* vec_vrfiz */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrfiz(vector float __a) {
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_unpackh */
+
+/* The vector unpack instructions all have a big-endian bias, so for
+   little endian we must reverse the meanings of "high" and "low."  */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_unpackh(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_unpackh(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_unpackh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_unpackh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unpackh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector long long __ATTRS_o_ai vec_unpackh(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsw(__a);
+#else
+  return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_unpackh(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackh(vector float __a) {
+  return (vector double)(__a[0], __a[1]);
+}
+#endif
+
+/* vec_vupkhsb */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vupkhsb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vupkhsb(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+/* vec_vupkhsh */
+
+static __inline__ vector int __ATTRS_o_ai vec_vupkhsh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vupkhsh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vupkhsh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+/* vec_vupkhsw */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector long long __ATTRS_o_ai vec_vupkhsw(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsw(__a);
+#else
+  return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vupkhsw(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_unpackl */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_unpackl(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_unpackl(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_unpackl(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_unpackl(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unpackl(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector long long __ATTRS_o_ai vec_unpackl(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsw(__a);
+#else
+  return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_unpackl(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackl(vector float __a) {
+  return (vector double)(__a[2], __a[3]);
+}
+#endif
+
+/* vec_vupklsb */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vupklsb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vupklsb(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+/* vec_vupklsh */
+
+static __inline__ vector int __ATTRS_o_ai vec_vupklsh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vupklsh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vupklsh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_vupklsw */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector long long __ATTRS_o_ai vec_vupklsw(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsw(__a);
+#else
+  return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vupklsw(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_vsx_ld */
+
+#ifdef __VSX__
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed int *__b) {
+  return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed int *__b) {
+  return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector float *__b) {
+  return (vector float)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsx_ld(int __a,
+                                                       const float *__b) {
+  return (vector float)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed long long *__b) {
+  return (vector signed long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned long long *__b) {
+  return (vector unsigned long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector double *__b) {
+  return (vector double)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_vsx_ld(int __a, const double *__b) {
+  return (vector double)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed short *__b) {
+  return (vector signed short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed short *__b) {
+  return (vector signed short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+#endif
+
+/* vec_vsx_st */
+
+#ifdef __VSX__
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               vector bool int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
+                                               vector signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
+                                               signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned int __a, int __b,
+                                               vector unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
+                                               vector float *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
+                                               float *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed long long __a,
+                                               int __b,
+                                               vector signed long long *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned long long __a,
+                                               int __b,
+                                               vector unsigned long long *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
+                                               vector double *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
+                                               double *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               vector bool short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed short __a, int __b,
+                                               vector signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed short __a, int __b,
+                                               signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned short __a,
+                                               int __b,
+                                               vector unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               vector bool char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed char __a, int __b,
+                                               vector signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed char __a, int __b,
+                                               signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a,
+                                               int __b,
+                                               vector unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+#endif
+
+/* vec_xor */
+
+#define __builtin_altivec_vxor vec_xor
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector signed char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a ^ __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector bool char __b) {
+  return __a ^ (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a ^ __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector bool char __b) {
+  return __a ^ (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_xor(vector bool char __a,
+                                                        vector bool char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_xor(vector short __a,
+                                                    vector short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_xor(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_xor(vector short __a,
+                                                    vector bool short __b) {
+  return __a ^ (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector unsigned short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a ^ __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector bool short __b) {
+  return __a ^ (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector bool short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_xor(vector int __a,
+                                                  vector int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_xor(vector int __a,
+                                                  vector bool int __b) {
+  return __a ^ (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector unsigned int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a ^ __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector bool int __b) {
+  return __a ^ (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                       vector bool int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xor(vector float __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xor(vector float __a,
+                                                    vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xor(vector signed long long __a, vector signed long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a ^ __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xor(vector signed long long __a, vector bool long long __b) {
+  return __a ^ (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a ^ __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xor(vector unsigned long long __a, vector bool long long __b) {
+  return __a ^ (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector bool long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xor(vector double __a,
+                                                     vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_xor(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xor(vector bool long long __a,
+                                                     vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+#endif
+
+/* vec_vxor */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector signed char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a ^ __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector bool char __b) {
+  return __a ^ (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector unsigned char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a ^ __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector bool char __b) {
+  return __a ^ (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vxor(vector bool char __a,
+                                                         vector bool char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vxor(vector short __a,
+                                                     vector short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vxor(vector bool short __a,
+                                                     vector short __b) {
+  return (vector short)__a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vxor(vector short __a,
+                                                     vector bool short __b) {
+  return __a ^ (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector unsigned short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a ^ __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector bool short __b) {
+  return __a ^ (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector bool short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vxor(vector int __a,
+                                                   vector int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                   vector int __b) {
+  return (vector int)__a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vxor(vector int __a,
+                                                   vector bool int __b) {
+  return __a ^ (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector unsigned int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a ^ __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector bool int __b) {
+  return __a ^ (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                        vector bool int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vxor(vector float __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vxor(vector float __a,
+                                                     vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vxor(vector signed long long __a, vector signed long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a ^ __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vxor(vector signed long long __a, vector bool long long __b) {
+  return __a ^ (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a ^ __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector unsigned long long __a, vector bool long long __b) {
+  return __a ^ (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector bool long long __b) {
+  return __a ^ __b;
+}
+#endif
+
+/* ------------------------ extensions for CBEA ----------------------------- */
+
+/* vec_extract */
+
+static __inline__ signed char __ATTRS_o_ai vec_extract(vector signed char __a,
+                                                       int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned char __ATTRS_o_ai
+vec_extract(vector unsigned char __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
+                                                         int __b) {
+  return __a[__b];
+}
+
+static __inline__ signed short __ATTRS_o_ai vec_extract(vector signed short __a,
+                                                        int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned short __ATTRS_o_ai
+vec_extract(vector unsigned short __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
+                                                          int __b) {
+  return __a[__b];
+}
+
+static __inline__ signed int __ATTRS_o_ai vec_extract(vector signed int __a,
+                                                      int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector unsigned int __a,
+                                                        int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector bool int __a,
+                                                        int __b) {
+  return __a[__b];
+}
+
+#ifdef __VSX__
+static __inline__ signed long long __ATTRS_o_ai
+vec_extract(vector signed long long __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned long long __ATTRS_o_ai
+vec_extract(vector unsigned long long __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned long long __ATTRS_o_ai
+vec_extract(vector bool long long __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ double __ATTRS_o_ai vec_extract(vector double __a, int __b) {
+  return __a[__b];
+}
+#endif
+
+static __inline__ float __ATTRS_o_ai vec_extract(vector float __a, int __b) {
+  return __a[__b];
+}
+
+#ifdef __POWER9_VECTOR__
+
+#define vec_insert4b __builtin_vsx_insertword
+#define vec_extract4b __builtin_vsx_extractuword
+
+/* vec_extract_exp */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_exp(vector float __a) {
+  return __builtin_vsx_xvxexpsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_exp(vector double __a) {
+  return __builtin_vsx_xvxexpdp(__a);
+}
+
+/* vec_extract_sig */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_sig(vector float __a) {
+  return __builtin_vsx_xvxsigsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_sig (vector double __a) {
+  return __builtin_vsx_xvxsigdp(__a);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shorth(vector unsigned short __a) {
+  vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+            __builtin_shufflevector(__a, __a, 0, -1, 1, -1, 2, -1, 3, -1);
+#else
+            __builtin_shufflevector(__a, __a, -1, 0, -1, 1, -1, 2, -1, 3);
+#endif
+  return __builtin_vsx_xvcvhpsp(__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shortl(vector unsigned short __a) {
+  vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+            __builtin_shufflevector(__a, __a, 4, -1, 5, -1, 6, -1, 7, -1);
+#else
+            __builtin_shufflevector(__a, __a, -1, 4, -1, 5, -1, 6, -1, 7);
+#endif
+  return __builtin_vsx_xvcvhpsp(__b);
+}
+#endif /* __POWER9_VECTOR__ */
+
+/* vec_insert */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_insert(signed char __a, vector signed char __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_insert(unsigned char __a, vector unsigned char __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
+                                                           vector bool char __b,
+                                                           int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_insert(signed short __a, vector signed short __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_insert(unsigned short __a, vector unsigned short __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_insert(unsigned short __a, vector bool short __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_insert(signed int __a, vector signed int __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_insert(unsigned int __a, vector unsigned int __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
+                                                          vector bool int __b,
+                                                          int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_insert(signed long long __a, vector signed long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector unsigned long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector bool long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+static __inline__ vector double __ATTRS_o_ai vec_insert(double __a,
+                                                        vector double __b,
+                                                        int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_insert(float __a,
+                                                       vector float __b,
+                                                       int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+/* vec_lvlx */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const signed char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const vector signed char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const vector short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const vector pixel *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvlx(int __a, const int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvlx(int __a,
+                                                   const vector int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const float *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const vector float *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvlxl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const signed char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector signed char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const vector short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const vector pixel *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvlxl(int __a, const int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvlxl(int __a,
+                                                    const vector int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const float *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      vector float *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrx */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const vector signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool char *__b) {
+  return vec_perm((vector bool char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const short *__b) {
+  return vec_perm((vector short)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const vector short *__b) {
+  return vec_perm((vector short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool short *__b) {
+  return vec_perm((vector bool short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const vector pixel *__b) {
+  return vec_perm((vector pixel)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvrx(int __a, const int *__b) {
+  return vec_perm((vector int)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvrx(int __a,
+                                                   const vector int *__b) {
+  return vec_perm((vector int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool int *__b) {
+  return vec_perm((vector bool int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const float *__b) {
+  return vec_perm((vector float)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const vector float *__b) {
+  return vec_perm((vector float)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrxl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool char *__b) {
+  return vec_perm((vector bool char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const short *__b) {
+  return vec_perm((vector short)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const vector short *__b) {
+  return vec_perm((vector short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool short *__b) {
+  return vec_perm((vector bool short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const vector pixel *__b) {
+  return vec_perm((vector pixel)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvrxl(int __a, const int *__b) {
+  return vec_perm((vector int)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvrxl(int __a,
+                                                    const vector int *__b) {
+  return vec_perm((vector int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool int *__b) {
+  return vec_perm((vector bool int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const float *__b) {
+  return vec_perm((vector float)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const vector float *__b) {
+  return vec_perm((vector float)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_stvlx */
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                              signed char *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                              vector signed char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                              unsigned char *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                              vector unsigned char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector bool char __a, int __b,
+                                              vector bool char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector short __a, int __b,
+                                              short *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector short __a, int __b,
+                                              vector short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned short __a,
+                                              int __b, unsigned short *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned short __a,
+                                              int __b,
+                                              vector unsigned short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector bool short __a, int __b,
+                                              vector bool short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector pixel __a, int __b,
+                                              vector pixel *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector int __a, int __b,
+                                              int *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector int __a, int __b,
+                                              vector int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                              unsigned int *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                              vector unsigned int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector bool int __a, int __b,
+                                              vector bool int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector float __a, int __b,
+                                              vector float *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvlxl */
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                               signed char *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                               vector signed char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a,
+                                               int __b,
+                                               vector unsigned char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector bool char __a, int __b,
+                                               vector bool char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b,
+                                               short *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b,
+                                               vector short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a,
+                                               int __b,
+                                               vector unsigned short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector bool short __a, int __b,
+                                               vector bool short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector pixel __a, int __b,
+                                               vector pixel *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b,
+                                               int *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b,
+                                               vector int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                               vector unsigned int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector bool int __a, int __b,
+                                               vector bool int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector float __a, int __b,
+                                               vector float *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvrx */
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                              signed char *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                              vector signed char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                              unsigned char *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                              vector unsigned char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector bool char __a, int __b,
+                                              vector bool char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector short __a, int __b,
+                                              short *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector short __a, int __b,
+                                              vector short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned short __a,
+                                              int __b, unsigned short *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned short __a,
+                                              int __b,
+                                              vector unsigned short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector bool short __a, int __b,
+                                              vector bool short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector pixel __a, int __b,
+                                              vector pixel *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector int __a, int __b,
+                                              int *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector int __a, int __b,
+                                              vector int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                              unsigned int *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                              vector unsigned int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector bool int __a, int __b,
+                                              vector bool int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector float __a, int __b,
+                                              vector float *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvrxl */
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                               signed char *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                               vector signed char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a,
+                                               int __b,
+                                               vector unsigned char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector bool char __a, int __b,
+                                               vector bool char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b,
+                                               short *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b,
+                                               vector short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a,
+                                               int __b,
+                                               vector unsigned short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector bool short __a, int __b,
+                                               vector bool short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector pixel __a, int __b,
+                                               vector pixel *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b,
+                                               int *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b,
+                                               vector int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                               vector unsigned int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector bool int __a, int __b,
+                                               vector bool int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector float __a, int __b,
+                                               vector float *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_promote */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_promote(signed char __a,
+                                                              int __b) {
+  vector signed char __res = (vector signed char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_promote(unsigned char __a, int __b) {
+  vector unsigned char __res = (vector unsigned char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_promote(short __a, int __b) {
+  vector short __res = (vector short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_promote(unsigned short __a, int __b) {
+  vector unsigned short __res = (vector unsigned short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_promote(int __a, int __b) {
+  vector int __res = (vector int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_promote(unsigned int __a,
+                                                               int __b) {
+  vector unsigned int __res = (vector unsigned int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_promote(float __a, int __b) {
+  vector float __res = (vector float)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+/* vec_splats */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_splats(unsigned char __a) {
+  return (vector unsigned char)(__a);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_splats(short __a) {
+  return (vector short)(__a);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_splats(unsigned short __a) {
+  return (vector unsigned short)(__a);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_splats(int __a) {
+  return (vector int)(__a);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_splats(unsigned int __a) {
+  return (vector unsigned int)(__a);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_splats(signed long long __a) {
+  return (vector signed long long)(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_splats(unsigned long long __a) {
+  return (vector unsigned long long)(__a);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_splats(signed __int128 __a) {
+  return (vector signed __int128)(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_splats(unsigned __int128 __a) {
+  return (vector unsigned __int128)(__a);
+}
+
+#endif
+
+static __inline__ vector double __ATTRS_o_ai vec_splats(double __a) {
+  return (vector double)(__a);
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_splats(float __a) {
+  return (vector float)(__a);
+}
+
+/* ----------------------------- predicates --------------------------------- */
+
+/* vec_all_eq */
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector pixel __a,
+                                              vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                              vector long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT, __a, __b);
+}
+#endif
+
+/* vec_all_ge */
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, (vector signed char)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, (vector short)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, (vector int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __b, __a);
+}
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, (vector signed long long)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __a, __b);
+}
+#endif
+
+/* vec_all_gt */
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, (vector signed char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a, __b);
+}
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __a, __b);
+}
+#endif
+
+/* vec_all_in */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_all_in(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_le */
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, (vector signed char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __b, __a);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_le(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __b, __a);
+}
+#endif
+
+/* vec_all_lt */
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, (vector signed char)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, (vector short)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, (vector int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, (vector signed long long)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __b, __a);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __b, __a);
+}
+#endif
+
+/* vec_all_nan */
+
+static __inline__ int __ATTRS_o_ai vec_all_nan(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __a);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_nan(vector double __a) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __a);
+}
+#endif
+
+/* vec_all_ne */
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector pixel __a,
+                                              vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_nge */
+
+static __inline__ int __ATTRS_o_ai vec_all_nge(vector float __a,
+                                               vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_nge(vector double __a,
+                                               vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_ngt */
+
+static __inline__ int __ATTRS_o_ai vec_all_ngt(vector float __a,
+                                               vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_ngt(vector double __a,
+                                               vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_nle */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_all_nle(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_nlt */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_all_nlt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_numeric */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_all_numeric(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __a);
+}
+
+/* vec_any_eq */
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector pixel __a,
+                                              vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_ge */
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, (vector signed char)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, (vector short)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, (vector int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_gt */
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a,
+                                      (vector signed char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_le */
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a,
+                                      (vector signed char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __b, __a);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_le(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
+/* vec_any_lt */
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, (vector signed char)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, (vector short)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, (vector int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __b, __a);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
+/* vec_any_nan */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_nan(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __a);
+}
+
+/* vec_any_ne */
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector pixel __a,
+                                              vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT_REV, __a, __b);
+}
+#endif
+
+/* vec_any_nge */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_nge(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_ngt */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_ngt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nle */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_nle(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_nlt */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_nlt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_numeric */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_numeric(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __a);
+}
+
+/* vec_any_out */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_out(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* Power 8 Crypto functions
+Note: We diverge from the current GCC implementation with regard
+to cryptography and related functions as follows:
+- Only the SHA and AES instructions and builtins are disabled by -mno-crypto
+- The remaining ones are only available on Power8 and up so
+  require -mpower8-vector
+The justification for this is that export requirements require that
+Category:Vector.Crypto is optional (i.e. compliant hardware may not provide
+support). As a result, we need to be able to turn off support for those.
+The remaining ones (currently controlled by -mcrypto for GCC) still
+need to be provided on compliant hardware even if Vector.Crypto is not
+provided.
+*/
+#ifdef __CRYPTO__
+#define vec_sbox_be __builtin_altivec_crypto_vsbox
+#define vec_cipher_be __builtin_altivec_crypto_vcipher
+#define vec_cipherlast_be __builtin_altivec_crypto_vcipherlast
+#define vec_ncipher_be __builtin_altivec_crypto_vncipher
+#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vsbox(vector unsigned long long __a) {
+  return __builtin_altivec_crypto_vsbox(__a);
+}
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipher(vector unsigned long long __a,
+                         vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vcipher(__a, __b);
+}
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipherlast(vector unsigned long long __a,
+                             vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vcipherlast(__a, __b);
+}
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipher(vector unsigned long long __a,
+                          vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vncipher(__a, __b);
+}
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipherlast(vector unsigned long long __a,
+                              vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vncipherlast(__a, __b);
+}
+
+#define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
+#define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
+
+#define vec_shasigma_be(X, Y, Z)                                               \
+  _Generic((X), vector unsigned int                                            \
+           : __builtin_crypto_vshasigmaw, vector unsigned long long            \
+           : __builtin_crypto_vshasigmad)((X), (Y), (Z))
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool char __ATTRS_o_ai
+vec_permxor(vector bool char __a, vector bool char __b,
+            vector bool char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_permxor(vector signed char __a, vector signed char __b,
+            vector signed char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_permxor(vector unsigned char __a, vector unsigned char __b,
+            vector unsigned char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned char __a, vector unsigned char __b,
+                          vector unsigned char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned short __a, vector unsigned short __b,
+                          vector unsigned short __c) {
+  return (vector unsigned short)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai __builtin_crypto_vpermxor(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
+  return (vector unsigned int)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned long long __a,
+                          vector unsigned long long __b,
+                          vector unsigned long long __c) {
+  return (vector unsigned long long)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_crypto_vpmsumb(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_crypto_vpmsumh(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_crypto_vpmsumw(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned long long __a,
+                         vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vpmsumd(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vgbbd(vector signed char __a) {
+  return __builtin_altivec_vgbbd((vector unsigned char)__a);
+}
+
+#define vec_pmsum_be __builtin_crypto_vpmsumb
+#define vec_gb __builtin_altivec_vgbbd
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vgbbd(vector unsigned char __a) {
+  return __builtin_altivec_vgbbd(__a);
+}
+
+static __inline__ vector long long __ATTRS_o_ai
+vec_vbpermq(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vbpermq((vector unsigned char)__a,
+                                   (vector unsigned char)__b);
+}
+
+static __inline__ vector long long __ATTRS_o_ai
+vec_vbpermq(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermq(__a, __b);
+}
+
+#ifdef __powerpc64__
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+vec_bperm(vector unsigned __int128 __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermq((vector unsigned char)__a,
+                                   (vector unsigned char)__b);
+}
+#endif
+#endif
+
+
+/* vec_reve */
+
+static inline __ATTRS_o_ai vector bool char vec_reve(vector bool char __a) {
+  return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                                 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed char vec_reve(vector signed char __a) {
+  return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                                 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_reve(vector unsigned char __a) {
+  return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                                 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool int vec_reve(vector bool int __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int vec_reve(vector signed int __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_reve(vector unsigned int __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool short vec_reve(vector bool short __a) {
+  return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_reve(vector signed short __a) {
+  return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_reve(vector unsigned short __a) {
+  return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector float vec_reve(vector float __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector bool long long
+vec_reve(vector bool long long __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_reve(vector signed long long __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_reve(vector unsigned long long __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector double vec_reve(vector double __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+#endif
+
+/* vec_revb */
+static __inline__ vector bool char __ATTRS_o_ai
+vec_revb(vector bool char __a) {
+  return __a;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_revb(vector signed char __a) {
+  return __a;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_revb(vector unsigned char __a) {
+  return __a;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_revb(vector bool short __a) {
+  vector unsigned char __indices =
+      { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_revb(vector signed short __a) {
+  vector unsigned char __indices =
+      { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_revb(vector unsigned short __a) {
+  vector unsigned char __indices =
+     { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_revb(vector bool int __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_revb(vector signed int __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_revb(vector unsigned int __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_revb(vector float __a) {
+ vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_revb(vector bool long long __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_revb(vector signed long long __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_revb(vector unsigned long long __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_revb(vector double __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+#endif /* End __VSX__ */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_revb(vector signed __int128 __a) {
+  vector unsigned char __indices =
+      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  return (vector signed __int128)vec_perm((vector signed int)__a,
+                                          (vector signed int)__a,
+                                           __indices);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_revb(vector unsigned __int128 __a) {
+  vector unsigned char __indices =
+      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  return (vector unsigned __int128)vec_perm((vector signed int)__a,
+                                            (vector signed int)__a,
+                                             __indices);
+}
+#endif /* END __POWER8_VECTOR__ && __powerpc64__ */
+
+/* vec_xl */
+
+static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset,
+                                                     signed char *__ptr) {
+  return *(vector signed char *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xl(signed long long __offset, unsigned char *__ptr) {
+  return *(vector unsigned char *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset,
+                                                      signed short *__ptr) {
+  return *(vector signed short *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xl(signed long long __offset, unsigned short *__ptr) {
+  return *(vector unsigned short *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset,
+                                                    signed int *__ptr) {
+  return *(vector signed int *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset,
+                                                      unsigned int *__ptr) {
+  return *(vector unsigned int *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset,
+                                               float *__ptr) {
+  return *(vector float *)(__ptr + __offset);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector signed long long
+vec_xl(signed long long __offset, signed long long *__ptr) {
+  return *(vector signed long long *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xl(signed long long __offset, unsigned long long *__ptr) {
+  return *(vector unsigned long long *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset,
+                                                double *__ptr) {
+  return *(vector double *)(__ptr + __offset);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai vector signed __int128
+vec_xl(signed long long __offset, signed __int128 *__ptr) {
+  return *(vector signed __int128 *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned __int128
+vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
+  return *(vector unsigned __int128 *)(__ptr + __offset);
+}
+#endif
+
+/* vec_xl_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed char *__ptr) {
+  vector signed char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                                 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned char *__ptr) {
+  vector unsigned char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                                 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector signed short  __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed short *__ptr) {
+  vector signed short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned short *__ptr) {
+  vector unsigned short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, signed int *__ptr) {
+  return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, unsigned int *__ptr) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, float *__ptr) {
+  return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, signed long long *__ptr) {
+  return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, unsigned long long *__ptr) {
+  return (vector unsigned long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, double *__ptr) {
+  return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, signed __int128 *__ptr) {
+  return vec_xl(__offset, __ptr);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, unsigned __int128 *__ptr) {
+  return vec_xl(__offset, __ptr);
+}
+#endif
+#else
+  #define vec_xl_be vec_xl
+#endif
+
+/* vec_xst */
+
+static inline __ATTRS_o_ai void vec_xst(vector signed char __vec,
+                                        signed long long __offset,
+                                        signed char *__ptr) {
+  *(vector signed char *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned char __vec,
+                                        signed long long __offset,
+                                        unsigned char *__ptr) {
+  *(vector unsigned char *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector signed short __vec,
+                                        signed long long __offset,
+                                        signed short *__ptr) {
+  *(vector signed short *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned short __vec,
+                                        signed long long __offset,
+                                        unsigned short *__ptr) {
+  *(vector unsigned short *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector signed int __vec,
+                                        signed long long __offset,
+                                        signed int *__ptr) {
+  *(vector signed int *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned int __vec,
+                                        signed long long __offset,
+                                        unsigned int *__ptr) {
+  *(vector unsigned int *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector float __vec,
+                                        signed long long __offset,
+                                        float *__ptr) {
+  *(vector float *)(__ptr + __offset) = __vec;
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai void vec_xst(vector signed long long __vec,
+                                        signed long long __offset,
+                                        signed long long *__ptr) {
+  *(vector signed long long *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned long long __vec,
+                                        signed long long __offset,
+                                        unsigned long long *__ptr) {
+  *(vector unsigned long long *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector double __vec,
+                                        signed long long __offset,
+                                        double *__ptr) {
+  *(vector double *)(__ptr + __offset) = __vec;
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai void vec_xst(vector signed __int128 __vec,
+                                        signed long long __offset,
+                                        signed __int128 *__ptr) {
+  *(vector signed __int128 *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec,
+                                        signed long long __offset,
+                                        unsigned __int128 *__ptr) {
+  *(vector unsigned __int128 *)(__ptr + __offset) = __vec;
+}
+#endif
+
+/* vec_xst_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed char __vec,
+                                               signed long long  __offset,
+                                               signed char *__ptr) {
+  vector signed char __tmp =
+     __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                             13, 12, 11, 10, 9, 8);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec,
+                                               signed long long  __offset,
+                                               unsigned char *__ptr) {
+  vector unsigned char __tmp =
+     __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                             13, 12, 11, 10, 9, 8);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec,
+                                               signed long long  __offset,
+                                               signed short *__ptr) {
+  vector signed short __tmp =
+     __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec,
+                                               signed long long  __offset,
+                                               unsigned short *__ptr) {
+  vector unsigned short __tmp =
+     __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec,
+                                               signed long long  __offset,
+                                               signed int *__ptr) {
+  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned int __vec,
+                                               signed long long  __offset,
+                                               unsigned int *__ptr) {
+  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector float __vec,
+                                               signed long long  __offset,
+                                               float *__ptr) {
+  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed long long __vec,
+                                               signed long long  __offset,
+                                               signed long long *__ptr) {
+  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned long long __vec,
+                                               signed long long  __offset,
+                                               unsigned long long *__ptr) {
+  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector double __vec,
+                                               signed long long  __offset,
+                                               double *__ptr) {
+  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed __int128 __vec,
+                                               signed long long  __offset,
+                                               signed __int128 *__ptr) {
+  vec_xst(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned __int128 __vec,
+                                               signed long long  __offset,
+                                               unsigned __int128 *__ptr) {
+  vec_xst(__vec, __offset, __ptr);
+}
+#endif
+#else
+  #define vec_xst_be vec_xst
+#endif
+
+#ifdef __POWER9_VECTOR__
+#define vec_test_data_class(__a, __b)                                      \
+        _Generic((__a),                                                    \
+           vector float:                                                   \
+             (vector bool int)__builtin_vsx_xvtstdcsp((__a), (__b)),       \
+           vector double:                                                  \
+             (vector bool long long)__builtin_vsx_xvtstdcdp((__a), (__b))  \
+        )
+
+#endif /* #ifdef __POWER9_VECTOR__ */
+
+static vector float __ATTRS_o_ai vec_neg(vector float __a) {
+  return -__a;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_neg(vector double __a) {
+  return -__a;
+}
+
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector long long __ATTRS_o_ai vec_neg(vector long long __a) {
+  return -__a;
+}
+#endif
+
+static vector signed int __ATTRS_o_ai vec_neg(vector signed int __a) {
+  return -__a;
+}
+
+static vector signed short __ATTRS_o_ai vec_neg(vector signed short __a) {
+  return -__a;
+}
+
+static vector signed char __ATTRS_o_ai vec_neg(vector signed char __a) {
+  return -__a;
+}
+
+static vector float __ATTRS_o_ai vec_nabs(vector float __a) {
+  return - vec_abs(__a);
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_nabs(vector double __a) {
+  return - vec_abs(__a);
+}
+
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector long long __ATTRS_o_ai vec_nabs(vector long long __a) {
+  return __builtin_altivec_vminsd(__a, -__a);
+}
+#endif
+
+static vector signed int __ATTRS_o_ai vec_nabs(vector signed int __a) {
+  return __builtin_altivec_vminsw(__a, -__a);
+}
+
+static vector signed short __ATTRS_o_ai vec_nabs(vector signed short __a) {
+  return __builtin_altivec_vminsh(__a, -__a);
+}
+
+static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) {
+  return __builtin_altivec_vminsb(__a, -__a);
+}
+#undef __ATTRS_o_ai
+
+#endif /* __ALTIVEC_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/ammintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/ammintrin.h
new file mode 100644
index 000000000..2843a7a26
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/ammintrin.h
@@ -0,0 +1,193 @@
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param x
+///    The value from which bits are extracted.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter \a x are extracted. If the length is zero but the
+///    index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
+///    extracted from the source operand.
+#define _mm_extracti_si64(x, len, idx) \
+  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+                                  (char)(len), (char)(idx)))
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index and of the length specified by
+///    \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param __x
+///    The value from which bits are extracted.
+/// \param __y
+///    Specifies the index of the least significant bit at [13:8] and the
+///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
+///    length is interpreted as 64. If the sum of the index and length is
+///    greater than 64, the result is undefined. If the length and index are
+///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
+///    from the source operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    \a y into the lower 64 bits of the destination integer vector \a x at
+///    the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
+/// const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length \a len and by the index \a idx specifying the
+///    least significant bit.
+/// \param y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand \a y of length \a len.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter \a y are inserted into parameter \a x. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand \a x with the specified bitfields replaced by the
+///    lower bits of source operand \a y. The upper 64 bits of the return value
+///    are undefined.
+#define _mm_inserti_si64(x, y, len, idx) \
+  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+                                    (__v2di)(__m128i)(y), \
+                                    (char)(len), (char)(idx)))
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    \a __y into the lower 64 bits of the destination integer vector \a __x
+///    at the index and of the length specified by \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param __x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length and by the index of the least significant bit
+///    specified by operand \a __y.
+/// \param __y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand \a __y with length
+///    specified by bits [69:64]. These are inserted into the destination at the
+///    index specified by bits [77:72]; all other bits are ignored. If bits
+///    [69:64] are zero, the length is interpreted as 64. If the sum of the
+///    index and length is greater than 64, the result is undefined. If the
+///    length and index are both zero, bits [63:0] of parameter \a __y are
+///    inserted into parameter \a __x. If the length is zero but the index is
+///    non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand \a __x with the specified bitfields replaced by the
+///    lower bits of source operand \a __y. The upper 64 bits of the return
+///    value are undefined.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
+
+/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
+///
+/// \param __p
+///    The 64-bit memory location used to store the register value.
+/// \param __a
+///    The 64-bit double-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_sd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
+///    memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
+///
+/// \param __p
+///    The 32-bit memory location used to store the register value.
+/// \param __a
+///    The 32-bit single-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ss(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AMMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/arm_acle.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/arm_acle.h
new file mode 100644
index 000000000..8423e62a3
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/arm_acle.h
@@ -0,0 +1,312 @@
+/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_ACLE_H
+#define __ARM_ACLE_H
+
+#ifndef __ARM_ACLE
+#error "ACLE intrinsics support not enabled."
+#endif
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
+/* 8.3 Memory barriers */
+#if !defined(_MSC_VER)
+#define __dmb(i) __builtin_arm_dmb(i)
+#define __dsb(i) __builtin_arm_dsb(i)
+#define __isb(i) __builtin_arm_isb(i)
+#endif
+
+/* 8.4 Hints */
+
+#if !defined(_MSC_VER)
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
+  __builtin_arm_wfi();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
+  __builtin_arm_wfe();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
+  __builtin_arm_sev();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
+  __builtin_arm_sevl();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
+  __builtin_arm_yield();
+}
+#endif
+
+#if __ARM_32BIT_STATE
+#define __dbg(t) __builtin_arm_dbg(t)
+#endif
+
+/* 8.5 Swap */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__swp(uint32_t __x, volatile uint32_t *__p) {
+  uint32_t v;
+  do
+    v = __builtin_arm_ldrex(__p);
+  while (__builtin_arm_strex(__x, __p));
+  return v;
+}
+
+/* 8.6 Memory prefetch intrinsics */
+/* 8.6.1 Data prefetch */
+#define __pld(addr) __pldx(0, 0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, 1)
+#else
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
+#endif
+
+/* 8.6.2 Instruction prefetch */
+#define __pli(addr) __plix(0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, 0)
+#else
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
+#endif
+
+/* 8.7 NOP */
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
+  __builtin_arm_nop();
+}
+
+/* 9 DATA-PROCESSING INTRINSICS */
+/* 9.2 Miscellaneous data-processing intrinsics */
+/* ROR */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__ror(uint32_t __x, uint32_t __y) {
+  __y %= 32;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (32 - __y));
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rorll(uint64_t __x, uint32_t __y) {
+  __y %= 64;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (64 - __y));
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rorl(unsigned long __x, uint32_t __y) {
+#if __SIZEOF_LONG__ == 4
+  return __ror(__x, __y);
+#else
+  return __rorll(__x, __y);
+#endif
+}
+
+
+/* CLZ */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clz(uint32_t __t) {
+  return __builtin_clz(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__clzl(unsigned long __t) {
+  return __builtin_clzl(__t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__clzll(uint64_t __t) {
+  return __builtin_clzll(__t);
+}
+
+/* REV */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rev(uint32_t __t) {
+  return __builtin_bswap32(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__revl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_bswap32(__t);
+#else
+  return __builtin_bswap64(__t);
+#endif
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__revll(uint64_t __t) {
+  return __builtin_bswap64(__t);
+}
+
+/* REV16 */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rev16(uint32_t __t) {
+  return __ror(__rev(__t), 16);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rev16ll(uint64_t __t) {
+  return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rev16l(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+    return __rev16(__t);
+#else
+    return __rev16ll(__t);
+#endif
+}
+
+/* REVSH */
+static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
+__revsh(int16_t __t) {
+  return __builtin_bswap16(__t);
+}
+
+/* RBIT */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rbit(uint32_t __t) {
+  return __builtin_arm_rbit(__t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rbitll(uint64_t __t) {
+#if __ARM_32BIT_STATE
+  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
+         __builtin_arm_rbit(__t >> 32);
+#else
+  return __builtin_arm_rbit64(__t);
+#endif
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rbitl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __rbit(__t);
+#else
+  return __rbitll(__t);
+#endif
+}
+
+/*
+ * 9.4 Saturating intrinsics
+ *
+ * FIXME: Change guard to their corrosponding __ARM_FEATURE flag when Q flag
+ * intrinsics are implemented and the flag is enabled.
+ */
+/* 9.4.1 Width-specified saturation intrinsics */
+#if __ARM_32BIT_STATE
+#define __ssat(x, y) __builtin_arm_ssat(x, y)
+#define __usat(x, y) __builtin_arm_usat(x, y)
+#endif
+
+/* 9.4.2 Saturating addition and subtraction intrinsics */
+#if __ARM_32BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qadd(int32_t __t, int32_t __v) {
+  return __builtin_arm_qadd(__t, __v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qsub(int32_t __t, int32_t __v) {
+  return __builtin_arm_qsub(__t, __v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qdbl(int32_t __t) {
+  return __builtin_arm_qadd(__t, __t);
+}
+#endif
+
+/* 9.7 CRC32 intrinsics */
+#if __ARM_FEATURE_CRC32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32b(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32b(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32h(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32h(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32w(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32w(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32d(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32d(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cb(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32cb(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32ch(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32ch(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cw(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32cw(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cd(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32cd(__a, __b);
+}
+#endif
+
+/* 10.1 Special register intrinsics */
+#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
+#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
+#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
+#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
+#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
+#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __ARM_ACLE_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/arm_neon.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/arm_neon.h
new file mode 100644
index 000000000..f5ca59bb3
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/arm_neon.h
@@ -0,0 +1,69231 @@
+/*===---- arm_neon.h - ARM Neon intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_NEON_H
+#define __ARM_NEON_H
+
+#if !defined(__ARM_NEON)
+#error "NEON support not enabled"
+#endif
+
+#include <stdint.h>
+
+typedef float float32_t;
+typedef __fp16 float16_t;
+#ifdef __aarch64__
+typedef double float64_t;
+#endif
+
+#ifdef __aarch64__
+typedef uint8_t poly8_t;
+typedef uint16_t poly16_t;
+typedef uint64_t poly64_t;
+typedef __uint128_t poly128_t;
+#else
+typedef int8_t poly8_t;
+typedef int16_t poly16_t;
+#endif
+typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
+typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
+typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
+typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
+typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
+typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
+typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
+typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
+typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
+typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
+typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
+typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
+typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
+typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
+typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
+typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
+typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
+typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
+typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
+#ifdef __aarch64__
+typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
+typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
+#endif
+typedef __attribute__((neon_polyvector_type(8))) poly8_t poly8x8_t;
+typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
+typedef __attribute__((neon_polyvector_type(4))) poly16_t poly16x4_t;
+typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t;
+#ifdef __aarch64__
+typedef __attribute__((neon_polyvector_type(1))) poly64_t poly64x1_t;
+typedef __attribute__((neon_polyvector_type(2))) poly64_t poly64x2_t;
+#endif
+
+typedef struct int8x8x2_t {
+  int8x8_t val[2];
+} int8x8x2_t;
+
+typedef struct int8x16x2_t {
+  int8x16_t val[2];
+} int8x16x2_t;
+
+typedef struct int16x4x2_t {
+  int16x4_t val[2];
+} int16x4x2_t;
+
+typedef struct int16x8x2_t {
+  int16x8_t val[2];
+} int16x8x2_t;
+
+typedef struct int32x2x2_t {
+  int32x2_t val[2];
+} int32x2x2_t;
+
+typedef struct int32x4x2_t {
+  int32x4_t val[2];
+} int32x4x2_t;
+
+typedef struct int64x1x2_t {
+  int64x1_t val[2];
+} int64x1x2_t;
+
+typedef struct int64x2x2_t {
+  int64x2_t val[2];
+} int64x2x2_t;
+
+typedef struct uint8x8x2_t {
+  uint8x8_t val[2];
+} uint8x8x2_t;
+
+typedef struct uint8x16x2_t {
+  uint8x16_t val[2];
+} uint8x16x2_t;
+
+typedef struct uint16x4x2_t {
+  uint16x4_t val[2];
+} uint16x4x2_t;
+
+typedef struct uint16x8x2_t {
+  uint16x8_t val[2];
+} uint16x8x2_t;
+
+typedef struct uint32x2x2_t {
+  uint32x2_t val[2];
+} uint32x2x2_t;
+
+typedef struct uint32x4x2_t {
+  uint32x4_t val[2];
+} uint32x4x2_t;
+
+typedef struct uint64x1x2_t {
+  uint64x1_t val[2];
+} uint64x1x2_t;
+
+typedef struct uint64x2x2_t {
+  uint64x2_t val[2];
+} uint64x2x2_t;
+
+typedef struct float16x4x2_t {
+  float16x4_t val[2];
+} float16x4x2_t;
+
+typedef struct float16x8x2_t {
+  float16x8_t val[2];
+} float16x8x2_t;
+
+typedef struct float32x2x2_t {
+  float32x2_t val[2];
+} float32x2x2_t;
+
+typedef struct float32x4x2_t {
+  float32x4_t val[2];
+} float32x4x2_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x2_t {
+  float64x1_t val[2];
+} float64x1x2_t;
+
+typedef struct float64x2x2_t {
+  float64x2_t val[2];
+} float64x2x2_t;
+
+#endif
+typedef struct poly8x8x2_t {
+  poly8x8_t val[2];
+} poly8x8x2_t;
+
+typedef struct poly8x16x2_t {
+  poly8x16_t val[2];
+} poly8x16x2_t;
+
+typedef struct poly16x4x2_t {
+  poly16x4_t val[2];
+} poly16x4x2_t;
+
+typedef struct poly16x8x2_t {
+  poly16x8_t val[2];
+} poly16x8x2_t;
+
+#ifdef __aarch64__
+typedef struct poly64x1x2_t {
+  poly64x1_t val[2];
+} poly64x1x2_t;
+
+typedef struct poly64x2x2_t {
+  poly64x2_t val[2];
+} poly64x2x2_t;
+
+#endif
+typedef struct int8x8x3_t {
+  int8x8_t val[3];
+} int8x8x3_t;
+
+typedef struct int8x16x3_t {
+  int8x16_t val[3];
+} int8x16x3_t;
+
+typedef struct int16x4x3_t {
+  int16x4_t val[3];
+} int16x4x3_t;
+
+typedef struct int16x8x3_t {
+  int16x8_t val[3];
+} int16x8x3_t;
+
+typedef struct int32x2x3_t {
+  int32x2_t val[3];
+} int32x2x3_t;
+
+typedef struct int32x4x3_t {
+  int32x4_t val[3];
+} int32x4x3_t;
+
+typedef struct int64x1x3_t {
+  int64x1_t val[3];
+} int64x1x3_t;
+
+typedef struct int64x2x3_t {
+  int64x2_t val[3];
+} int64x2x3_t;
+
+typedef struct uint8x8x3_t {
+  uint8x8_t val[3];
+} uint8x8x3_t;
+
+typedef struct uint8x16x3_t {
+  uint8x16_t val[3];
+} uint8x16x3_t;
+
+typedef struct uint16x4x3_t {
+  uint16x4_t val[3];
+} uint16x4x3_t;
+
+typedef struct uint16x8x3_t {
+  uint16x8_t val[3];
+} uint16x8x3_t;
+
+typedef struct uint32x2x3_t {
+  uint32x2_t val[3];
+} uint32x2x3_t;
+
+typedef struct uint32x4x3_t {
+  uint32x4_t val[3];
+} uint32x4x3_t;
+
+typedef struct uint64x1x3_t {
+  uint64x1_t val[3];
+} uint64x1x3_t;
+
+typedef struct uint64x2x3_t {
+  uint64x2_t val[3];
+} uint64x2x3_t;
+
+typedef struct float16x4x3_t {
+  float16x4_t val[3];
+} float16x4x3_t;
+
+typedef struct float16x8x3_t {
+  float16x8_t val[3];
+} float16x8x3_t;
+
+typedef struct float32x2x3_t {
+  float32x2_t val[3];
+} float32x2x3_t;
+
+typedef struct float32x4x3_t {
+  float32x4_t val[3];
+} float32x4x3_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x3_t {
+  float64x1_t val[3];
+} float64x1x3_t;
+
+typedef struct float64x2x3_t {
+  float64x2_t val[3];
+} float64x2x3_t;
+
+#endif
+typedef struct poly8x8x3_t {
+  poly8x8_t val[3];
+} poly8x8x3_t;
+
+typedef struct poly8x16x3_t {
+  poly8x16_t val[3];
+} poly8x16x3_t;
+
+typedef struct poly16x4x3_t {
+  poly16x4_t val[3];
+} poly16x4x3_t;
+
+typedef struct poly16x8x3_t {
+  poly16x8_t val[3];
+} poly16x8x3_t;
+
+#ifdef __aarch64__
+typedef struct poly64x1x3_t {
+  poly64x1_t val[3];
+} poly64x1x3_t;
+
+typedef struct poly64x2x3_t {
+  poly64x2_t val[3];
+} poly64x2x3_t;
+
+#endif
+typedef struct int8x8x4_t {
+  int8x8_t val[4];
+} int8x8x4_t;
+
+typedef struct int8x16x4_t {
+  int8x16_t val[4];
+} int8x16x4_t;
+
+typedef struct int16x4x4_t {
+  int16x4_t val[4];
+} int16x4x4_t;
+
+typedef struct int16x8x4_t {
+  int16x8_t val[4];
+} int16x8x4_t;
+
+typedef struct int32x2x4_t {
+  int32x2_t val[4];
+} int32x2x4_t;
+
+typedef struct int32x4x4_t {
+  int32x4_t val[4];
+} int32x4x4_t;
+
+typedef struct int64x1x4_t {
+  int64x1_t val[4];
+} int64x1x4_t;
+
+typedef struct int64x2x4_t {
+  int64x2_t val[4];
+} int64x2x4_t;
+
+typedef struct uint8x8x4_t {
+  uint8x8_t val[4];
+} uint8x8x4_t;
+
+typedef struct uint8x16x4_t {
+  uint8x16_t val[4];
+} uint8x16x4_t;
+
+typedef struct uint16x4x4_t {
+  uint16x4_t val[4];
+} uint16x4x4_t;
+
+typedef struct uint16x8x4_t {
+  uint16x8_t val[4];
+} uint16x8x4_t;
+
+typedef struct uint32x2x4_t {
+  uint32x2_t val[4];
+} uint32x2x4_t;
+
+typedef struct uint32x4x4_t {
+  uint32x4_t val[4];
+} uint32x4x4_t;
+
+typedef struct uint64x1x4_t {
+  uint64x1_t val[4];
+} uint64x1x4_t;
+
+typedef struct uint64x2x4_t {
+  uint64x2_t val[4];
+} uint64x2x4_t;
+
+typedef struct float16x4x4_t {
+  float16x4_t val[4];
+} float16x4x4_t;
+
+typedef struct float16x8x4_t {
+  float16x8_t val[4];
+} float16x8x4_t;
+
+typedef struct float32x2x4_t {
+  float32x2_t val[4];
+} float32x2x4_t;
+
+typedef struct float32x4x4_t {
+  float32x4_t val[4];
+} float32x4x4_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x4_t {
+  float64x1_t val[4];
+} float64x1x4_t;
+
+typedef struct float64x2x4_t {
+  float64x2_t val[4];
+} float64x2x4_t;
+
+#endif
+typedef struct poly8x8x4_t {
+  poly8x8_t val[4];
+} poly8x8x4_t;
+
+typedef struct poly8x16x4_t {
+  poly8x16_t val[4];
+} poly8x16x4_t;
+
+typedef struct poly16x4x4_t {
+  poly16x4_t val[4];
+} poly16x4x4_t;
+
+typedef struct poly16x8x4_t {
+  poly16x8_t val[4];
+} poly16x8x4_t;
+
+#ifdef __aarch64__
+typedef struct poly64x1x4_t {
+  poly64x1_t val[4];
+} poly64x1x4_t;
+
+typedef struct poly64x2x4_t {
+  poly64x2_t val[4];
+} poly64x2x4_t;
+
+#endif
+
+#define __ai static inline __attribute__((__always_inline__, __nodebug__))
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vabdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vabdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x16_t __noswap_vabdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabdq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vabdq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vabdq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabdq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vabdq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vabdq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vabdq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vabdq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x16_t __noswap_vabdq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vabdq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vabdq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabdq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vabdq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vabdq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabdq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vabdq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vabdq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vabd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vabd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vabd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vabd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vabd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vabd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vabd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vabd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vabd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vabd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vabd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vabd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vabd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vabd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vabd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vabd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vabd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vabd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vabd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vabd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vabsq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vabsq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vabsq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vabsq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabsq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vabsq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabsq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vabsq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vabs_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabs_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vabs_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vabs_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vabs_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vabs_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vabs_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabs_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vabs_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vabs_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabs_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vabs_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vaddq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float32x4_t vaddq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vadd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float32x2_t vadd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vaddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vaddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vaddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vaddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vaddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vaddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vaddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vaddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vaddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vaddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vaddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vaddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vaddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vaddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vaddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vaddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vaddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vaddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vandq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vandq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vandq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vandq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vandq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vandq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vandq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vandq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vandq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vandq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vandq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vandq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vandq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vandq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vandq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vandq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vand_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vand_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vand_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vand_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vand_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vand_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vand_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vand_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vand_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vand_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vand_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vand_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vand_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vand_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vand_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vand_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vbicq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vbicq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vbicq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vbicq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vbicq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vbicq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vbicq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vbicq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vbicq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vbicq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vbicq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vbicq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vbicq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vbicq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vbicq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vbicq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vbic_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vbic_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vbic_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vbic_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vbic_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vbic_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vbic_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vbic_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vbic_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vbic_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vbic_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vbic_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vbic_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vbic_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vbic_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vbic_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vbsl_p8(uint8x8_t __p0, poly8x8_t __p1, poly8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vbsl_p8(uint8x8_t __p0, poly8x8_t __p1, poly8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vbsl_p16(uint16x4_t __p0, poly16x4_t __p1, poly16x4_t __p2) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 5);
+  return __ret;
+}
+#else
+__ai poly16x4_t vbsl_p16(uint16x4_t __p0, poly16x4_t __p1, poly16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vbslq_p8(uint8x16_t __p0, poly8x16_t __p1, poly8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vbslq_p8(uint8x16_t __p0, poly8x16_t __p1, poly8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vbslq_p16(uint16x8_t __p0, poly16x8_t __p1, poly16x8_t __p2) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 37);
+  return __ret;
+}
+#else
+__ai poly16x8_t vbslq_p16(uint16x8_t __p0, poly16x8_t __p1, poly16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 37);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vbslq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vbslq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vbslq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vbslq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vbslq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vbslq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vbslq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vbslq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vbslq_s8(uint8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vbslq_s8(uint8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vbslq_f32(uint32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vbslq_f32(uint32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vbslq_s32(uint32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vbslq_s32(uint32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vbslq_s64(uint64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vbslq_s64(uint64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vbslq_s16(uint16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vbslq_s16(uint16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vbsl_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vbsl_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vbsl_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vbsl_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vbsl_u64(uint64x1_t __p0, uint64x1_t __p1, uint64x1_t __p2) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vbsl_u64(uint64x1_t __p0, uint64x1_t __p1, uint64x1_t __p2) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vbsl_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vbsl_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vbsl_s8(uint8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vbsl_s8(uint8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vbsl_f32(uint32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vbsl_f32(uint32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vbsl_s32(uint32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vbsl_s32(uint32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vbsl_s64(uint64x1_t __p0, int64x1_t __p1, int64x1_t __p2) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vbsl_s64(uint64x1_t __p0, int64x1_t __p1, int64x1_t __p2) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vbsl_s16(uint16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vbsl_s16(uint16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcageq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcageq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcageq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcageq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcage_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcage_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcage_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcagtq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcagtq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcagtq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcagtq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcagt_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcagt_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcagt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcaleq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcaleq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcaleq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcaleq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcale_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcale_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcale_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcaltq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcaltq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcaltq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcaltq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcalt_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcalt_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcalt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceq_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceq_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceq_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceq_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceq_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceq_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceq_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceq_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceq_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceq_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceq_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceq_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceq_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceq_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceq_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceq_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgeq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgeq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgeq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgeq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgeq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgeq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgeq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgeq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgeq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgeq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgeq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgeq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgeq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgeq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcge_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcge_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcge_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcge_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcge_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcge_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcge_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcge_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcge_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcge_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcge_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcge_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcge_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcge_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgtq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgtq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgtq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgtq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgtq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgtq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgtq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgtq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcgt_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcgt_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgt_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgt_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgt_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgt_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcgt_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcgt_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgt_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgt_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgt_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgt_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgt_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgt_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcleq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcleq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcleq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcleq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcleq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcleq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcleq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcleq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcleq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcleq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcleq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcleq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcleq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcleq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcle_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcle_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcle_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcle_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcle_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcle_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcle_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcle_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcle_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcle_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcle_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcle_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcle_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcle_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vclsq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vclsq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vclsq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vclsq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vclsq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vclsq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vcls_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vcls_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vcls_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcls_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcls_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcls_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vcls_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcls_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vcls_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcltq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcltq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcltq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcltq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcltq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcltq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcltq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcltq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vclt_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vclt_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclt_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclt_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclt_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclt_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vclt_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vclt_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclt_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclt_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclt_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclt_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclt_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclt_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vclzq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vclzq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vclzq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vclzq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vclzq_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vclzq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vclzq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vclzq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vclzq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vclzq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vclzq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vclzq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vclz_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vclz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vclz_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclz_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclz_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclz_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclz_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vclz_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vclz_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vclz_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vclz_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vclz_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vclz_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vclz_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vclz_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vclz_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vcnt_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vcnt_v((int8x8_t)__p0, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vcnt_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vcnt_v((int8x8_t)__rev0, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vcntq_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vcntq_v((int8x16_t)__p0, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vcntq_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vcntq_v((int8x16_t)__rev0, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcntq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcntq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcntq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcntq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vcntq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vcntq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vcntq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vcntq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcnt_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcnt_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcnt_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcnt_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vcnt_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vcnt_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vcnt_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vcnt_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vcombine_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai poly8x16_t vcombine_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vcombine_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai poly16x8_t vcombine_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x16_t __noswap_vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcombine_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcombine_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcombine_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcombine_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vcombine_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vcombine_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai int8x16_t vcombine_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x16_t __noswap_vcombine_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcombine_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai float32x4_t vcombine_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float32x4_t __noswap_vcombine_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vcombine_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai float16x8_t vcombine_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x8_t __noswap_vcombine_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcombine_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai int32x4_t vcombine_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vcombine_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcombine_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  return __ret;
+}
+#else
+__ai int64x2_t vcombine_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vcombine_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai int16x8_t vcombine_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vcombine_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vcreate_p8(uint64_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vcreate_p8(uint64_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vcreate_p16(uint64_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vcreate_p16(uint64_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcreate_u8(uint64_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcreate_u8(uint64_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcreate_u32(uint64_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcreate_u32(uint64_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcreate_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcreate_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcreate_u16(uint64_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcreate_u16(uint64_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vcreate_s8(uint64_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vcreate_s8(uint64_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcreate_f32(uint64_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vcreate_f32(uint64_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vcreate_f16(uint64_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vcreate_f16(uint64_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcreate_s32(uint64_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vcreate_s32(uint64_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcreate_s64(uint64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vcreate_s64(uint64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vcreate_s16(uint64_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vcreate_s16(uint64_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvtq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai float32x4_t vcvtq_f32_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvtq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai float32x4_t vcvtq_f32_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcvt_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai float32x2_t vcvt_f32_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcvt_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai float32x2_t vcvt_f32_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vcvt_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vcvt_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vcvtq_n_s32_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vcvtq_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vcvtq_n_s32_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vcvt_n_s32_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vcvt_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vcvt_n_s32_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vcvtq_n_u32_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vcvtq_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vcvtq_n_u32_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vcvt_n_u32_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vcvt_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vcvt_n_u32_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvt_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvt_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvt_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvt_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvt_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvt_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvt_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvt_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vdup_n_p8(poly8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly8x8_t vdup_n_p8(poly8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vdup_n_p16(poly16_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly16x4_t vdup_n_p16(poly16_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vdupq_n_p8(poly8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly8x16_t vdupq_n_p8(poly8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vdupq_n_p16(poly16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly16x8_t vdupq_n_p16(poly16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vdupq_n_u8(uint8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint8x16_t vdupq_n_u8(uint8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vdupq_n_u32(uint32_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint32x4_t vdupq_n_u32(uint32_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vdupq_n_u64(uint64_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai uint64x2_t vdupq_n_u64(uint64_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vdupq_n_u16(uint16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint16x8_t vdupq_n_u16(uint16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vdupq_n_s8(int8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int8x16_t vdupq_n_s8(int8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vdupq_n_f32(float32_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai float32x4_t vdupq_n_f32(float32_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vdupq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vdupq_n_s32(int32_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int32x4_t vdupq_n_s32(int32_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vdupq_n_s64(int64_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai int64x2_t vdupq_n_s64(int64_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vdupq_n_s16(int16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int16x8_t vdupq_n_s16(int16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vdup_n_u8(uint8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint8x8_t vdup_n_u8(uint8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vdup_n_u32(uint32_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai uint32x2_t vdup_n_u32(uint32_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vdup_n_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai uint64x1_t vdup_n_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vdup_n_u16(uint16_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint16x4_t vdup_n_u16(uint16_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vdup_n_s8(int8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int8x8_t vdup_n_s8(int8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vdup_n_f32(float32_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai float32x2_t vdup_n_f32(float32_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vdup_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vdup_n_s32(int32_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai int32x2_t vdup_n_s32(int32_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vdup_n_s64(int64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai int64x1_t vdup_n_s64(int64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vdup_n_s16(int16_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int16x4_t vdup_n_s16(int16_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t veorq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t veorq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t veorq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t veorq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t veorq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t veorq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t veorq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t veorq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t veorq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t veorq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t veorq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t veorq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t veorq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t veorq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t veorq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t veorq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t veor_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t veor_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t veor_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t veor_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t veor_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t veor_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t veor_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t veor_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t veor_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t veor_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t veor_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t veor_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t veor_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t veor_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t veor_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t veor_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
+  __ret; \
+})
+#else
+#define vext_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
+  __ret; \
+})
+#else
+#define vext_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
+  __ret; \
+})
+#else
+#define vextq_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
+  __ret; \
+})
+#else
+#define vextq_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vextq_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vextq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vextq_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vextq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vextq_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 41); \
+  __ret; \
+})
+#else
+#define vextq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vextq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vextq_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vextq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vext_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vext_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vext_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vext_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vext_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 9); \
+  __ret; \
+})
+#else
+#define vext_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vext_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vext_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vext_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vget_high_p8(poly8x16_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai poly8x8_t vget_high_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai poly8x8_t __noswap_vget_high_p8(poly8x16_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vget_high_p16(poly16x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai poly16x4_t vget_high_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vget_high_u8(uint8x16_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai uint8x8_t vget_high_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vget_high_u8(uint8x16_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vget_high_u32(uint32x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#else
+__ai uint32x2_t vget_high_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vget_high_u32(uint32x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vget_high_u64(uint64x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vget_high_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vget_high_u16(uint16x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai uint16x4_t vget_high_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vget_high_u16(uint16x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vget_high_s8(int8x16_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai int8x8_t vget_high_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vget_high_s8(int8x16_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vget_high_f32(float32x4_t __p0) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#else
+__ai float32x2_t vget_high_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vget_high_f32(float32x4_t __p0) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vget_high_f16(float16x8_t __p0) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai float16x4_t vget_high_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x4_t __noswap_vget_high_f16(float16x8_t __p0) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vget_high_s32(int32x4_t __p0) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#else
+__ai int32x2_t vget_high_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vget_high_s32(int32x4_t __p0) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vget_high_s64(int64x2_t __p0) {
+  int64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#else
+__ai int64x1_t vget_high_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vget_high_s16(int16x8_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai int16x4_t vget_high_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vget_high_s16(int16x8_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vget_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vget_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vgetq_lane_f32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vgetq_lane_f32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vgetq_lane_f32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vget_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vget_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vget_lane_i32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vget_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vget_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vget_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vget_lane_f32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vget_lane_f32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vget_lane_f32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vget_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vget_lane_i32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vget_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vget_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vget_low_p8(poly8x16_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai poly8x8_t vget_low_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vget_low_p16(poly16x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai poly16x4_t vget_low_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vget_low_u8(uint8x16_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai uint8x8_t vget_low_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vget_low_u32(uint32x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vget_low_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vget_low_u64(uint64x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vget_low_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vget_low_u16(uint16x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai uint16x4_t vget_low_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vget_low_s8(int8x16_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai int8x8_t vget_low_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vget_low_f32(float32x4_t __p0) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1);
+  return __ret;
+}
+#else
+__ai float32x2_t vget_low_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vget_low_f16(float16x8_t __p0) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai float16x4_t vget_low_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vget_low_s32(int32x4_t __p0) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1);
+  return __ret;
+}
+#else
+__ai int32x2_t vget_low_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vget_low_s64(int64x2_t __p0) {
+  int64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0);
+  return __ret;
+}
+#else
+__ai int64x1_t vget_low_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vget_low_s16(int16x8_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai int16x4_t vget_low_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vhadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vhadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vhadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vhadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vhadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vhadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vhsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vhsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vhsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vhsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vhsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vhsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vhsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vhsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vhsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vhsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vhsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vhsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vhsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vhsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vhsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vhsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vhsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vhsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vhsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vhsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vhsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vhsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8(__p0) __extension__ ({ \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_v(__p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8(__p0) __extension__ ({ \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_v(__p0, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16(__p0) __extension__ ({ \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_v(__p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16(__p0) __extension__ ({ \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_v(__p0, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8(__p0) __extension__ ({ \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_v(__p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8(__p0) __extension__ ({ \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_v(__p0, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16(__p0) __extension__ ({ \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_v(__p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16(__p0) __extension__ ({ \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_v(__p0, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8(__p0) __extension__ ({ \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_v(__p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8(__p0) __extension__ ({ \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_v(__p0, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32(__p0) __extension__ ({ \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_v(__p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32(__p0) __extension__ ({ \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_v(__p0, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64(__p0) __extension__ ({ \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_v(__p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64(__p0) __extension__ ({ \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_v(__p0, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16(__p0) __extension__ ({ \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_v(__p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16(__p0) __extension__ ({ \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_v(__p0, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8(__p0) __extension__ ({ \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_v(__p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8(__p0) __extension__ ({ \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_v(__p0, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32(__p0) __extension__ ({ \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_v(__p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32(__p0) __extension__ ({ \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_v(__p0, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16(__p0) __extension__ ({ \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_v(__p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16(__p0) __extension__ ({ \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_v(__p0, 40); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32(__p0) __extension__ ({ \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_v(__p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32(__p0) __extension__ ({ \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_v(__p0, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64(__p0) __extension__ ({ \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_v(__p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64(__p0) __extension__ ({ \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_v(__p0, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16(__p0) __extension__ ({ \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_v(__p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16(__p0) __extension__ ({ \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_v(__p0, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8(__p0) __extension__ ({ \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_v(__p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8(__p0) __extension__ ({ \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_v(__p0, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32(__p0) __extension__ ({ \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_v(__p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32(__p0) __extension__ ({ \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_v(__p0, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64(__p0) __extension__ ({ \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_v(__p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64(__p0) __extension__ ({ \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_v(__p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16(__p0) __extension__ ({ \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_v(__p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16(__p0) __extension__ ({ \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_v(__p0, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8(__p0) __extension__ ({ \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_v(__p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8(__p0) __extension__ ({ \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_v(__p0, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32(__p0) __extension__ ({ \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_v(__p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32(__p0) __extension__ ({ \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_v(__p0, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16(__p0) __extension__ ({ \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_v(__p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16(__p0) __extension__ ({ \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_v(__p0, 8); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32(__p0) __extension__ ({ \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_v(__p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32(__p0) __extension__ ({ \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_v(__p0, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64(__p0) __extension__ ({ \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_v(__p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64(__p0) __extension__ ({ \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_v(__p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16(__p0) __extension__ ({ \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_v(__p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16(__p0) __extension__ ({ \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_v(__p0, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_p8(__p0) __extension__ ({ \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_dup_v(__p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_dup_p8(__p0) __extension__ ({ \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_dup_v(__p0, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_p16(__p0) __extension__ ({ \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_dup_v(__p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_dup_p16(__p0) __extension__ ({ \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_dup_v(__p0, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_p8(__p0) __extension__ ({ \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_dup_v(__p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_dup_p8(__p0) __extension__ ({ \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_dup_v(__p0, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_p16(__p0) __extension__ ({ \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_dup_v(__p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_dup_p16(__p0) __extension__ ({ \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_dup_v(__p0, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_u8(__p0) __extension__ ({ \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_dup_v(__p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_dup_u8(__p0) __extension__ ({ \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_dup_v(__p0, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_u32(__p0) __extension__ ({ \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_dup_v(__p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_dup_u32(__p0) __extension__ ({ \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_dup_v(__p0, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_u64(__p0) __extension__ ({ \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_dup_v(__p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_dup_u64(__p0) __extension__ ({ \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_dup_v(__p0, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_u16(__p0) __extension__ ({ \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_dup_v(__p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_dup_u16(__p0) __extension__ ({ \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_dup_v(__p0, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_s8(__p0) __extension__ ({ \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_dup_v(__p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_dup_s8(__p0) __extension__ ({ \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_dup_v(__p0, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_f32(__p0) __extension__ ({ \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_dup_v(__p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_dup_f32(__p0) __extension__ ({ \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_dup_v(__p0, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_f16(__p0) __extension__ ({ \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_dup_v(__p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_dup_f16(__p0) __extension__ ({ \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_dup_v(__p0, 40); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_s32(__p0) __extension__ ({ \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_dup_v(__p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_dup_s32(__p0) __extension__ ({ \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_dup_v(__p0, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_s64(__p0) __extension__ ({ \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_dup_v(__p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_dup_s64(__p0) __extension__ ({ \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_dup_v(__p0, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_s16(__p0) __extension__ ({ \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_dup_v(__p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_dup_s16(__p0) __extension__ ({ \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_dup_v(__p0, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_u8(__p0) __extension__ ({ \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_dup_v(__p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_dup_u8(__p0) __extension__ ({ \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_dup_v(__p0, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_u32(__p0) __extension__ ({ \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_dup_v(__p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_dup_u32(__p0) __extension__ ({ \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_dup_v(__p0, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_u64(__p0) __extension__ ({ \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_dup_v(__p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_dup_u64(__p0) __extension__ ({ \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_dup_v(__p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_u16(__p0) __extension__ ({ \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_dup_v(__p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_dup_u16(__p0) __extension__ ({ \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_dup_v(__p0, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_s8(__p0) __extension__ ({ \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_dup_v(__p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_dup_s8(__p0) __extension__ ({ \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_dup_v(__p0, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_f32(__p0) __extension__ ({ \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_dup_v(__p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_dup_f32(__p0) __extension__ ({ \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_dup_v(__p0, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_f16(__p0) __extension__ ({ \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_dup_v(__p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_dup_f16(__p0) __extension__ ({ \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_dup_v(__p0, 8); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_s32(__p0) __extension__ ({ \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_dup_v(__p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_dup_s32(__p0) __extension__ ({ \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_dup_v(__p0, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_s64(__p0) __extension__ ({ \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_dup_v(__p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_dup_s64(__p0) __extension__ ({ \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_dup_v(__p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_s16(__p0) __extension__ ({ \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_dup_v(__p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_dup_s16(__p0) __extension__ ({ \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_dup_v(__p0, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 4); \
+  __ret; \
+})
+#else
+#define vld1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 5); \
+  __ret; \
+})
+#else
+#define vld1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 36); \
+  __ret; \
+})
+#else
+#define vld1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 37); \
+  __ret; \
+})
+#else
+#define vld1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vld1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vld1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vld1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vld1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vld1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 41); \
+  __ret; \
+})
+#else
+#define vld1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 40); \
+  __ret; \
+})
+#else
+#define vld1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 40); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vld1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vld1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vld1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vld1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vld1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vld1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vld1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vld1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 9); \
+  __ret; \
+})
+#else
+#define vld1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 8); \
+  __ret; \
+})
+#else
+#define vld1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 8); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vld1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vld1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vld1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_p8(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld2_p8(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_p16(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld2_p16(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld2q_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld2q_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld2q_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld2q_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld2q_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld2q_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld2q_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld2q_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld2q_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld2q_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_u8(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld2_u8(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_u32(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld2_u32(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_u64(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld2_u64(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_u16(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld2_u16(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_s8(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld2_s8(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_f32(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld2_f32(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_f16(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld2_f16(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_s32(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld2_s32(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_s64(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld2_s64(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_s16(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld2_s16(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_p8(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld2_dup_p8(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_p16(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld2_dup_p16(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_u8(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld2_dup_u8(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_u32(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld2_dup_u32(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_u64(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld2_dup_u64(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_u16(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld2_dup_u16(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_s8(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld2_dup_s8(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_f32(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld2_dup_f32(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_f16(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld2_dup_f16(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_s32(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld2_dup_s32(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_s64(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld2_dup_s64(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_s16(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld2_dup_s16(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 4); \
+  __ret; \
+})
+#else
+#define vld2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 5); \
+  __ret; \
+})
+#else
+#define vld2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 37); \
+  __ret; \
+})
+#else
+#define vld2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 50); \
+  __ret; \
+})
+#else
+#define vld2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 49); \
+  __ret; \
+})
+#else
+#define vld2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 41); \
+  __ret; \
+})
+#else
+#define vld2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 40); \
+  __ret; \
+})
+#else
+#define vld2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 34); \
+  __ret; \
+})
+#else
+#define vld2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 33); \
+  __ret; \
+})
+#else
+#define vld2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 16); \
+  __ret; \
+})
+#else
+#define vld2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 18); \
+  __ret; \
+})
+#else
+#define vld2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 17); \
+  __ret; \
+})
+#else
+#define vld2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 0); \
+  __ret; \
+})
+#else
+#define vld2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 9); \
+  __ret; \
+})
+#else
+#define vld2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 8); \
+  __ret; \
+})
+#else
+#define vld2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 2); \
+  __ret; \
+})
+#else
+#define vld2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 1); \
+  __ret; \
+})
+#else
+#define vld2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_p8(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld3_p8(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_p16(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld3_p16(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld3q_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld3q_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld3q_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld3q_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld3q_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld3q_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld3q_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld3q_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld3q_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld3q_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_u8(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld3_u8(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_u32(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld3_u32(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_u64(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld3_u64(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_u16(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld3_u16(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_s8(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld3_s8(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_f32(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld3_f32(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_f16(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld3_f16(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_s32(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld3_s32(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_s64(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld3_s64(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_s16(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld3_s16(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_p8(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld3_dup_p8(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_p16(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld3_dup_p16(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_u8(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld3_dup_u8(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_u32(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld3_dup_u32(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_u64(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld3_dup_u64(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_u16(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld3_dup_u16(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_s8(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld3_dup_s8(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_f32(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld3_dup_f32(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_f16(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld3_dup_f16(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_s32(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld3_dup_s32(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_s64(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld3_dup_s64(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_s16(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld3_dup_s16(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 4); \
+  __ret; \
+})
+#else
+#define vld3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 5); \
+  __ret; \
+})
+#else
+#define vld3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 37); \
+  __ret; \
+})
+#else
+#define vld3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 50); \
+  __ret; \
+})
+#else
+#define vld3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 49); \
+  __ret; \
+})
+#else
+#define vld3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 41); \
+  __ret; \
+})
+#else
+#define vld3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 40); \
+  __ret; \
+})
+#else
+#define vld3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 34); \
+  __ret; \
+})
+#else
+#define vld3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 33); \
+  __ret; \
+})
+#else
+#define vld3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 16); \
+  __ret; \
+})
+#else
+#define vld3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 18); \
+  __ret; \
+})
+#else
+#define vld3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 17); \
+  __ret; \
+})
+#else
+#define vld3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 0); \
+  __ret; \
+})
+#else
+#define vld3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 9); \
+  __ret; \
+})
+#else
+#define vld3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 8); \
+  __ret; \
+})
+#else
+#define vld3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 2); \
+  __ret; \
+})
+#else
+#define vld3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 1); \
+  __ret; \
+})
+#else
+#define vld3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_p8(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld4_p8(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_p16(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld4_p16(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld4q_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld4q_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld4q_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld4q_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld4q_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld4q_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld4q_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld4q_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld4q_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld4q_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_u8(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld4_u8(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_u32(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld4_u32(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_u64(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld4_u64(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_u16(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld4_u16(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_s8(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld4_s8(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_f32(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld4_f32(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_f16(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld4_f16(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_s32(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld4_s32(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_s64(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld4_s64(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_s16(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld4_s16(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_p8(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld4_dup_p8(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_p16(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld4_dup_p16(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_u8(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld4_dup_u8(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_u32(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld4_dup_u32(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_u64(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld4_dup_u64(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_u16(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld4_dup_u16(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_s8(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld4_dup_s8(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_f32(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld4_dup_f32(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_f16(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld4_dup_f16(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_s32(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld4_dup_s32(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_s64(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld4_dup_s64(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_s16(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld4_dup_s16(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 4); \
+  __ret; \
+})
+#else
+#define vld4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 5); \
+  __ret; \
+})
+#else
+#define vld4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 37); \
+  __ret; \
+})
+#else
+#define vld4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 50); \
+  __ret; \
+})
+#else
+#define vld4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 49); \
+  __ret; \
+})
+#else
+#define vld4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 41); \
+  __ret; \
+})
+#else
+#define vld4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 40); \
+  __ret; \
+})
+#else
+#define vld4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 34); \
+  __ret; \
+})
+#else
+#define vld4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 33); \
+  __ret; \
+})
+#else
+#define vld4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 16); \
+  __ret; \
+})
+#else
+#define vld4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 18); \
+  __ret; \
+})
+#else
+#define vld4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 17); \
+  __ret; \
+})
+#else
+#define vld4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 0); \
+  __ret; \
+})
+#else
+#define vld4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 9); \
+  __ret; \
+})
+#else
+#define vld4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 8); \
+  __ret; \
+})
+#else
+#define vld4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 2); \
+  __ret; \
+})
+#else
+#define vld4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 1); \
+  __ret; \
+})
+#else
+#define vld4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmax_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vmax_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmax_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vmax_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmax_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vmax_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmax_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vmax_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vminq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vminq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vminq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vminq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vminq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vminq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vminq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vminq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmin_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vmin_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmin_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vmin_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmin_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vmin_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmin_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vmin_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmlaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint8x16_t vmlaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmlaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int8x16_t vmlaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float32x4_t vmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int32x4_t vmlaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int16x8_t vmlaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmla_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint8x8_t vmla_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmla_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint32x2_t vmla_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmla_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint16x4_t vmla_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmla_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int8x8_t vmla_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float32x2_t vmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmla_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int32x2_t vmla_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmla_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int16x4_t vmla_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x8_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlaq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __p1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlaq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __rev1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlaq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + __p1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlaq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __rev1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmlaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __ret;
+  __ret = __p0 + __p1 * (float32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai float32x4_t vmlaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 + __rev1 * (float32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlaq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __p1 * (int32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int32x4_t vmlaq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __rev1 * (int32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlaq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + __p1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int16x8_t vmlaq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __rev1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmla_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 + __p1 * (uint32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai uint32x2_t vmla_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 + __rev1 * (uint32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmla_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 + __p1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint16x4_t vmla_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 + __rev1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmla_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __ret;
+  __ret = __p0 + __p1 * (float32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai float32x2_t vmla_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 + __rev1 * (float32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmla_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 + __p1 * (int32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai int32x2_t vmla_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 + __rev1 * (int32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmla_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 + __p1 * (int16x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int16x4_t vmla_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 + __rev1 * (int16x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmlsq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint8x16_t vmlsq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlsq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlsq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmlsq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int8x16_t vmlsq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmlsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float32x4_t vmlsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlsq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int16x8_t vmlsq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmls_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint8x8_t vmls_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmls_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint32x2_t vmls_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmls_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint16x4_t vmls_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmls_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int8x8_t vmls_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmls_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float32x2_t vmls_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmls_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int32x2_t vmls_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmls_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int16x4_t vmls_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x8_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - __p1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __rev1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlsq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 - __p1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlsq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __rev1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmlsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __ret;
+  __ret = __p0 - __p1 * (float32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai float32x4_t vmlsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 - __rev1 * (float32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - __p1 * (int32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __rev1 * (int32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlsq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 - __p1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int16x8_t vmlsq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __rev1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmls_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 - __p1 * (uint32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai uint32x2_t vmls_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 - __rev1 * (uint32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmls_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 - __p1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint16x4_t vmls_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 - __rev1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmls_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __ret;
+  __ret = __p0 - __p1 * (float32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai float32x2_t vmls_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 - __rev1 * (float32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmls_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 - __p1 * (int32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai int32x2_t vmls_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 - __rev1 * (int32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmls_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 - __p1 * (int16x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int16x4_t vmls_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 - __rev1 * (int16x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vmov_n_p8(poly8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly8x8_t vmov_n_p8(poly8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vmov_n_p16(poly16_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly16x4_t vmov_n_p16(poly16_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vmovq_n_p8(poly8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly8x16_t vmovq_n_p8(poly8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vmovq_n_p16(poly16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly16x8_t vmovq_n_p16(poly16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmovq_n_u8(uint8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint8x16_t vmovq_n_u8(uint8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmovq_n_u32(uint32_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint32x4_t vmovq_n_u32(uint32_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmovq_n_u64(uint64_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai uint64x2_t vmovq_n_u64(uint64_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmovq_n_u16(uint16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint16x8_t vmovq_n_u16(uint16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmovq_n_s8(int8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int8x16_t vmovq_n_s8(int8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmovq_n_f32(float32_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai float32x4_t vmovq_n_f32(float32_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmovq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vmovq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmovq_n_s32(int32_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int32x4_t vmovq_n_s32(int32_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmovq_n_s64(int64_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai int64x2_t vmovq_n_s64(int64_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmovq_n_s16(int16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int16x8_t vmovq_n_s16(int16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmov_n_u8(uint8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint8x8_t vmov_n_u8(uint8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmov_n_u32(uint32_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai uint32x2_t vmov_n_u32(uint32_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vmov_n_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai uint64x1_t vmov_n_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmov_n_u16(uint16_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint16x4_t vmov_n_u16(uint16_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmov_n_s8(int8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int8x8_t vmov_n_s8(int8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmov_n_f32(float32_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai float32x2_t vmov_n_f32(float32_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmov_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vmov_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmov_n_s32(int32_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai int32x2_t vmov_n_s32(int32_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vmov_n_s64(int64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai int64x1_t vmov_n_s64(int64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmov_n_s16(int16_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int16x4_t vmov_n_s16(int16_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmovl_u8(uint8x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmovl_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vmovl_u8(uint8x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 49);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmovl_u32(uint32x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmovl_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmovl_u32(uint32x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 51);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmovl_u16(uint16x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmovl_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmovl_u16(uint16x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 50);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmovl_s8(int8x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vmovl_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vmovl_s8(int8x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmovl_s32(int32x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vmovl_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmovl_s32(int32x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmovl_s16(int16x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vmovl_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmovl_s16(int16x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmovn_u32(uint32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vmovn_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vmovn_u32(uint32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmovn_u64(uint64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vmovn_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vmovn_u64(uint64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmovn_u16(uint16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vmovn_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vmovn_u16(uint16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmovn_s32(int32x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vmovn_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vmovn_s32(int32x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmovn_s64(int64x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vmovn_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vmovn_s64(int64x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmovn_s16(int16x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vmovn_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vmovn_s16(int16x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmulq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vmulq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmulq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vmulq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmulq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vmulq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmulq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vmulq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmulq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float32x4_t vmulq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmulq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vmulq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmulq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vmulq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmul_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vmul_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmul_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vmul_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmul_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vmul_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmul_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vmul_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmul_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float32x2_t vmul_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmul_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vmul_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmul_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vmul_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vmul_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vmul_v((int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vmul_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vmul_v((int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vmulq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vmulq_v((int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vmulq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vmulq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmulq_n_u32(uint32x4_t __p0, uint32_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 * (uint32x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai uint32x4_t vmulq_n_u32(uint32x4_t __p0, uint32_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 * (uint32x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmulq_n_u16(uint16x8_t __p0, uint16_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 * (uint16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai uint16x8_t vmulq_n_u16(uint16x8_t __p0, uint16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 * (uint16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmulq_n_f32(float32x4_t __p0, float32_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 * (float32x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai float32x4_t vmulq_n_f32(float32x4_t __p0, float32_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 * (float32x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmulq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 * (int32x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai int32x4_t vmulq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 * (int32x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmulq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 * (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai int16x8_t vmulq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 * (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmul_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 * (uint32x2_t) {__p1, __p1};
+  return __ret;
+}
+#else
+__ai uint32x2_t vmul_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 * (uint32x2_t) {__p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmul_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 * (uint16x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai uint16x4_t vmul_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 * (uint16x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmul_n_f32(float32x2_t __p0, float32_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 * (float32x2_t) {__p1, __p1};
+  return __ret;
+}
+#else
+__ai float32x2_t vmul_n_f32(float32x2_t __p0, float32_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 * (float32x2_t) {__p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmul_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 * (int32x2_t) {__p1, __p1};
+  return __ret;
+}
+#else
+__ai int32x2_t vmul_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 * (int32x2_t) {__p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmul_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 * (int16x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai int16x4_t vmul_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 * (int16x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vmull_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 37);
+  return __ret;
+}
+#else
+__ai poly16x8_t vmull_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 37);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai poly16x8_t __noswap_vmull_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 37);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmull_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmull_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vmull_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 49);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmull_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmull_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmull_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 51);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmull_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmull_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmull_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 50);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmull_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vmull_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vmull_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = vmull_u32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __noswap_vmull_u32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = vmull_u16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __noswap_vmull_u16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vmull_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vmull_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vmull_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vmull_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmull_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(uint32x2_t) {__p1, __p1}, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmull_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)(uint32x2_t) {__p1, __p1}, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmull_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(uint32x2_t) {__p1, __p1}, 51);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmull_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(uint16x4_t) {__p1, __p1, __p1, __p1}, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmull_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)(uint16x4_t) {__p1, __p1, __p1, __p1}, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmull_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(uint16x4_t) {__p1, __p1, __p1, __p1}, 50);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vmvn_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai poly8x8_t vmvn_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vmvnq_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai poly8x16_t vmvnq_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmvnq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint8x16_t vmvnq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmvnq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint32x4_t vmvnq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmvnq_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint16x8_t vmvnq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmvnq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int8x16_t vmvnq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmvnq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int32x4_t vmvnq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmvnq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int16x8_t vmvnq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmvn_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint8x8_t vmvn_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmvn_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint32x2_t vmvn_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmvn_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint16x4_t vmvn_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmvn_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int8x8_t vmvn_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmvn_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int32x2_t vmvn_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmvn_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int16x4_t vmvn_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vnegq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int8x16_t vnegq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vnegq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float32x4_t vnegq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vnegq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int32x4_t vnegq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vnegq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int16x8_t vnegq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vneg_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int8x8_t vneg_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vneg_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float32x2_t vneg_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vneg_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int32x2_t vneg_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vneg_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int16x4_t vneg_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vornq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vornq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vornq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vornq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vornq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vornq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vornq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vornq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vornq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vornq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vornq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vornq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vornq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vornq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vornq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vornq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vorn_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vorn_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vorn_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vorn_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vorn_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vorn_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vorn_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vorn_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vorn_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vorn_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vorn_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vorn_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vorn_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vorn_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vorn_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vorn_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vorrq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vorrq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vorrq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vorrq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vorrq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vorrq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vorrq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vorrq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vorrq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vorrq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vorrq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vorrq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vorrq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vorrq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vorrq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vorrq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vorr_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vorr_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vorr_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vorr_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vorr_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vorr_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vorr_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vorr_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vorr_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vorr_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vorr_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vorr_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vorr_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vorr_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vorr_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vorr_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpadalq_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpadalq_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vpadalq_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vpadalq_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpadalq_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpadalq_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpadalq_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpadalq_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vpadalq_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vpadalq_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpadalq_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpadalq_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpadal_u8(uint16x4_t __p0, uint8x8_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpadal_u8(uint16x4_t __p0, uint8x8_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vpadal_u32(uint64x1_t __p0, uint32x2_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vpadal_u32(uint64x1_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__rev1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpadal_u16(uint32x2_t __p0, uint16x4_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpadal_u16(uint32x2_t __p0, uint16x4_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpadal_s8(int16x4_t __p0, int8x8_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpadal_s8(int16x4_t __p0, int8x8_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vpadal_s32(int64x1_t __p0, int32x2_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vpadal_s32(int64x1_t __p0, int32x2_t __p1) {
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__rev1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpadal_s16(int32x2_t __p0, int16x4_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpadal_s16(int32x2_t __p0, int16x4_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vpadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vpadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vpadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vpadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpadd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpadd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpaddlq_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpaddlq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vpaddlq_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vpaddlq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpaddlq_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpaddlq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpaddlq_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpaddlq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vpaddlq_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vpaddlq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpaddlq_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpaddlq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpaddl_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpaddl_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vpaddl_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vpaddl_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpaddl_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpaddl_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpaddl_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpaddl_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vpaddl_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vpaddl_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpaddl_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpaddl_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vpmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vpmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vpmax_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vpmax_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpmax_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpmax_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpmax_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpmax_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpmax_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpmax_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vpmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vpmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vpmin_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vpmin_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpmin_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpmin_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpmin_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpmin_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpmin_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpmin_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqabsq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqabsq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqabsq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqabsq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqabsq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqabsq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqabs_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqabs_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqabs_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqabs_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqabs_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqabs_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqabs_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqabs_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqabs_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlal_s32(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlal_s32(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlal_s16(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlal_s16(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlsl_s32(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlsl_s32(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlsl_s16(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlsl_s16(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vqdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmulhq_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmulhq_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = vqdmulhq_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqdmulhq_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = vqdmulh_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqdmulh_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = vqdmulh_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqdmulh_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)(int32x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)(int32x4_t) {__p1, __p1, __p1, __p1}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)(int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1}, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)(int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1}, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)(int32x2_t) {__p1, __p1}, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vqdmull_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmull_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmull_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmull_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqmovn_u32(uint32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqmovn_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vqmovn_u32(uint32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqmovn_u64(uint64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqmovn_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vqmovn_u64(uint64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqmovn_u16(uint16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqmovn_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vqmovn_u16(uint16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqmovn_s32(int32x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqmovn_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqmovn_s32(int32x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqmovn_s64(int64x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqmovn_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqmovn_s64(int64x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqmovn_s16(int16x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqmovn_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vqmovn_s16(int16x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqmovun_s32(int32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqmovun_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovun_v((int8x16_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vqmovun_s32(int32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqmovun_s64(int64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqmovun_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovun_v((int8x16_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vqmovun_s64(int64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqmovun_s16(int16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqmovun_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovun_v((int8x16_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vqmovun_s16(int16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqnegq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqnegq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqnegq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqnegq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqnegq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqnegq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqneg_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqneg_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqneg_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqneg_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqneg_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqneg_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqneg_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqneg_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqneg_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqrdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqrdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqrdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vqrdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqrdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqrdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqrdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqrdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqrdmulhq_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqrdmulhq_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = vqrdmulhq_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqrdmulhq_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = vqrdmulh_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqrdmulh_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = vqrdmulh_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqrdmulh_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)(int32x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqrdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)(int32x4_t) {__p1, __p1, __p1, __p1}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)(int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1}, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqrdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)(int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1}, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqrdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)(int32x2_t) {__p1, __p1}, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqrdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vqrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vqrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vqrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vqrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vqrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vqrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqrshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqrshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqrshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqrshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqrshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqrshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqrshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqrshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqrshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vqshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vqshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vqshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vqshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vqshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vqshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vqshlq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vqshlq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vqshlq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vqshlq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 32); \
+  __ret; \
+})
+#else
+#define vqshlq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vqshlq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vqshlq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vqshlq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqshl_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqshl_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vqshl_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqshl_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vqshl_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vqshl_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vqshl_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vqshl_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vqshluq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vqshluq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vqshluq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vqshluq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlu_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqshlu_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshlu_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlu_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqshlu_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshlu_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlu_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vqshlu_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlu_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqshlu_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshlu_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vqshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vqshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vqshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrun_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrun_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrun_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vqsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vqsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vqsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vqsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqsubq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqsubq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vqsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vqsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vqsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqsub_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqsub_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vraddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vraddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vraddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vraddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vraddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vraddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vraddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vraddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vraddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vraddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vraddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vraddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vraddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vraddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vraddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vraddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vraddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vraddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrecpeq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrecpeq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrecpeq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrecpeq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrecpe_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrecpe_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrecpe_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrecpe_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrecpsq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrecpsq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrecps_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrecps_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrecps_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vrev16_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai poly8x8_t vrev16_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vrev16q_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  return __ret;
+}
+#else
+__ai poly8x16_t vrev16q_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrev16q_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrev16q_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrev16q_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  return __ret;
+}
+#else
+__ai int8x16_t vrev16q_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrev16_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrev16_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrev16_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai int8x8_t vrev16_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vrev32_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vrev32_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vrev32_p16(poly16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai poly16x4_t vrev32_p16(poly16x4_t __p0) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vrev32q_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  return __ret;
+}
+#else
+__ai poly8x16_t vrev32q_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vrev32q_p16(poly16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai poly16x8_t vrev32q_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrev32q_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrev32q_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrev32q_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai uint16x8_t vrev32q_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrev32q_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  return __ret;
+}
+#else
+__ai int8x16_t vrev32q_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrev32q_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai int16x8_t vrev32q_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrev32_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrev32_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrev32_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrev32_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrev32_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai int8x8_t vrev32_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrev32_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai int16x4_t vrev32_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vrev64_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vrev64_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vrev64_p16(poly16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vrev64_p16(poly16x4_t __p0) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vrev64q_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  return __ret;
+}
+#else
+__ai poly8x16_t vrev64q_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vrev64q_p16(poly16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai poly16x8_t vrev64q_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrev64q_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrev64q_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrev64q_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrev64q_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrev64q_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai uint16x8_t vrev64q_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrev64q_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  return __ret;
+}
+#else
+__ai int8x16_t vrev64q_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrev64q_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai float32x4_t vrev64q_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vrev64q_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai int32x4_t vrev64q_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrev64q_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai int16x8_t vrev64q_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrev64_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrev64_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrev64_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrev64_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrev64_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrev64_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrev64_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrev64_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrev64_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0);
+  return __ret;
+}
+#else
+__ai float32x2_t vrev64_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vrev64_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0);
+  return __ret;
+}
+#else
+__ai int32x2_t vrev64_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrev64_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai int16x4_t vrev64_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vrhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vrhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vrhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vrhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vrhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrhadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrhadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vrhadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vrhadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrhadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vrhadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vrshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vrshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vrshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vrshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vrshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vrshrq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vrshrq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vrshrq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vrshrq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 32); \
+  __ret; \
+})
+#else
+#define vrshrq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vrshrq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vrshrq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vrshrq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vrshr_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vrshr_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vrshr_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vrshr_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vrshr_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vrshr_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vrshr_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vrshr_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrsqrteq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrsqrteq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrsqrteq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrsqrteq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrsqrte_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrsqrte_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrsqrte_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrsqrte_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrsqrtsq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrsqrtsq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrsqrts_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrsqrts_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrsqrts_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vrsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vrsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vrsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vrsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vrsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vrsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vrsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vrsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vrsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vrsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vrsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vrsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vrsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vrsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vrsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vrsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vrsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vrsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vrsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vrsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vrsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vrsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vrsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vrsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vrsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vsetq_lane_f32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vsetq_lane_f32(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vsetq_lane_f32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#define __noswap_vset_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vset_lane_f32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vset_lane_f32(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vset_lane_f32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#define __noswap_vset_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vshlq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vshlq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vshlq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vshlq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 32); \
+  __ret; \
+})
+#else
+#define vshlq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vshlq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vshlq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vshlq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vshl_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vshl_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vshl_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vshl_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vshl_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vshl_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vshl_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vshl_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vshll_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 49); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vshll_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 51); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vshll_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 50); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vshll_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 33); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vshll_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 35); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vshll_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 34); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vshrq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vshrq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vshrq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vshrq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 32); \
+  __ret; \
+})
+#else
+#define vshrq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vshrq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vshrq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vshrq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vshr_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vshr_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vshr_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vshr_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vshr_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vshr_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vshr_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vshr_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
+  __ret; \
+})
+#else
+#define vsli_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
+  __ret; \
+})
+#else
+#define vsli_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
+  __ret; \
+})
+#else
+#define vsliq_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
+  __ret; \
+})
+#else
+#define vsliq_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vsliq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vsliq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vsliq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vsliq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vsliq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vsliq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vsliq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vsliq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vsli_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vsli_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vsli_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vsli_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vsli_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vsli_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vsli_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vsli_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
+  __ret; \
+})
+#else
+#define vsri_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
+  __ret; \
+})
+#else
+#define vsri_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
+  __ret; \
+})
+#else
+#define vsriq_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
+  __ret; \
+})
+#else
+#define vsriq_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vsriq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vsriq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vsriq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vsriq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vsriq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vsriq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vsriq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vsriq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vsri_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vsri_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vsri_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vsri_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vsri_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vsri_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vsri_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vsri_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 4); \
+})
+#else
+#define vst1_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 5); \
+})
+#else
+#define vst1_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 36); \
+})
+#else
+#define vst1q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 37); \
+})
+#else
+#define vst1q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 48); \
+})
+#else
+#define vst1q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 50); \
+})
+#else
+#define vst1q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 51); \
+})
+#else
+#define vst1q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 49); \
+})
+#else
+#define vst1q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 32); \
+})
+#else
+#define vst1q_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 41); \
+})
+#else
+#define vst1q_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 40); \
+})
+#else
+#define vst1q_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 34); \
+})
+#else
+#define vst1q_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 35); \
+})
+#else
+#define vst1q_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 33); \
+})
+#else
+#define vst1q_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 16); \
+})
+#else
+#define vst1_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 18); \
+})
+#else
+#define vst1_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 19); \
+})
+#else
+#define vst1_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 17); \
+})
+#else
+#define vst1_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 0); \
+})
+#else
+#define vst1_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 9); \
+})
+#else
+#define vst1_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 8); \
+})
+#else
+#define vst1_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 2); \
+})
+#else
+#define vst1_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 3); \
+})
+#else
+#define vst1_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 1); \
+})
+#else
+#define vst1_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 4); \
+})
+#else
+#define vst1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 5); \
+})
+#else
+#define vst1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 36); \
+})
+#else
+#define vst1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 37); \
+})
+#else
+#define vst1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 48); \
+})
+#else
+#define vst1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 50); \
+})
+#else
+#define vst1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 51); \
+})
+#else
+#define vst1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 49); \
+})
+#else
+#define vst1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 32); \
+})
+#else
+#define vst1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 41); \
+})
+#else
+#define vst1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 40); \
+})
+#else
+#define vst1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 34); \
+})
+#else
+#define vst1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 35); \
+})
+#else
+#define vst1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 33); \
+})
+#else
+#define vst1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 16); \
+})
+#else
+#define vst1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 18); \
+})
+#else
+#define vst1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
+})
+#else
+#define vst1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 17); \
+})
+#else
+#define vst1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 0); \
+})
+#else
+#define vst1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 9); \
+})
+#else
+#define vst1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 8); \
+})
+#else
+#define vst1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 2); \
+})
+#else
+#define vst1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
+})
+#else
+#define vst1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 1); \
+})
+#else
+#define vst1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
+})
+#else
+#define vst2_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \
+})
+#else
+#define vst2_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \
+})
+#else
+#define vst2q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \
+})
+#else
+#define vst2q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \
+})
+#else
+#define vst2q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \
+})
+#else
+#define vst2q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \
+})
+#else
+#define vst2q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \
+})
+#else
+#define vst2q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 41); \
+})
+#else
+#define vst2q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 40); \
+})
+#else
+#define vst2q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 34); \
+})
+#else
+#define vst2q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 33); \
+})
+#else
+#define vst2q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \
+})
+#else
+#define vst2_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \
+})
+#else
+#define vst2_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#else
+#define vst2_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \
+})
+#else
+#define vst2_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_s8(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \
+})
+#else
+#define vst2_s8(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_f32(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 9); \
+})
+#else
+#define vst2_f32(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_v(__p0, __rev1.val[0], __rev1.val[1], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_f16(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 8); \
+})
+#else
+#define vst2_f16(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, __rev1.val[0], __rev1.val[1], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_s32(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 2); \
+})
+#else
+#define vst2_s32(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_v(__p0, __rev1.val[0], __rev1.val[1], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_s64(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#else
+#define vst2_s64(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_s16(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 1); \
+})
+#else
+#define vst2_s16(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, __rev1.val[0], __rev1.val[1], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 4); \
+})
+#else
+#define vst2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 5); \
+})
+#else
+#define vst2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 37); \
+})
+#else
+#define vst2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 50); \
+})
+#else
+#define vst2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 49); \
+})
+#else
+#define vst2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 41); \
+})
+#else
+#define vst2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 40); \
+})
+#else
+#define vst2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 34); \
+})
+#else
+#define vst2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 33); \
+})
+#else
+#define vst2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 16); \
+})
+#else
+#define vst2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 18); \
+})
+#else
+#define vst2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 17); \
+})
+#else
+#define vst2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 0); \
+})
+#else
+#define vst2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 9); \
+})
+#else
+#define vst2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 8); \
+})
+#else
+#define vst2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 2); \
+})
+#else
+#define vst2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 1); \
+})
+#else
+#define vst2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \
+})
+#else
+#define vst3_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \
+})
+#else
+#define vst3_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \
+})
+#else
+#define vst3q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \
+})
+#else
+#define vst3q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \
+})
+#else
+#define vst3q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \
+})
+#else
+#define vst3q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \
+})
+#else
+#define vst3q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \
+})
+#else
+#define vst3q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 41); \
+})
+#else
+#define vst3q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 40); \
+})
+#else
+#define vst3q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 34); \
+})
+#else
+#define vst3q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 33); \
+})
+#else
+#define vst3q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \
+})
+#else
+#define vst3_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \
+})
+#else
+#define vst3_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#else
+#define vst3_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \
+})
+#else
+#define vst3_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_s8(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \
+})
+#else
+#define vst3_s8(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_f32(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 9); \
+})
+#else
+#define vst3_f32(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_f16(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 8); \
+})
+#else
+#define vst3_f16(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_s32(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 2); \
+})
+#else
+#define vst3_s32(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_s64(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#else
+#define vst3_s64(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_s16(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 1); \
+})
+#else
+#define vst3_s16(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 4); \
+})
+#else
+#define vst3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 5); \
+})
+#else
+#define vst3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 37); \
+})
+#else
+#define vst3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 50); \
+})
+#else
+#define vst3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 49); \
+})
+#else
+#define vst3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 41); \
+})
+#else
+#define vst3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 40); \
+})
+#else
+#define vst3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 34); \
+})
+#else
+#define vst3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 33); \
+})
+#else
+#define vst3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 16); \
+})
+#else
+#define vst3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 18); \
+})
+#else
+#define vst3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 17); \
+})
+#else
+#define vst3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 0); \
+})
+#else
+#define vst3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 9); \
+})
+#else
+#define vst3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 8); \
+})
+#else
+#define vst3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 2); \
+})
+#else
+#define vst3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 1); \
+})
+#else
+#define vst3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \
+})
+#else
+#define vst4_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \
+})
+#else
+#define vst4_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \
+})
+#else
+#define vst4q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \
+})
+#else
+#define vst4q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \
+})
+#else
+#define vst4q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \
+})
+#else
+#define vst4q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \
+})
+#else
+#define vst4q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \
+})
+#else
+#define vst4q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 41); \
+})
+#else
+#define vst4q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 40); \
+})
+#else
+#define vst4q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 34); \
+})
+#else
+#define vst4q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 33); \
+})
+#else
+#define vst4q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \
+})
+#else
+#define vst4_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \
+})
+#else
+#define vst4_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#else
+#define vst4_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \
+})
+#else
+#define vst4_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_s8(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \
+})
+#else
+#define vst4_s8(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_f32(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 9); \
+})
+#else
+#define vst4_f32(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_f16(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 8); \
+})
+#else
+#define vst4_f16(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_s32(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 2); \
+})
+#else
+#define vst4_s32(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_s64(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#else
+#define vst4_s64(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_s16(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 1); \
+})
+#else
+#define vst4_s16(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 4); \
+})
+#else
+#define vst4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 5); \
+})
+#else
+#define vst4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 37); \
+})
+#else
+#define vst4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 50); \
+})
+#else
+#define vst4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 49); \
+})
+#else
+#define vst4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 41); \
+})
+#else
+#define vst4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 40); \
+})
+#else
+#define vst4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 34); \
+})
+#else
+#define vst4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 33); \
+})
+#else
+#define vst4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 16); \
+})
+#else
+#define vst4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 18); \
+})
+#else
+#define vst4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 17); \
+})
+#else
+#define vst4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 0); \
+})
+#else
+#define vst4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 9); \
+})
+#else
+#define vst4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 8); \
+})
+#else
+#define vst4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 2); \
+})
+#else
+#define vst4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 1); \
+})
+#else
+#define vst4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vsubq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float32x4_t vsubq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vsubq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vsub_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float32x2_t vsub_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vsub_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vsub_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmovl_u8(__p0) - vmovl_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmovl_u8(__rev0) - __noswap_vmovl_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmovl_u32(__p0) - vmovl_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmovl_u32(__rev0) - __noswap_vmovl_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmovl_u16(__p0) - vmovl_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmovl_u16(__rev0) - __noswap_vmovl_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = vmovl_s8(__p0) - vmovl_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vsubl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmovl_s8(__rev0) - __noswap_vmovl_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = vmovl_s32(__p0) - vmovl_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vsubl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmovl_s32(__rev0) - __noswap_vmovl_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = vmovl_s16(__p0) - vmovl_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vsubl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmovl_s16(__rev0) - __noswap_vmovl_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubw_u8(uint16x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 - vmovl_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubw_u8(uint16x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __noswap_vmovl_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubw_u32(uint64x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 - vmovl_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubw_u32(uint64x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __noswap_vmovl_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubw_u16(uint32x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 - vmovl_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubw_u16(uint32x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __noswap_vmovl_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubw_s8(int16x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 - vmovl_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vsubw_s8(int16x8_t __p0, int8x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __noswap_vmovl_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubw_s32(int64x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 - vmovl_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vsubw_s32(int64x2_t __p0, int32x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __noswap_vmovl_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubw_s16(int32x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 - vmovl_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vsubw_s16(int32x4_t __p0, int16x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __noswap_vmovl_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbl1_p8(poly8x8_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbl1_p8(poly8x8_t __p0, uint8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbl1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbl1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbl1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbl1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbl2_p8(poly8x8x2_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbl2_p8(poly8x8x2_t __p0, uint8x8_t __p1) {
+  poly8x8x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbl2_u8(uint8x8x2_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbl2_u8(uint8x8x2_t __p0, uint8x8_t __p1) {
+  uint8x8x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbl2_s8(int8x8x2_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbl2_s8(int8x8x2_t __p0, int8x8_t __p1) {
+  int8x8x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbl3_p8(poly8x8x3_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbl3_p8(poly8x8x3_t __p0, uint8x8_t __p1) {
+  poly8x8x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbl3_u8(uint8x8x3_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbl3_u8(uint8x8x3_t __p0, uint8x8_t __p1) {
+  uint8x8x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbl3_s8(int8x8x3_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbl3_s8(int8x8x3_t __p0, int8x8_t __p1) {
+  int8x8x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbl4_p8(poly8x8x4_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p0.val[3], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbl4_p8(poly8x8x4_t __p0, uint8x8_t __p1) {
+  poly8x8x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev0.val[3], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbl4_u8(uint8x8x4_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p0.val[3], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbl4_u8(uint8x8x4_t __p0, uint8x8_t __p1) {
+  uint8x8x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev0.val[3], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbl4_s8(int8x8x4_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p0.val[3], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbl4_s8(int8x8x4_t __p0, int8x8_t __p1) {
+  int8x8x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev0.val[3], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbx1_p8(poly8x8_t __p0, poly8x8_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbx1_p8(poly8x8_t __p0, poly8x8_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbx1_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbx1_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbx1_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbx1_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbx2_p8(poly8x8_t __p0, poly8x8x2_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbx2_p8(poly8x8_t __p0, poly8x8x2_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbx2_u8(uint8x8_t __p0, uint8x8x2_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbx2_u8(uint8x8_t __p0, uint8x8x2_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbx2_s8(int8x8_t __p0, int8x8x2_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbx2_s8(int8x8_t __p0, int8x8x2_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbx3_p8(poly8x8_t __p0, poly8x8x3_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbx3_p8(poly8x8_t __p0, poly8x8x3_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbx3_u8(uint8x8_t __p0, uint8x8x3_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbx3_u8(uint8x8_t __p0, uint8x8x3_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbx3_s8(int8x8_t __p0, int8x8x3_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbx3_s8(int8x8_t __p0, int8x8x3_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbx4_p8(poly8x8_t __p0, poly8x8x4_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p1.val[3], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbx4_p8(poly8x8_t __p0, poly8x8x4_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbx4_u8(uint8x8_t __p0, uint8x8x4_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p1.val[3], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbx4_u8(uint8x8_t __p0, uint8x8x4_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbx4_s8(int8x8_t __p0, int8x8x4_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p1.val[3], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbx4_s8(int8x8_t __p0, int8x8x4_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8x2_t vtrn_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8x2_t vtrn_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4x2_t vtrn_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 5);
+  return __ret;
+}
+#else
+__ai poly16x4x2_t vtrn_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 5);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16x2_t vtrnq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16x2_t vtrnq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8x2_t vtrnq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 37);
+  return __ret;
+}
+#else
+__ai poly16x8x2_t vtrnq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 37);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16x2_t vtrnq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16x2_t vtrnq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4x2_t vtrnq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4x2_t vtrnq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8x2_t vtrnq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8x2_t vtrnq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16x2_t vtrnq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16x2_t vtrnq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4x2_t vtrnq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4x2_t vtrnq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4x2_t vtrnq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4x2_t vtrnq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8x2_t vtrnq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8x2_t vtrnq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8x2_t vtrn_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8x2_t vtrn_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2x2_t vtrn_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2x2_t vtrn_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4x2_t vtrn_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4x2_t vtrn_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8x2_t vtrn_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8x2_t vtrn_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2x2_t vtrn_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2x2_t vtrn_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2x2_t vtrn_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2x2_t vtrn_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4x2_t vtrn_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4x2_t vtrn_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtst_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtst_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtst_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtst_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtstq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtstq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtstq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtstq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtstq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtstq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vtstq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vtstq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtstq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtstq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtstq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtstq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vtstq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vtstq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtstq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtstq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtst_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtst_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vtst_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vtst_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtst_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtst_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtst_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtst_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vtst_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vtst_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtst_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtst_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8x2_t vuzp_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8x2_t vuzp_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4x2_t vuzp_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 5);
+  return __ret;
+}
+#else
+__ai poly16x4x2_t vuzp_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 5);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16x2_t vuzpq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16x2_t vuzpq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8x2_t vuzpq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 37);
+  return __ret;
+}
+#else
+__ai poly16x8x2_t vuzpq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 37);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16x2_t vuzpq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16x2_t vuzpq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4x2_t vuzpq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4x2_t vuzpq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8x2_t vuzpq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8x2_t vuzpq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16x2_t vuzpq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16x2_t vuzpq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4x2_t vuzpq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4x2_t vuzpq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4x2_t vuzpq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4x2_t vuzpq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8x2_t vuzpq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8x2_t vuzpq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8x2_t vuzp_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8x2_t vuzp_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2x2_t vuzp_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2x2_t vuzp_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4x2_t vuzp_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4x2_t vuzp_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8x2_t vuzp_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8x2_t vuzp_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2x2_t vuzp_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2x2_t vuzp_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2x2_t vuzp_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2x2_t vuzp_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4x2_t vuzp_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4x2_t vuzp_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8x2_t vzip_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8x2_t vzip_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4x2_t vzip_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 5);
+  return __ret;
+}
+#else
+__ai poly16x4x2_t vzip_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 5);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16x2_t vzipq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16x2_t vzipq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8x2_t vzipq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 37);
+  return __ret;
+}
+#else
+__ai poly16x8x2_t vzipq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 37);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16x2_t vzipq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16x2_t vzipq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4x2_t vzipq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4x2_t vzipq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8x2_t vzipq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8x2_t vzipq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16x2_t vzipq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16x2_t vzipq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4x2_t vzipq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4x2_t vzipq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4x2_t vzipq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4x2_t vzipq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8x2_t vzipq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8x2_t vzipq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8x2_t vzip_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8x2_t vzip_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2x2_t vzip_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2x2_t vzip_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4x2_t vzip_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4x2_t vzip_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8x2_t vzip_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8x2_t vzip_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2x2_t vzip_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2x2_t vzip_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2x2_t vzip_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2x2_t vzip_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4x2_t vzip_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4x2_t vzip_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#if !defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#endif
+#if (__ARM_FP & 2)
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vcvt_f16_f32(float32x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vcvt_f16_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x4_t __noswap_vcvt_f16_f32(float32x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 8);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvt_f32_f16(float16x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vcvt_f32_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float32x4_t __noswap_vcvt_f32_f16(float16x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 41);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtaq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtaq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtaq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtaq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvta_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvta_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvta_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvta_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtaq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtaq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtaq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtaq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvta_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvta_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvta_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvta_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtmq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtmq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtmq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtmq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvtm_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtm_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvtm_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtm_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtmq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtmq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtmq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtmq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvtm_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtm_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvtm_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtm_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtnq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtnq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtnq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtnq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvtn_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtn_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvtn_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtn_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtnq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtnq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtnq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtnq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvtn_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtn_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvtn_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtn_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtpq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtpq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtpq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtpq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvtp_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtp_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvtp_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtp_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtpq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtpq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtpq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtpq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvtp_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtp_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvtp_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtp_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrnd_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrnd_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrnd_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndaq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndaq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrnda_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrnda_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrnda_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndmq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndmq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndm_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndm_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndm_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndnq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndnq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndn_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndn_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndn_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndpq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndpq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndp_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndp_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndp_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndxq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndxq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndx_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndx_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndx_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vminnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vminnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtaq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtaq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtaq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtaq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvta_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvta_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvta_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvta_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtaq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtaq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtaq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtaq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvta_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvta_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvta_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvta_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtmq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtmq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtmq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtmq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvtm_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtm_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvtm_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtm_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtmq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtmq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtmq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtmq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvtm_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtm_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvtm_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtm_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtnq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtnq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtnq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtnq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvtn_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtn_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvtn_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtn_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtnq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtnq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtnq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtnq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvtn_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtn_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvtn_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtn_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtpq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtpq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtpq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtpq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvtp_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtp_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvtp_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtp_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtpq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtpq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtpq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtpq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvtp_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtp_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvtp_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtp_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_p64(poly64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_p64(poly64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f64(float64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f64(float64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_p8(poly8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_p8(poly8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_p16(poly16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_p16(poly16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_u8(uint8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_u8(uint8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_u32(uint32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_u32(uint32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_u64(uint64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_u64(uint64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_u16(uint16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_u16(uint16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_s8(int8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_s8(int8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_f64(float64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_f64(float64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_f32(float32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_f32(float32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_f16(float16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_f16(float16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_s32(int32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_s32(int32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_s64(int64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_s64(int64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_s16(int16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_s16(int16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_p64(poly64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_p64(poly64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f64(float64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f64(float64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_p128(poly128_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_p128(poly128_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_p64(poly64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_p64(poly64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f64(float64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f64(float64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_p8(poly8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_p8(poly8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_p64(poly64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_p64(poly64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_p16(poly16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_p16(poly16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_u8(uint8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_u8(uint8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_u32(uint32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_u32(uint32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_u64(uint64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_u64(uint64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_u16(uint16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_u16(uint16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_s8(int8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_s8(int8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_f64(float64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_f64(float64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_f32(float32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_f32(float32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_f16(float16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_f16(float16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_s32(int32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_s32(int32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_s64(int64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_s64(int64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_s16(int16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_s16(int16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_p8(poly8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_p8(poly8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_p128(poly128_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_p128(poly128_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_p16(poly16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_p16(poly16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_u8(uint8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_u8(uint8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_u32(uint32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_u32(uint32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_u64(uint64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_u64(uint64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_u16(uint16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_u16(uint16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_s8(int8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_s8(int8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_f64(float64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_f64(float64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_f32(float32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_f32(float32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_f16(float16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_f16(float16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_s32(int32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_s32(int32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_s64(int64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_s64(int64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_s16(int16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_s16(int16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_p128(poly128_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_p128(poly128_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_p64(poly64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_p64(poly64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f64(float64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f64(float64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p128(poly128_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p128(poly128_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p64(poly64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p64(poly64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f64(float64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f64(float64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p128(poly128_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p128(poly128_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p64(poly64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p64(poly64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f64(float64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f64(float64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p128(poly128_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p128(poly128_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p64(poly64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p64(poly64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p128(poly128_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p128(poly128_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p64(poly64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p64(poly64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f64(float64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f64(float64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p128(poly128_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p128(poly128_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p64(poly64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p64(poly64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f64(float64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f64(float64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_p8(poly8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_p8(poly8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_p128(poly128_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_p128(poly128_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_p64(poly64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_p64(poly64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_p16(poly16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_p16(poly16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_u8(uint8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_u8(uint8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_u32(uint32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_u32(uint32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_u64(uint64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_u64(uint64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_u16(uint16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_u16(uint16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_s8(int8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_s8(int8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_f32(float32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_f32(float32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_f16(float16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_f16(float16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_s32(int32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_s32(int32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_s64(int64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_s64(int64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_s16(int16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_s16(int16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p128(poly128_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p128(poly128_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p64(poly64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p64(poly64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_f64(float64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_f64(float64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p128(poly128_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p128(poly128_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p64(poly64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p64(poly64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_f64(float64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_f64(float64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p128(poly128_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p128(poly128_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p64(poly64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p64(poly64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f64(float64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f64(float64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p128(poly128_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p128(poly128_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p64(poly64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p64(poly64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p128(poly128_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p128(poly128_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p64(poly64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p64(poly64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f64(float64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f64(float64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p64(poly64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p64(poly64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f64(float64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f64(float64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p64(poly64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p64(poly64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f64(float64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f64(float64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p64(poly64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p64(poly64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p64(poly64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p64(poly64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f64(float64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f64(float64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p64(poly64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p64(poly64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f64(float64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f64(float64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_p8(poly8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_p8(poly8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_p64(poly64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_p64(poly64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_p16(poly16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_p16(poly16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_u8(uint8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_u8(uint8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_u32(uint32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_u32(uint32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_u64(uint64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_u64(uint64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_u16(uint16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_u16(uint16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_s8(int8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_s8(int8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_f32(float32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_f32(float32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_f16(float16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_f16(float16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_s32(int32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_s32(int32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_s64(int64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_s64(int64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_s16(int16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_s16(int16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p64(poly64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p64(poly64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_f64(float64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_f64(float64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p64(poly64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p64(poly64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_f64(float64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_f64(float64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p64(poly64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p64(poly64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f64(float64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f64(float64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p64(poly64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p64(poly64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p64(poly64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p64(poly64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f64(float64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f64(float64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__aarch64__) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrnd_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrnd_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndaq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndaq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrnda_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrnda_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndiq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndiq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndiq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndiq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndi_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndi_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndi_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndi_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndmq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndmq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndm_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndm_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndnq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndnq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndn_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndn_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndpq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndpq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndp_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndp_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndxq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndxq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndx_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndx_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__aarch64__) && defined(__ARM_FEATURE_NUMERIC_MAXMIN)
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmaxnm_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vmaxnm_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vminnm_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vminnm_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_FEATURE_CRYPTO
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vsha1h_u32(uint32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vsha1h_u32(uint32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if defined(__ARM_FEATURE_FMA)
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vfmaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vfmaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float32x4_t __noswap_vfmaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vfma_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = vfmaq_f32(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float32x4_t vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vfmaq_f32(__rev0, -__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = vfma_f32(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = __noswap_vfma_f32(__rev0, -__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if defined(__ARM_FEATURE_QRDMX)
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrdmlahq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = vqaddq_s32(__p0, vqrdmulhq_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vqrdmlahq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqaddq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrdmlahq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = vqaddq_s16(__p0, vqrdmulhq_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vqrdmlahq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vqaddq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrdmlah_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = vqadd_s32(__p0, vqrdmulh_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x2_t vqrdmlah_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __noswap_vqadd_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrdmlah_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = vqadd_s16(__p0, vqrdmulh_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x4_t vqrdmlah_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __noswap_vqadd_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqaddq_s32(__s0, vqrdmulhq_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlahq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqaddq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = vqaddq_s16(__s0, vqrdmulhq_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlahq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqaddq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlah_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = vqadd_s32(__s0, vqrdmulh_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlah_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqadd_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlah_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = vqadd_s16(__s0, vqrdmulh_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlah_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqadd_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrdmlshq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = vqsubq_s32(__p0, vqrdmulhq_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vqrdmlshq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqsubq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrdmlshq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = vqsubq_s16(__p0, vqrdmulhq_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vqrdmlshq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vqsubq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrdmlsh_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = vqsub_s32(__p0, vqrdmulh_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x2_t vqrdmlsh_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __noswap_vqsub_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrdmlsh_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = vqsub_s16(__p0, vqrdmulh_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x4_t vqrdmlsh_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __noswap_vqsub_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqsubq_s32(__s0, vqrdmulhq_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlshq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqsubq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = vqsubq_s16(__s0, vqrdmulhq_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlshq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqsubq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlsh_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = vqsub_s32(__s0, vqrdmulh_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlsh_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqsub_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlsh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = vqsub_s16(__s0, vqrdmulh_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlsh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqsub_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#endif
+#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqaddq_s32(__s0, vqrdmulhq_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlahq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqaddq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = vqaddq_s16(__s0, vqrdmulhq_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlahq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqaddq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlah_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = vqadd_s32(__s0, vqrdmulh_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlah_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqadd_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlah_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = vqadd_s16(__s0, vqrdmulh_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlah_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqadd_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqsubq_s32(__s0, vqrdmulhq_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlshq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqsubq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = vqsubq_s16(__s0, vqrdmulhq_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlshq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqsubq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlsh_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = vqsub_s32(__s0, vqrdmulh_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlsh_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqsub_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlsh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = vqsub_s16(__s0, vqrdmulh_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlsh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqsub_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#endif
+#if defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vabdq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vabdq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vabd_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vabd_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vabdd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vabdd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64_t vabdd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vabdd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vabds_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vabds_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float32_t vabds_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vabds_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vabsq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vabsq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabsq_s64(int64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vabsq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vabs_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vabs_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vabs_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vabs_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vabsd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vabsd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vabsd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vabsd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vaddq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float64x2_t vaddq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vadd_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float64x1_t vadd_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vaddhn_u32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vaddhn_u32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vaddhn_u64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vaddhn_u64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vaddhn_u16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint8x16_t vaddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vaddhn_u16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vaddhn_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vaddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vaddhn_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vaddhn_s64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vaddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vaddhn_s64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vaddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vaddhn_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int8x16_t vaddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vaddhn_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vaddlvq_u8(uint8x16_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddlvq_u8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vaddlvq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddlvq_u8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vaddlvq_u32(uint32x4_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddlvq_u32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vaddlvq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddlvq_u32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vaddlvq_u16(uint16x8_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddlvq_u16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vaddlvq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddlvq_u16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vaddlvq_s8(int8x16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddlvq_s8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vaddlvq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddlvq_s8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vaddlvq_s32(int32x4_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddlvq_s32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int64_t vaddlvq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddlvq_s32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vaddlvq_s16(int16x8_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddlvq_s16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vaddlvq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddlvq_s16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vaddlv_u8(uint8x8_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddlv_u8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vaddlv_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddlv_u8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vaddlv_u32(uint32x2_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddlv_u32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vaddlv_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddlv_u32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vaddlv_u16(uint16x4_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddlv_u16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vaddlv_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddlv_u16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vaddlv_s8(int8x8_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddlv_s8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vaddlv_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddlv_s8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vaddlv_s32(int32x2_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddlv_s32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int64_t vaddlv_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddlv_s32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vaddlv_s16(int16x4_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddlv_s16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vaddlv_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddlv_s16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vaddvq_u8(uint8x16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vaddvq_u8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vaddvq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vaddvq_u8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vaddvq_u32(uint32x4_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddvq_u32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vaddvq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddvq_u32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vaddvq_u64(uint64x2_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddvq_u64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vaddvq_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddvq_u64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vaddvq_u16(uint16x8_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddvq_u16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vaddvq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddvq_u16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vaddvq_s8(int8x16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vaddvq_s8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vaddvq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vaddvq_s8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vaddvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vaddvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vaddvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vaddvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vaddvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vaddvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vaddvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vaddvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vaddvq_s32(int32x4_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddvq_s32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vaddvq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddvq_s32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vaddvq_s64(int64x2_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddvq_s64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int64_t vaddvq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddvq_s64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vaddvq_s16(int16x8_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddvq_s16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vaddvq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddvq_s16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vaddv_u8(uint8x8_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vaddv_u8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vaddv_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vaddv_u8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vaddv_u32(uint32x2_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddv_u32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vaddv_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddv_u32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vaddv_u16(uint16x4_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddv_u16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vaddv_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddv_u16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vaddv_s8(int8x8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vaddv_s8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vaddv_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vaddv_s8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vaddv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vaddv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vaddv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vaddv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vaddv_s32(int32x2_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddv_s32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vaddv_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddv_s32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vaddv_s16(int16x4_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddv_s16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vaddv_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddv_s16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vbsl_p64(uint64x1_t __p0, poly64x1_t __p1, poly64x1_t __p2) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 6);
+  return __ret;
+}
+#else
+__ai poly64x1_t vbsl_p64(uint64x1_t __p0, poly64x1_t __p1, poly64x1_t __p2) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 6);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vbslq_p64(uint64x2_t __p0, poly64x2_t __p1, poly64x2_t __p2) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 38);
+  return __ret;
+}
+#else
+__ai poly64x2_t vbslq_p64(uint64x2_t __p0, poly64x2_t __p1, poly64x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 38);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vbslq_f64(uint64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vbslq_f64(uint64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vbsl_f64(uint64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vbsl_f64(uint64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcageq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcageq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcageq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcageq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcage_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcage_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcaged_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaged_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcaged_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaged_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcages_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcages_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcages_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcages_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcagtq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcagtq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcagtq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcagtq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcagt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcagt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcagtd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcagtd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcagtd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcagtd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcagts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcagts_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcagts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcagts_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcaleq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcaleq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcaleq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcaleq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcale_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcale_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcaled_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaled_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcaled_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaled_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcales_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcales_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcales_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcales_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcaltq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcaltq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcaltq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcaltq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcalt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcalt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcaltd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaltd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcaltd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaltd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcalts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcalts_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcalts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcalts_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceq_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceq_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqq_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqq_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceq_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceq_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceq_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceq_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceq_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceq_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vceqd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vceqd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vceqd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vceqd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vceqd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vceqd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vceqd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vceqd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vceqs_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vceqs_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vceqs_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vceqs_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceqz_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceqz_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceqz_p64(poly64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceqz_p64(poly64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceqz_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceqz_p16(poly16x4_t __p0) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqzq_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqzq_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqzq_p64(poly64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqzq_p64(poly64x2_t __p0) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqzq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqzq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqzq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqzq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqzq_u64(uint64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqzq_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqzq_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqzq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqzq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqzq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqzq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqzq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqzq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqzq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqzq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqzq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqzq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqzq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqzq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqzq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceqz_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceqz_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceqz_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceqz_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceqz_u64(uint64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceqz_u64(uint64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceqz_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceqz_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceqz_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceqz_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceqz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceqz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceqz_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceqz_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceqz_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceqz_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceqz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceqz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceqz_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceqz_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vceqzd_u64(uint64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqzd_u64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vceqzd_u64(uint64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqzd_u64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vceqzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vceqzd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vceqzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vceqzd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vceqzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqzd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vceqzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqzd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vceqzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vceqzs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vceqzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vceqzs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgeq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgeq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgeq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgeq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgeq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgeq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcge_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcge_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcge_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcge_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcge_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcge_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcged_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcged_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vcged_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcged_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcged_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcged_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcged_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcged_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcged_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcged_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcged_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcged_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcges_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcges_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcges_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcges_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgezq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgezq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgezq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgezq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgezq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgezq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgezq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgezq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgezq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgezq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgezq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgezq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcgez_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcgez_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgez_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgez_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgez_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgez_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgez_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgez_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgez_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgez_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgez_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgez_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcgezd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgezd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcgezd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgezd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcgezd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgezd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcgezd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgezd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcgezs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgezs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcgezs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgezs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgt_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgt_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgt_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgt_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcgtd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgtd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vcgtd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgtd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcgtd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcgtd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcgtd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcgtd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcgts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgts_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcgts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgts_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgtzq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgtzq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtzq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtzq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtzq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtzq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtzq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtzq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtzq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtzq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgtzq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgtzq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcgtz_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcgtz_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgtz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgtz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgtz_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgtz_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgtz_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgtz_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgtz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgtz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgtz_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgtz_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcgtzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgtzd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcgtzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgtzd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcgtzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtzd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcgtzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtzd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcgtzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgtzs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcgtzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgtzs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcleq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcleq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcleq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcleq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcleq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcleq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcle_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcle_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcle_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcle_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcle_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcle_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcled_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcled_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcled_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcled_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcled_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcled_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vcled_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcled_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcled_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcled_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcled_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcled_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcles_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcles_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcles_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcles_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vclezq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vclezq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vclezq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vclezq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vclezq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vclezq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vclezq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vclezq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vclezq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vclezq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vclezq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vclezq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vclez_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vclez_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vclez_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclez_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclez_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclez_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclez_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclez_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclez_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclez_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclez_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclez_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclez_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vclezd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vclezd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vclezd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vclezd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vclezd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vclezd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vclezd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vclezd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vclezs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vclezs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vclezs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vclezs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclt_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclt_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclt_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclt_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcltd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcltd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcltd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcltd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vcltd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcltd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcltd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcltd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vclts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vclts_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vclts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vclts_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcltzq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcltzq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltzq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltzq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltzq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltzq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltzq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltzq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltzq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltzq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcltzq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcltzq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcltz_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcltz_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcltz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcltz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcltz_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcltz_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcltz_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcltz_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcltz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcltz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcltz_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcltz_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcltzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcltzd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcltzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcltzd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcltzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltzd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcltzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltzd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcltzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcltzs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcltzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcltzs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vcombine_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  return __ret;
+}
+#else
+__ai poly64x2_t vcombine_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcombine_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  return __ret;
+}
+#else
+__ai float64x2_t vcombine_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_p8(__p0_0, __p1_0, __p2_0, __p3_0) __extension__ ({ \
+  poly8x16_t __s0_0 = __p0_0; \
+  poly8x8_t __s2_0 = __p2_0; \
+  poly8x16_t __ret_0; \
+  __ret_0 = vsetq_lane_p8(vget_lane_p8(__s2_0, __p3_0), __s0_0, __p1_0); \
+  __ret_0; \
+})
+#else
+#define vcopyq_lane_p8(__p0_1, __p1_1, __p2_1, __p3_1) __extension__ ({ \
+  poly8x16_t __s0_1 = __p0_1; \
+  poly8x8_t __s2_1 = __p2_1; \
+  poly8x16_t __rev0_1;  __rev0_1 = __builtin_shufflevector(__s0_1, __s0_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev2_1;  __rev2_1 = __builtin_shufflevector(__s2_1, __s2_1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret_1; \
+  __ret_1 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_1, __p3_1), __rev0_1, __p1_1); \
+  __ret_1 = __builtin_shufflevector(__ret_1, __ret_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_1; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_p16(__p0_2, __p1_2, __p2_2, __p3_2) __extension__ ({ \
+  poly16x8_t __s0_2 = __p0_2; \
+  poly16x4_t __s2_2 = __p2_2; \
+  poly16x8_t __ret_2; \
+  __ret_2 = vsetq_lane_p16(vget_lane_p16(__s2_2, __p3_2), __s0_2, __p1_2); \
+  __ret_2; \
+})
+#else
+#define vcopyq_lane_p16(__p0_3, __p1_3, __p2_3, __p3_3) __extension__ ({ \
+  poly16x8_t __s0_3 = __p0_3; \
+  poly16x4_t __s2_3 = __p2_3; \
+  poly16x8_t __rev0_3;  __rev0_3 = __builtin_shufflevector(__s0_3, __s0_3, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x4_t __rev2_3;  __rev2_3 = __builtin_shufflevector(__s2_3, __s2_3, 3, 2, 1, 0); \
+  poly16x8_t __ret_3; \
+  __ret_3 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_3, __p3_3), __rev0_3, __p1_3); \
+  __ret_3 = __builtin_shufflevector(__ret_3, __ret_3, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_3; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_u8(__p0_4, __p1_4, __p2_4, __p3_4) __extension__ ({ \
+  uint8x16_t __s0_4 = __p0_4; \
+  uint8x8_t __s2_4 = __p2_4; \
+  uint8x16_t __ret_4; \
+  __ret_4 = vsetq_lane_u8(vget_lane_u8(__s2_4, __p3_4), __s0_4, __p1_4); \
+  __ret_4; \
+})
+#else
+#define vcopyq_lane_u8(__p0_5, __p1_5, __p2_5, __p3_5) __extension__ ({ \
+  uint8x16_t __s0_5 = __p0_5; \
+  uint8x8_t __s2_5 = __p2_5; \
+  uint8x16_t __rev0_5;  __rev0_5 = __builtin_shufflevector(__s0_5, __s0_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev2_5;  __rev2_5 = __builtin_shufflevector(__s2_5, __s2_5, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_5; \
+  __ret_5 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_5, __p3_5), __rev0_5, __p1_5); \
+  __ret_5 = __builtin_shufflevector(__ret_5, __ret_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_5; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_u32(__p0_6, __p1_6, __p2_6, __p3_6) __extension__ ({ \
+  uint32x4_t __s0_6 = __p0_6; \
+  uint32x2_t __s2_6 = __p2_6; \
+  uint32x4_t __ret_6; \
+  __ret_6 = vsetq_lane_u32(vget_lane_u32(__s2_6, __p3_6), __s0_6, __p1_6); \
+  __ret_6; \
+})
+#else
+#define vcopyq_lane_u32(__p0_7, __p1_7, __p2_7, __p3_7) __extension__ ({ \
+  uint32x4_t __s0_7 = __p0_7; \
+  uint32x2_t __s2_7 = __p2_7; \
+  uint32x4_t __rev0_7;  __rev0_7 = __builtin_shufflevector(__s0_7, __s0_7, 3, 2, 1, 0); \
+  uint32x2_t __rev2_7;  __rev2_7 = __builtin_shufflevector(__s2_7, __s2_7, 1, 0); \
+  uint32x4_t __ret_7; \
+  __ret_7 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_7, __p3_7), __rev0_7, __p1_7); \
+  __ret_7 = __builtin_shufflevector(__ret_7, __ret_7, 3, 2, 1, 0); \
+  __ret_7; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_u64(__p0_8, __p1_8, __p2_8, __p3_8) __extension__ ({ \
+  uint64x2_t __s0_8 = __p0_8; \
+  uint64x1_t __s2_8 = __p2_8; \
+  uint64x2_t __ret_8; \
+  __ret_8 = vsetq_lane_u64(vget_lane_u64(__s2_8, __p3_8), __s0_8, __p1_8); \
+  __ret_8; \
+})
+#else
+#define vcopyq_lane_u64(__p0_9, __p1_9, __p2_9, __p3_9) __extension__ ({ \
+  uint64x2_t __s0_9 = __p0_9; \
+  uint64x1_t __s2_9 = __p2_9; \
+  uint64x2_t __rev0_9;  __rev0_9 = __builtin_shufflevector(__s0_9, __s0_9, 1, 0); \
+  uint64x2_t __ret_9; \
+  __ret_9 = __noswap_vsetq_lane_u64(__noswap_vget_lane_u64(__s2_9, __p3_9), __rev0_9, __p1_9); \
+  __ret_9 = __builtin_shufflevector(__ret_9, __ret_9, 1, 0); \
+  __ret_9; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_u16(__p0_10, __p1_10, __p2_10, __p3_10) __extension__ ({ \
+  uint16x8_t __s0_10 = __p0_10; \
+  uint16x4_t __s2_10 = __p2_10; \
+  uint16x8_t __ret_10; \
+  __ret_10 = vsetq_lane_u16(vget_lane_u16(__s2_10, __p3_10), __s0_10, __p1_10); \
+  __ret_10; \
+})
+#else
+#define vcopyq_lane_u16(__p0_11, __p1_11, __p2_11, __p3_11) __extension__ ({ \
+  uint16x8_t __s0_11 = __p0_11; \
+  uint16x4_t __s2_11 = __p2_11; \
+  uint16x8_t __rev0_11;  __rev0_11 = __builtin_shufflevector(__s0_11, __s0_11, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2_11;  __rev2_11 = __builtin_shufflevector(__s2_11, __s2_11, 3, 2, 1, 0); \
+  uint16x8_t __ret_11; \
+  __ret_11 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_11, __p3_11), __rev0_11, __p1_11); \
+  __ret_11 = __builtin_shufflevector(__ret_11, __ret_11, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_11; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_s8(__p0_12, __p1_12, __p2_12, __p3_12) __extension__ ({ \
+  int8x16_t __s0_12 = __p0_12; \
+  int8x8_t __s2_12 = __p2_12; \
+  int8x16_t __ret_12; \
+  __ret_12 = vsetq_lane_s8(vget_lane_s8(__s2_12, __p3_12), __s0_12, __p1_12); \
+  __ret_12; \
+})
+#else
+#define vcopyq_lane_s8(__p0_13, __p1_13, __p2_13, __p3_13) __extension__ ({ \
+  int8x16_t __s0_13 = __p0_13; \
+  int8x8_t __s2_13 = __p2_13; \
+  int8x16_t __rev0_13;  __rev0_13 = __builtin_shufflevector(__s0_13, __s0_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev2_13;  __rev2_13 = __builtin_shufflevector(__s2_13, __s2_13, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_13; \
+  __ret_13 = __noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_13, __p3_13), __rev0_13, __p1_13); \
+  __ret_13 = __builtin_shufflevector(__ret_13, __ret_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_13; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_f32(__p0_14, __p1_14, __p2_14, __p3_14) __extension__ ({ \
+  float32x4_t __s0_14 = __p0_14; \
+  float32x2_t __s2_14 = __p2_14; \
+  float32x4_t __ret_14; \
+  __ret_14 = vsetq_lane_f32(vget_lane_f32(__s2_14, __p3_14), __s0_14, __p1_14); \
+  __ret_14; \
+})
+#else
+#define vcopyq_lane_f32(__p0_15, __p1_15, __p2_15, __p3_15) __extension__ ({ \
+  float32x4_t __s0_15 = __p0_15; \
+  float32x2_t __s2_15 = __p2_15; \
+  float32x4_t __rev0_15;  __rev0_15 = __builtin_shufflevector(__s0_15, __s0_15, 3, 2, 1, 0); \
+  float32x2_t __rev2_15;  __rev2_15 = __builtin_shufflevector(__s2_15, __s2_15, 1, 0); \
+  float32x4_t __ret_15; \
+  __ret_15 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_15, __p3_15), __rev0_15, __p1_15); \
+  __ret_15 = __builtin_shufflevector(__ret_15, __ret_15, 3, 2, 1, 0); \
+  __ret_15; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_s32(__p0_16, __p1_16, __p2_16, __p3_16) __extension__ ({ \
+  int32x4_t __s0_16 = __p0_16; \
+  int32x2_t __s2_16 = __p2_16; \
+  int32x4_t __ret_16; \
+  __ret_16 = vsetq_lane_s32(vget_lane_s32(__s2_16, __p3_16), __s0_16, __p1_16); \
+  __ret_16; \
+})
+#else
+#define vcopyq_lane_s32(__p0_17, __p1_17, __p2_17, __p3_17) __extension__ ({ \
+  int32x4_t __s0_17 = __p0_17; \
+  int32x2_t __s2_17 = __p2_17; \
+  int32x4_t __rev0_17;  __rev0_17 = __builtin_shufflevector(__s0_17, __s0_17, 3, 2, 1, 0); \
+  int32x2_t __rev2_17;  __rev2_17 = __builtin_shufflevector(__s2_17, __s2_17, 1, 0); \
+  int32x4_t __ret_17; \
+  __ret_17 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_17, __p3_17), __rev0_17, __p1_17); \
+  __ret_17 = __builtin_shufflevector(__ret_17, __ret_17, 3, 2, 1, 0); \
+  __ret_17; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_s64(__p0_18, __p1_18, __p2_18, __p3_18) __extension__ ({ \
+  int64x2_t __s0_18 = __p0_18; \
+  int64x1_t __s2_18 = __p2_18; \
+  int64x2_t __ret_18; \
+  __ret_18 = vsetq_lane_s64(vget_lane_s64(__s2_18, __p3_18), __s0_18, __p1_18); \
+  __ret_18; \
+})
+#else
+#define vcopyq_lane_s64(__p0_19, __p1_19, __p2_19, __p3_19) __extension__ ({ \
+  int64x2_t __s0_19 = __p0_19; \
+  int64x1_t __s2_19 = __p2_19; \
+  int64x2_t __rev0_19;  __rev0_19 = __builtin_shufflevector(__s0_19, __s0_19, 1, 0); \
+  int64x2_t __ret_19; \
+  __ret_19 = __noswap_vsetq_lane_s64(__noswap_vget_lane_s64(__s2_19, __p3_19), __rev0_19, __p1_19); \
+  __ret_19 = __builtin_shufflevector(__ret_19, __ret_19, 1, 0); \
+  __ret_19; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_s16(__p0_20, __p1_20, __p2_20, __p3_20) __extension__ ({ \
+  int16x8_t __s0_20 = __p0_20; \
+  int16x4_t __s2_20 = __p2_20; \
+  int16x8_t __ret_20; \
+  __ret_20 = vsetq_lane_s16(vget_lane_s16(__s2_20, __p3_20), __s0_20, __p1_20); \
+  __ret_20; \
+})
+#else
+#define vcopyq_lane_s16(__p0_21, __p1_21, __p2_21, __p3_21) __extension__ ({ \
+  int16x8_t __s0_21 = __p0_21; \
+  int16x4_t __s2_21 = __p2_21; \
+  int16x8_t __rev0_21;  __rev0_21 = __builtin_shufflevector(__s0_21, __s0_21, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2_21;  __rev2_21 = __builtin_shufflevector(__s2_21, __s2_21, 3, 2, 1, 0); \
+  int16x8_t __ret_21; \
+  __ret_21 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_21, __p3_21), __rev0_21, __p1_21); \
+  __ret_21 = __builtin_shufflevector(__ret_21, __ret_21, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_21; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_p8(__p0_22, __p1_22, __p2_22, __p3_22) __extension__ ({ \
+  poly8x8_t __s0_22 = __p0_22; \
+  poly8x8_t __s2_22 = __p2_22; \
+  poly8x8_t __ret_22; \
+  __ret_22 = vset_lane_p8(vget_lane_p8(__s2_22, __p3_22), __s0_22, __p1_22); \
+  __ret_22; \
+})
+#else
+#define vcopy_lane_p8(__p0_23, __p1_23, __p2_23, __p3_23) __extension__ ({ \
+  poly8x8_t __s0_23 = __p0_23; \
+  poly8x8_t __s2_23 = __p2_23; \
+  poly8x8_t __rev0_23;  __rev0_23 = __builtin_shufflevector(__s0_23, __s0_23, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev2_23;  __rev2_23 = __builtin_shufflevector(__s2_23, __s2_23, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret_23; \
+  __ret_23 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_23, __p3_23), __rev0_23, __p1_23); \
+  __ret_23 = __builtin_shufflevector(__ret_23, __ret_23, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_23; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_p16(__p0_24, __p1_24, __p2_24, __p3_24) __extension__ ({ \
+  poly16x4_t __s0_24 = __p0_24; \
+  poly16x4_t __s2_24 = __p2_24; \
+  poly16x4_t __ret_24; \
+  __ret_24 = vset_lane_p16(vget_lane_p16(__s2_24, __p3_24), __s0_24, __p1_24); \
+  __ret_24; \
+})
+#else
+#define vcopy_lane_p16(__p0_25, __p1_25, __p2_25, __p3_25) __extension__ ({ \
+  poly16x4_t __s0_25 = __p0_25; \
+  poly16x4_t __s2_25 = __p2_25; \
+  poly16x4_t __rev0_25;  __rev0_25 = __builtin_shufflevector(__s0_25, __s0_25, 3, 2, 1, 0); \
+  poly16x4_t __rev2_25;  __rev2_25 = __builtin_shufflevector(__s2_25, __s2_25, 3, 2, 1, 0); \
+  poly16x4_t __ret_25; \
+  __ret_25 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_25, __p3_25), __rev0_25, __p1_25); \
+  __ret_25 = __builtin_shufflevector(__ret_25, __ret_25, 3, 2, 1, 0); \
+  __ret_25; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_u8(__p0_26, __p1_26, __p2_26, __p3_26) __extension__ ({ \
+  uint8x8_t __s0_26 = __p0_26; \
+  uint8x8_t __s2_26 = __p2_26; \
+  uint8x8_t __ret_26; \
+  __ret_26 = vset_lane_u8(vget_lane_u8(__s2_26, __p3_26), __s0_26, __p1_26); \
+  __ret_26; \
+})
+#else
+#define vcopy_lane_u8(__p0_27, __p1_27, __p2_27, __p3_27) __extension__ ({ \
+  uint8x8_t __s0_27 = __p0_27; \
+  uint8x8_t __s2_27 = __p2_27; \
+  uint8x8_t __rev0_27;  __rev0_27 = __builtin_shufflevector(__s0_27, __s0_27, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev2_27;  __rev2_27 = __builtin_shufflevector(__s2_27, __s2_27, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret_27; \
+  __ret_27 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_27, __p3_27), __rev0_27, __p1_27); \
+  __ret_27 = __builtin_shufflevector(__ret_27, __ret_27, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_27; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_u32(__p0_28, __p1_28, __p2_28, __p3_28) __extension__ ({ \
+  uint32x2_t __s0_28 = __p0_28; \
+  uint32x2_t __s2_28 = __p2_28; \
+  uint32x2_t __ret_28; \
+  __ret_28 = vset_lane_u32(vget_lane_u32(__s2_28, __p3_28), __s0_28, __p1_28); \
+  __ret_28; \
+})
+#else
+#define vcopy_lane_u32(__p0_29, __p1_29, __p2_29, __p3_29) __extension__ ({ \
+  uint32x2_t __s0_29 = __p0_29; \
+  uint32x2_t __s2_29 = __p2_29; \
+  uint32x2_t __rev0_29;  __rev0_29 = __builtin_shufflevector(__s0_29, __s0_29, 1, 0); \
+  uint32x2_t __rev2_29;  __rev2_29 = __builtin_shufflevector(__s2_29, __s2_29, 1, 0); \
+  uint32x2_t __ret_29; \
+  __ret_29 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_29, __p3_29), __rev0_29, __p1_29); \
+  __ret_29 = __builtin_shufflevector(__ret_29, __ret_29, 1, 0); \
+  __ret_29; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_u64(__p0_30, __p1_30, __p2_30, __p3_30) __extension__ ({ \
+  uint64x1_t __s0_30 = __p0_30; \
+  uint64x1_t __s2_30 = __p2_30; \
+  uint64x1_t __ret_30; \
+  __ret_30 = vset_lane_u64(vget_lane_u64(__s2_30, __p3_30), __s0_30, __p1_30); \
+  __ret_30; \
+})
+#else
+#define vcopy_lane_u64(__p0_31, __p1_31, __p2_31, __p3_31) __extension__ ({ \
+  uint64x1_t __s0_31 = __p0_31; \
+  uint64x1_t __s2_31 = __p2_31; \
+  uint64x1_t __ret_31; \
+  __ret_31 = __noswap_vset_lane_u64(__noswap_vget_lane_u64(__s2_31, __p3_31), __s0_31, __p1_31); \
+  __ret_31; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_u16(__p0_32, __p1_32, __p2_32, __p3_32) __extension__ ({ \
+  uint16x4_t __s0_32 = __p0_32; \
+  uint16x4_t __s2_32 = __p2_32; \
+  uint16x4_t __ret_32; \
+  __ret_32 = vset_lane_u16(vget_lane_u16(__s2_32, __p3_32), __s0_32, __p1_32); \
+  __ret_32; \
+})
+#else
+#define vcopy_lane_u16(__p0_33, __p1_33, __p2_33, __p3_33) __extension__ ({ \
+  uint16x4_t __s0_33 = __p0_33; \
+  uint16x4_t __s2_33 = __p2_33; \
+  uint16x4_t __rev0_33;  __rev0_33 = __builtin_shufflevector(__s0_33, __s0_33, 3, 2, 1, 0); \
+  uint16x4_t __rev2_33;  __rev2_33 = __builtin_shufflevector(__s2_33, __s2_33, 3, 2, 1, 0); \
+  uint16x4_t __ret_33; \
+  __ret_33 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_33, __p3_33), __rev0_33, __p1_33); \
+  __ret_33 = __builtin_shufflevector(__ret_33, __ret_33, 3, 2, 1, 0); \
+  __ret_33; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_s8(__p0_34, __p1_34, __p2_34, __p3_34) __extension__ ({ \
+  int8x8_t __s0_34 = __p0_34; \
+  int8x8_t __s2_34 = __p2_34; \
+  int8x8_t __ret_34; \
+  __ret_34 = vset_lane_s8(vget_lane_s8(__s2_34, __p3_34), __s0_34, __p1_34); \
+  __ret_34; \
+})
+#else
+#define vcopy_lane_s8(__p0_35, __p1_35, __p2_35, __p3_35) __extension__ ({ \
+  int8x8_t __s0_35 = __p0_35; \
+  int8x8_t __s2_35 = __p2_35; \
+  int8x8_t __rev0_35;  __rev0_35 = __builtin_shufflevector(__s0_35, __s0_35, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev2_35;  __rev2_35 = __builtin_shufflevector(__s2_35, __s2_35, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret_35; \
+  __ret_35 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_35, __p3_35), __rev0_35, __p1_35); \
+  __ret_35 = __builtin_shufflevector(__ret_35, __ret_35, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_35; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_f32(__p0_36, __p1_36, __p2_36, __p3_36) __extension__ ({ \
+  float32x2_t __s0_36 = __p0_36; \
+  float32x2_t __s2_36 = __p2_36; \
+  float32x2_t __ret_36; \
+  __ret_36 = vset_lane_f32(vget_lane_f32(__s2_36, __p3_36), __s0_36, __p1_36); \
+  __ret_36; \
+})
+#else
+#define vcopy_lane_f32(__p0_37, __p1_37, __p2_37, __p3_37) __extension__ ({ \
+  float32x2_t __s0_37 = __p0_37; \
+  float32x2_t __s2_37 = __p2_37; \
+  float32x2_t __rev0_37;  __rev0_37 = __builtin_shufflevector(__s0_37, __s0_37, 1, 0); \
+  float32x2_t __rev2_37;  __rev2_37 = __builtin_shufflevector(__s2_37, __s2_37, 1, 0); \
+  float32x2_t __ret_37; \
+  __ret_37 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_37, __p3_37), __rev0_37, __p1_37); \
+  __ret_37 = __builtin_shufflevector(__ret_37, __ret_37, 1, 0); \
+  __ret_37; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_s32(__p0_38, __p1_38, __p2_38, __p3_38) __extension__ ({ \
+  int32x2_t __s0_38 = __p0_38; \
+  int32x2_t __s2_38 = __p2_38; \
+  int32x2_t __ret_38; \
+  __ret_38 = vset_lane_s32(vget_lane_s32(__s2_38, __p3_38), __s0_38, __p1_38); \
+  __ret_38; \
+})
+#else
+#define vcopy_lane_s32(__p0_39, __p1_39, __p2_39, __p3_39) __extension__ ({ \
+  int32x2_t __s0_39 = __p0_39; \
+  int32x2_t __s2_39 = __p2_39; \
+  int32x2_t __rev0_39;  __rev0_39 = __builtin_shufflevector(__s0_39, __s0_39, 1, 0); \
+  int32x2_t __rev2_39;  __rev2_39 = __builtin_shufflevector(__s2_39, __s2_39, 1, 0); \
+  int32x2_t __ret_39; \
+  __ret_39 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_39, __p3_39), __rev0_39, __p1_39); \
+  __ret_39 = __builtin_shufflevector(__ret_39, __ret_39, 1, 0); \
+  __ret_39; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_s64(__p0_40, __p1_40, __p2_40, __p3_40) __extension__ ({ \
+  int64x1_t __s0_40 = __p0_40; \
+  int64x1_t __s2_40 = __p2_40; \
+  int64x1_t __ret_40; \
+  __ret_40 = vset_lane_s64(vget_lane_s64(__s2_40, __p3_40), __s0_40, __p1_40); \
+  __ret_40; \
+})
+#else
+#define vcopy_lane_s64(__p0_41, __p1_41, __p2_41, __p3_41) __extension__ ({ \
+  int64x1_t __s0_41 = __p0_41; \
+  int64x1_t __s2_41 = __p2_41; \
+  int64x1_t __ret_41; \
+  __ret_41 = __noswap_vset_lane_s64(__noswap_vget_lane_s64(__s2_41, __p3_41), __s0_41, __p1_41); \
+  __ret_41; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_s16(__p0_42, __p1_42, __p2_42, __p3_42) __extension__ ({ \
+  int16x4_t __s0_42 = __p0_42; \
+  int16x4_t __s2_42 = __p2_42; \
+  int16x4_t __ret_42; \
+  __ret_42 = vset_lane_s16(vget_lane_s16(__s2_42, __p3_42), __s0_42, __p1_42); \
+  __ret_42; \
+})
+#else
+#define vcopy_lane_s16(__p0_43, __p1_43, __p2_43, __p3_43) __extension__ ({ \
+  int16x4_t __s0_43 = __p0_43; \
+  int16x4_t __s2_43 = __p2_43; \
+  int16x4_t __rev0_43;  __rev0_43 = __builtin_shufflevector(__s0_43, __s0_43, 3, 2, 1, 0); \
+  int16x4_t __rev2_43;  __rev2_43 = __builtin_shufflevector(__s2_43, __s2_43, 3, 2, 1, 0); \
+  int16x4_t __ret_43; \
+  __ret_43 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_43, __p3_43), __rev0_43, __p1_43); \
+  __ret_43 = __builtin_shufflevector(__ret_43, __ret_43, 3, 2, 1, 0); \
+  __ret_43; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_p8(__p0_44, __p1_44, __p2_44, __p3_44) __extension__ ({ \
+  poly8x16_t __s0_44 = __p0_44; \
+  poly8x16_t __s2_44 = __p2_44; \
+  poly8x16_t __ret_44; \
+  __ret_44 = vsetq_lane_p8(vgetq_lane_p8(__s2_44, __p3_44), __s0_44, __p1_44); \
+  __ret_44; \
+})
+#else
+#define vcopyq_laneq_p8(__p0_45, __p1_45, __p2_45, __p3_45) __extension__ ({ \
+  poly8x16_t __s0_45 = __p0_45; \
+  poly8x16_t __s2_45 = __p2_45; \
+  poly8x16_t __rev0_45;  __rev0_45 = __builtin_shufflevector(__s0_45, __s0_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev2_45;  __rev2_45 = __builtin_shufflevector(__s2_45, __s2_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret_45; \
+  __ret_45 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_45, __p3_45), __rev0_45, __p1_45); \
+  __ret_45 = __builtin_shufflevector(__ret_45, __ret_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_45; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_p16(__p0_46, __p1_46, __p2_46, __p3_46) __extension__ ({ \
+  poly16x8_t __s0_46 = __p0_46; \
+  poly16x8_t __s2_46 = __p2_46; \
+  poly16x8_t __ret_46; \
+  __ret_46 = vsetq_lane_p16(vgetq_lane_p16(__s2_46, __p3_46), __s0_46, __p1_46); \
+  __ret_46; \
+})
+#else
+#define vcopyq_laneq_p16(__p0_47, __p1_47, __p2_47, __p3_47) __extension__ ({ \
+  poly16x8_t __s0_47 = __p0_47; \
+  poly16x8_t __s2_47 = __p2_47; \
+  poly16x8_t __rev0_47;  __rev0_47 = __builtin_shufflevector(__s0_47, __s0_47, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __rev2_47;  __rev2_47 = __builtin_shufflevector(__s2_47, __s2_47, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret_47; \
+  __ret_47 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_47, __p3_47), __rev0_47, __p1_47); \
+  __ret_47 = __builtin_shufflevector(__ret_47, __ret_47, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_47; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_u8(__p0_48, __p1_48, __p2_48, __p3_48) __extension__ ({ \
+  uint8x16_t __s0_48 = __p0_48; \
+  uint8x16_t __s2_48 = __p2_48; \
+  uint8x16_t __ret_48; \
+  __ret_48 = vsetq_lane_u8(vgetq_lane_u8(__s2_48, __p3_48), __s0_48, __p1_48); \
+  __ret_48; \
+})
+#else
+#define vcopyq_laneq_u8(__p0_49, __p1_49, __p2_49, __p3_49) __extension__ ({ \
+  uint8x16_t __s0_49 = __p0_49; \
+  uint8x16_t __s2_49 = __p2_49; \
+  uint8x16_t __rev0_49;  __rev0_49 = __builtin_shufflevector(__s0_49, __s0_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev2_49;  __rev2_49 = __builtin_shufflevector(__s2_49, __s2_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_49; \
+  __ret_49 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_49, __p3_49), __rev0_49, __p1_49); \
+  __ret_49 = __builtin_shufflevector(__ret_49, __ret_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_49; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_u32(__p0_50, __p1_50, __p2_50, __p3_50) __extension__ ({ \
+  uint32x4_t __s0_50 = __p0_50; \
+  uint32x4_t __s2_50 = __p2_50; \
+  uint32x4_t __ret_50; \
+  __ret_50 = vsetq_lane_u32(vgetq_lane_u32(__s2_50, __p3_50), __s0_50, __p1_50); \
+  __ret_50; \
+})
+#else
+#define vcopyq_laneq_u32(__p0_51, __p1_51, __p2_51, __p3_51) __extension__ ({ \
+  uint32x4_t __s0_51 = __p0_51; \
+  uint32x4_t __s2_51 = __p2_51; \
+  uint32x4_t __rev0_51;  __rev0_51 = __builtin_shufflevector(__s0_51, __s0_51, 3, 2, 1, 0); \
+  uint32x4_t __rev2_51;  __rev2_51 = __builtin_shufflevector(__s2_51, __s2_51, 3, 2, 1, 0); \
+  uint32x4_t __ret_51; \
+  __ret_51 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_51, __p3_51), __rev0_51, __p1_51); \
+  __ret_51 = __builtin_shufflevector(__ret_51, __ret_51, 3, 2, 1, 0); \
+  __ret_51; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_u64(__p0_52, __p1_52, __p2_52, __p3_52) __extension__ ({ \
+  uint64x2_t __s0_52 = __p0_52; \
+  uint64x2_t __s2_52 = __p2_52; \
+  uint64x2_t __ret_52; \
+  __ret_52 = vsetq_lane_u64(vgetq_lane_u64(__s2_52, __p3_52), __s0_52, __p1_52); \
+  __ret_52; \
+})
+#else
+#define vcopyq_laneq_u64(__p0_53, __p1_53, __p2_53, __p3_53) __extension__ ({ \
+  uint64x2_t __s0_53 = __p0_53; \
+  uint64x2_t __s2_53 = __p2_53; \
+  uint64x2_t __rev0_53;  __rev0_53 = __builtin_shufflevector(__s0_53, __s0_53, 1, 0); \
+  uint64x2_t __rev2_53;  __rev2_53 = __builtin_shufflevector(__s2_53, __s2_53, 1, 0); \
+  uint64x2_t __ret_53; \
+  __ret_53 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_53, __p3_53), __rev0_53, __p1_53); \
+  __ret_53 = __builtin_shufflevector(__ret_53, __ret_53, 1, 0); \
+  __ret_53; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_u16(__p0_54, __p1_54, __p2_54, __p3_54) __extension__ ({ \
+  uint16x8_t __s0_54 = __p0_54; \
+  uint16x8_t __s2_54 = __p2_54; \
+  uint16x8_t __ret_54; \
+  __ret_54 = vsetq_lane_u16(vgetq_lane_u16(__s2_54, __p3_54), __s0_54, __p1_54); \
+  __ret_54; \
+})
+#else
+#define vcopyq_laneq_u16(__p0_55, __p1_55, __p2_55, __p3_55) __extension__ ({ \
+  uint16x8_t __s0_55 = __p0_55; \
+  uint16x8_t __s2_55 = __p2_55; \
+  uint16x8_t __rev0_55;  __rev0_55 = __builtin_shufflevector(__s0_55, __s0_55, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2_55;  __rev2_55 = __builtin_shufflevector(__s2_55, __s2_55, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret_55; \
+  __ret_55 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_55, __p3_55), __rev0_55, __p1_55); \
+  __ret_55 = __builtin_shufflevector(__ret_55, __ret_55, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_55; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_s8(__p0_56, __p1_56, __p2_56, __p3_56) __extension__ ({ \
+  int8x16_t __s0_56 = __p0_56; \
+  int8x16_t __s2_56 = __p2_56; \
+  int8x16_t __ret_56; \
+  __ret_56 = vsetq_lane_s8(vgetq_lane_s8(__s2_56, __p3_56), __s0_56, __p1_56); \
+  __ret_56; \
+})
+#else
+#define vcopyq_laneq_s8(__p0_57, __p1_57, __p2_57, __p3_57) __extension__ ({ \
+  int8x16_t __s0_57 = __p0_57; \
+  int8x16_t __s2_57 = __p2_57; \
+  int8x16_t __rev0_57;  __rev0_57 = __builtin_shufflevector(__s0_57, __s0_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev2_57;  __rev2_57 = __builtin_shufflevector(__s2_57, __s2_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_57; \
+  __ret_57 = __noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_57, __p3_57), __rev0_57, __p1_57); \
+  __ret_57 = __builtin_shufflevector(__ret_57, __ret_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_57; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_f32(__p0_58, __p1_58, __p2_58, __p3_58) __extension__ ({ \
+  float32x4_t __s0_58 = __p0_58; \
+  float32x4_t __s2_58 = __p2_58; \
+  float32x4_t __ret_58; \
+  __ret_58 = vsetq_lane_f32(vgetq_lane_f32(__s2_58, __p3_58), __s0_58, __p1_58); \
+  __ret_58; \
+})
+#else
+#define vcopyq_laneq_f32(__p0_59, __p1_59, __p2_59, __p3_59) __extension__ ({ \
+  float32x4_t __s0_59 = __p0_59; \
+  float32x4_t __s2_59 = __p2_59; \
+  float32x4_t __rev0_59;  __rev0_59 = __builtin_shufflevector(__s0_59, __s0_59, 3, 2, 1, 0); \
+  float32x4_t __rev2_59;  __rev2_59 = __builtin_shufflevector(__s2_59, __s2_59, 3, 2, 1, 0); \
+  float32x4_t __ret_59; \
+  __ret_59 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_59, __p3_59), __rev0_59, __p1_59); \
+  __ret_59 = __builtin_shufflevector(__ret_59, __ret_59, 3, 2, 1, 0); \
+  __ret_59; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_s32(__p0_60, __p1_60, __p2_60, __p3_60) __extension__ ({ \
+  int32x4_t __s0_60 = __p0_60; \
+  int32x4_t __s2_60 = __p2_60; \
+  int32x4_t __ret_60; \
+  __ret_60 = vsetq_lane_s32(vgetq_lane_s32(__s2_60, __p3_60), __s0_60, __p1_60); \
+  __ret_60; \
+})
+#else
+#define vcopyq_laneq_s32(__p0_61, __p1_61, __p2_61, __p3_61) __extension__ ({ \
+  int32x4_t __s0_61 = __p0_61; \
+  int32x4_t __s2_61 = __p2_61; \
+  int32x4_t __rev0_61;  __rev0_61 = __builtin_shufflevector(__s0_61, __s0_61, 3, 2, 1, 0); \
+  int32x4_t __rev2_61;  __rev2_61 = __builtin_shufflevector(__s2_61, __s2_61, 3, 2, 1, 0); \
+  int32x4_t __ret_61; \
+  __ret_61 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_61, __p3_61), __rev0_61, __p1_61); \
+  __ret_61 = __builtin_shufflevector(__ret_61, __ret_61, 3, 2, 1, 0); \
+  __ret_61; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_s64(__p0_62, __p1_62, __p2_62, __p3_62) __extension__ ({ \
+  int64x2_t __s0_62 = __p0_62; \
+  int64x2_t __s2_62 = __p2_62; \
+  int64x2_t __ret_62; \
+  __ret_62 = vsetq_lane_s64(vgetq_lane_s64(__s2_62, __p3_62), __s0_62, __p1_62); \
+  __ret_62; \
+})
+#else
+#define vcopyq_laneq_s64(__p0_63, __p1_63, __p2_63, __p3_63) __extension__ ({ \
+  int64x2_t __s0_63 = __p0_63; \
+  int64x2_t __s2_63 = __p2_63; \
+  int64x2_t __rev0_63;  __rev0_63 = __builtin_shufflevector(__s0_63, __s0_63, 1, 0); \
+  int64x2_t __rev2_63;  __rev2_63 = __builtin_shufflevector(__s2_63, __s2_63, 1, 0); \
+  int64x2_t __ret_63; \
+  __ret_63 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_63, __p3_63), __rev0_63, __p1_63); \
+  __ret_63 = __builtin_shufflevector(__ret_63, __ret_63, 1, 0); \
+  __ret_63; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_s16(__p0_64, __p1_64, __p2_64, __p3_64) __extension__ ({ \
+  int16x8_t __s0_64 = __p0_64; \
+  int16x8_t __s2_64 = __p2_64; \
+  int16x8_t __ret_64; \
+  __ret_64 = vsetq_lane_s16(vgetq_lane_s16(__s2_64, __p3_64), __s0_64, __p1_64); \
+  __ret_64; \
+})
+#else
+#define vcopyq_laneq_s16(__p0_65, __p1_65, __p2_65, __p3_65) __extension__ ({ \
+  int16x8_t __s0_65 = __p0_65; \
+  int16x8_t __s2_65 = __p2_65; \
+  int16x8_t __rev0_65;  __rev0_65 = __builtin_shufflevector(__s0_65, __s0_65, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2_65;  __rev2_65 = __builtin_shufflevector(__s2_65, __s2_65, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret_65; \
+  __ret_65 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_65, __p3_65), __rev0_65, __p1_65); \
+  __ret_65 = __builtin_shufflevector(__ret_65, __ret_65, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_65; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_p8(__p0_66, __p1_66, __p2_66, __p3_66) __extension__ ({ \
+  poly8x8_t __s0_66 = __p0_66; \
+  poly8x16_t __s2_66 = __p2_66; \
+  poly8x8_t __ret_66; \
+  __ret_66 = vset_lane_p8(vgetq_lane_p8(__s2_66, __p3_66), __s0_66, __p1_66); \
+  __ret_66; \
+})
+#else
+#define vcopy_laneq_p8(__p0_67, __p1_67, __p2_67, __p3_67) __extension__ ({ \
+  poly8x8_t __s0_67 = __p0_67; \
+  poly8x16_t __s2_67 = __p2_67; \
+  poly8x8_t __rev0_67;  __rev0_67 = __builtin_shufflevector(__s0_67, __s0_67, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev2_67;  __rev2_67 = __builtin_shufflevector(__s2_67, __s2_67, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret_67; \
+  __ret_67 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_67, __p3_67), __rev0_67, __p1_67); \
+  __ret_67 = __builtin_shufflevector(__ret_67, __ret_67, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_67; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_p16(__p0_68, __p1_68, __p2_68, __p3_68) __extension__ ({ \
+  poly16x4_t __s0_68 = __p0_68; \
+  poly16x8_t __s2_68 = __p2_68; \
+  poly16x4_t __ret_68; \
+  __ret_68 = vset_lane_p16(vgetq_lane_p16(__s2_68, __p3_68), __s0_68, __p1_68); \
+  __ret_68; \
+})
+#else
+#define vcopy_laneq_p16(__p0_69, __p1_69, __p2_69, __p3_69) __extension__ ({ \
+  poly16x4_t __s0_69 = __p0_69; \
+  poly16x8_t __s2_69 = __p2_69; \
+  poly16x4_t __rev0_69;  __rev0_69 = __builtin_shufflevector(__s0_69, __s0_69, 3, 2, 1, 0); \
+  poly16x8_t __rev2_69;  __rev2_69 = __builtin_shufflevector(__s2_69, __s2_69, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x4_t __ret_69; \
+  __ret_69 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_69, __p3_69), __rev0_69, __p1_69); \
+  __ret_69 = __builtin_shufflevector(__ret_69, __ret_69, 3, 2, 1, 0); \
+  __ret_69; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_u8(__p0_70, __p1_70, __p2_70, __p3_70) __extension__ ({ \
+  uint8x8_t __s0_70 = __p0_70; \
+  uint8x16_t __s2_70 = __p2_70; \
+  uint8x8_t __ret_70; \
+  __ret_70 = vset_lane_u8(vgetq_lane_u8(__s2_70, __p3_70), __s0_70, __p1_70); \
+  __ret_70; \
+})
+#else
+#define vcopy_laneq_u8(__p0_71, __p1_71, __p2_71, __p3_71) __extension__ ({ \
+  uint8x8_t __s0_71 = __p0_71; \
+  uint8x16_t __s2_71 = __p2_71; \
+  uint8x8_t __rev0_71;  __rev0_71 = __builtin_shufflevector(__s0_71, __s0_71, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev2_71;  __rev2_71 = __builtin_shufflevector(__s2_71, __s2_71, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret_71; \
+  __ret_71 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_71, __p3_71), __rev0_71, __p1_71); \
+  __ret_71 = __builtin_shufflevector(__ret_71, __ret_71, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_71; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_u32(__p0_72, __p1_72, __p2_72, __p3_72) __extension__ ({ \
+  uint32x2_t __s0_72 = __p0_72; \
+  uint32x4_t __s2_72 = __p2_72; \
+  uint32x2_t __ret_72; \
+  __ret_72 = vset_lane_u32(vgetq_lane_u32(__s2_72, __p3_72), __s0_72, __p1_72); \
+  __ret_72; \
+})
+#else
+#define vcopy_laneq_u32(__p0_73, __p1_73, __p2_73, __p3_73) __extension__ ({ \
+  uint32x2_t __s0_73 = __p0_73; \
+  uint32x4_t __s2_73 = __p2_73; \
+  uint32x2_t __rev0_73;  __rev0_73 = __builtin_shufflevector(__s0_73, __s0_73, 1, 0); \
+  uint32x4_t __rev2_73;  __rev2_73 = __builtin_shufflevector(__s2_73, __s2_73, 3, 2, 1, 0); \
+  uint32x2_t __ret_73; \
+  __ret_73 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_73, __p3_73), __rev0_73, __p1_73); \
+  __ret_73 = __builtin_shufflevector(__ret_73, __ret_73, 1, 0); \
+  __ret_73; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_u64(__p0_74, __p1_74, __p2_74, __p3_74) __extension__ ({ \
+  uint64x1_t __s0_74 = __p0_74; \
+  uint64x2_t __s2_74 = __p2_74; \
+  uint64x1_t __ret_74; \
+  __ret_74 = vset_lane_u64(vgetq_lane_u64(__s2_74, __p3_74), __s0_74, __p1_74); \
+  __ret_74; \
+})
+#else
+#define vcopy_laneq_u64(__p0_75, __p1_75, __p2_75, __p3_75) __extension__ ({ \
+  uint64x1_t __s0_75 = __p0_75; \
+  uint64x2_t __s2_75 = __p2_75; \
+  uint64x2_t __rev2_75;  __rev2_75 = __builtin_shufflevector(__s2_75, __s2_75, 1, 0); \
+  uint64x1_t __ret_75; \
+  __ret_75 = __noswap_vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_75, __p3_75), __s0_75, __p1_75); \
+  __ret_75; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_u16(__p0_76, __p1_76, __p2_76, __p3_76) __extension__ ({ \
+  uint16x4_t __s0_76 = __p0_76; \
+  uint16x8_t __s2_76 = __p2_76; \
+  uint16x4_t __ret_76; \
+  __ret_76 = vset_lane_u16(vgetq_lane_u16(__s2_76, __p3_76), __s0_76, __p1_76); \
+  __ret_76; \
+})
+#else
+#define vcopy_laneq_u16(__p0_77, __p1_77, __p2_77, __p3_77) __extension__ ({ \
+  uint16x4_t __s0_77 = __p0_77; \
+  uint16x8_t __s2_77 = __p2_77; \
+  uint16x4_t __rev0_77;  __rev0_77 = __builtin_shufflevector(__s0_77, __s0_77, 3, 2, 1, 0); \
+  uint16x8_t __rev2_77;  __rev2_77 = __builtin_shufflevector(__s2_77, __s2_77, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret_77; \
+  __ret_77 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_77, __p3_77), __rev0_77, __p1_77); \
+  __ret_77 = __builtin_shufflevector(__ret_77, __ret_77, 3, 2, 1, 0); \
+  __ret_77; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s8(__p0_78, __p1_78, __p2_78, __p3_78) __extension__ ({ \
+  int8x8_t __s0_78 = __p0_78; \
+  int8x16_t __s2_78 = __p2_78; \
+  int8x8_t __ret_78; \
+  __ret_78 = vset_lane_s8(vgetq_lane_s8(__s2_78, __p3_78), __s0_78, __p1_78); \
+  __ret_78; \
+})
+#else
+#define vcopy_laneq_s8(__p0_79, __p1_79, __p2_79, __p3_79) __extension__ ({ \
+  int8x8_t __s0_79 = __p0_79; \
+  int8x16_t __s2_79 = __p2_79; \
+  int8x8_t __rev0_79;  __rev0_79 = __builtin_shufflevector(__s0_79, __s0_79, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev2_79;  __rev2_79 = __builtin_shufflevector(__s2_79, __s2_79, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret_79; \
+  __ret_79 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_79, __p3_79), __rev0_79, __p1_79); \
+  __ret_79 = __builtin_shufflevector(__ret_79, __ret_79, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_79; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_f32(__p0_80, __p1_80, __p2_80, __p3_80) __extension__ ({ \
+  float32x2_t __s0_80 = __p0_80; \
+  float32x4_t __s2_80 = __p2_80; \
+  float32x2_t __ret_80; \
+  __ret_80 = vset_lane_f32(vgetq_lane_f32(__s2_80, __p3_80), __s0_80, __p1_80); \
+  __ret_80; \
+})
+#else
+#define vcopy_laneq_f32(__p0_81, __p1_81, __p2_81, __p3_81) __extension__ ({ \
+  float32x2_t __s0_81 = __p0_81; \
+  float32x4_t __s2_81 = __p2_81; \
+  float32x2_t __rev0_81;  __rev0_81 = __builtin_shufflevector(__s0_81, __s0_81, 1, 0); \
+  float32x4_t __rev2_81;  __rev2_81 = __builtin_shufflevector(__s2_81, __s2_81, 3, 2, 1, 0); \
+  float32x2_t __ret_81; \
+  __ret_81 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_81, __p3_81), __rev0_81, __p1_81); \
+  __ret_81 = __builtin_shufflevector(__ret_81, __ret_81, 1, 0); \
+  __ret_81; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s32(__p0_82, __p1_82, __p2_82, __p3_82) __extension__ ({ \
+  int32x2_t __s0_82 = __p0_82; \
+  int32x4_t __s2_82 = __p2_82; \
+  int32x2_t __ret_82; \
+  __ret_82 = vset_lane_s32(vgetq_lane_s32(__s2_82, __p3_82), __s0_82, __p1_82); \
+  __ret_82; \
+})
+#else
+#define vcopy_laneq_s32(__p0_83, __p1_83, __p2_83, __p3_83) __extension__ ({ \
+  int32x2_t __s0_83 = __p0_83; \
+  int32x4_t __s2_83 = __p2_83; \
+  int32x2_t __rev0_83;  __rev0_83 = __builtin_shufflevector(__s0_83, __s0_83, 1, 0); \
+  int32x4_t __rev2_83;  __rev2_83 = __builtin_shufflevector(__s2_83, __s2_83, 3, 2, 1, 0); \
+  int32x2_t __ret_83; \
+  __ret_83 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_83, __p3_83), __rev0_83, __p1_83); \
+  __ret_83 = __builtin_shufflevector(__ret_83, __ret_83, 1, 0); \
+  __ret_83; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s64(__p0_84, __p1_84, __p2_84, __p3_84) __extension__ ({ \
+  int64x1_t __s0_84 = __p0_84; \
+  int64x2_t __s2_84 = __p2_84; \
+  int64x1_t __ret_84; \
+  __ret_84 = vset_lane_s64(vgetq_lane_s64(__s2_84, __p3_84), __s0_84, __p1_84); \
+  __ret_84; \
+})
+#else
+#define vcopy_laneq_s64(__p0_85, __p1_85, __p2_85, __p3_85) __extension__ ({ \
+  int64x1_t __s0_85 = __p0_85; \
+  int64x2_t __s2_85 = __p2_85; \
+  int64x2_t __rev2_85;  __rev2_85 = __builtin_shufflevector(__s2_85, __s2_85, 1, 0); \
+  int64x1_t __ret_85; \
+  __ret_85 = __noswap_vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_85, __p3_85), __s0_85, __p1_85); \
+  __ret_85; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s16(__p0_86, __p1_86, __p2_86, __p3_86) __extension__ ({ \
+  int16x4_t __s0_86 = __p0_86; \
+  int16x8_t __s2_86 = __p2_86; \
+  int16x4_t __ret_86; \
+  __ret_86 = vset_lane_s16(vgetq_lane_s16(__s2_86, __p3_86), __s0_86, __p1_86); \
+  __ret_86; \
+})
+#else
+#define vcopy_laneq_s16(__p0_87, __p1_87, __p2_87, __p3_87) __extension__ ({ \
+  int16x4_t __s0_87 = __p0_87; \
+  int16x8_t __s2_87 = __p2_87; \
+  int16x4_t __rev0_87;  __rev0_87 = __builtin_shufflevector(__s0_87, __s0_87, 3, 2, 1, 0); \
+  int16x8_t __rev2_87;  __rev2_87 = __builtin_shufflevector(__s2_87, __s2_87, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret_87; \
+  __ret_87 = __noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_87, __p3_87), __rev0_87, __p1_87); \
+  __ret_87 = __builtin_shufflevector(__ret_87, __ret_87, 3, 2, 1, 0); \
+  __ret_87; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vcreate_p64(uint64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vcreate_p64(uint64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vcreate_f64(uint64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vcreate_f64(uint64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vcvts_f32_s32(int32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvts_f32_s32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vcvts_f32_s32(int32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvts_f32_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vcvts_f32_u32(uint32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvts_f32_u32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vcvts_f32_u32(uint32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvts_f32_u32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcvt_f32_f64(float64x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_f64((int8x16_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vcvt_f32_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_f64((int8x16_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vcvt_f32_f64(float64x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_f64((int8x16_t)__p0, 9);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vcvtd_f64_s64(int64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vcvtd_f64_s64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vcvtd_f64_s64(int64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vcvtd_f64_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vcvtd_f64_u64(uint64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vcvtd_f64_u64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vcvtd_f64_u64(uint64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vcvtd_f64_u64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcvtq_f64_u64(uint64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai float64x2_t vcvtq_f64_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcvtq_f64_s64(int64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai float64x2_t vcvtq_f64_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vcvt_f64_u64(uint64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai float64x1_t vcvt_f64_u64(uint64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vcvt_f64_s64(int64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai float64x1_t vcvt_f64_s64(int64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcvt_f64_f32(float32x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvt_f64_f32((int8x8_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vcvt_f64_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvt_f64_f32((int8x8_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float64x2_t __noswap_vcvt_f64_f32(float32x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvt_f64_f32((int8x8_t)__p0, 42);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vcvt_high_f16_f32(float16x4_t __p0, float32x4_t __p1) {
+  float16x8_t __ret;
+  __ret = vcombine_f16(__p0, vcvt_f16_f32(__p1));
+  return __ret;
+}
+#else
+__ai float16x8_t vcvt_high_f16_f32(float16x4_t __p0, float32x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __noswap_vcombine_f16(__rev0, __noswap_vcvt_f16_f32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvt_high_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = vcvt_f32_f16(vget_high_f16(__p0));
+  return __ret;
+}
+#else
+__ai float32x4_t vcvt_high_f32_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vcvt_f32_f16(__noswap_vget_high_f16(__rev0));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvt_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
+  float32x4_t __ret;
+  __ret = vcombine_f32(__p0, vcvt_f32_f64(__p1));
+  return __ret;
+}
+#else
+__ai float32x4_t vcvt_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vcombine_f32(__rev0, __noswap_vcvt_f32_f64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcvt_high_f64_f32(float32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = vcvt_f64_f32(vget_high_f32(__p0));
+  return __ret;
+}
+#else
+__ai float64x2_t vcvt_high_f64_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float64x2_t __ret;
+  __ret = __noswap_vcvt_f64_f32(__noswap_vget_high_f32(__rev0));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvts_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vcvts_n_f32_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvts_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vcvts_n_f32_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvts_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vcvts_n_f32_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvts_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vcvts_n_f32_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vcvt_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vcvt_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtd_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvtd_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtd_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvtd_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvts_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvts_n_s32_f32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvts_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvts_n_s32_f32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vcvtq_n_s64_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vcvtq_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vcvtq_n_s64_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vcvt_n_s64_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vcvt_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vcvt_n_s64_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtd_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtd_n_s64_f64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvtd_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtd_n_s64_f64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvts_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvts_n_u32_f32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvts_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvts_n_u32_f32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vcvtq_n_u64_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vcvtq_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vcvtq_n_u64_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vcvt_n_u64_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vcvt_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vcvt_n_u64_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtd_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtd_n_u64_f64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvtd_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtd_n_u64_f64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvts_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvts_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvts_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvts_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtd_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtd_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvt_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvt_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvt_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvt_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvts_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvts_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvts_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvts_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtd_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtd_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvt_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvt_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvt_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvt_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvtas_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtas_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvtas_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtas_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtad_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtad_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtad_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtad_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvtas_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtas_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvtas_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtas_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtad_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtad_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtad_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtad_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvtms_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtms_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvtms_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtms_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtmd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtmd_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtmd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtmd_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvtms_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtms_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvtms_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtms_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtmd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtmd_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtmd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtmd_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvtns_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtns_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvtns_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtns_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtnd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtnd_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtnd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtnd_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvtns_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtns_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvtns_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtns_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtnd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtnd_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtnd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtnd_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvtps_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtps_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvtps_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtps_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtpd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtpd_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtpd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtpd_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvtps_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtps_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvtps_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtps_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtpd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtpd_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtpd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtpd_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vcvtxd_f32_f64(float64_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvtxd_f32_f64(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vcvtxd_f32_f64(float64_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvtxd_f32_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcvtx_f32_f64(float64x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvtx_f32_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float32x2_t vcvtx_f32_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvtx_f32_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vcvtx_f32_f64(float64x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvtx_f32_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvtx_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
+  float32x4_t __ret;
+  __ret = vcombine_f32(__p0, vcvtx_f32_f64(__p1));
+  return __ret;
+}
+#else
+__ai float32x4_t vcvtx_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vcombine_f32(__rev0, __noswap_vcvtx_f32_f64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vdivq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float64x2_t vdivq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 / __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vdivq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float32x4_t vdivq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 / __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vdiv_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float64x1_t vdiv_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vdiv_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float32x2_t vdiv_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 / __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vdups_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vdups_lane_i32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vdupd_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vdupd_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vdupd_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vdupd_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vdups_lane_f32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vdups_lane_f32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vdups_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vdups_lane_i32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vdupd_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vdupd_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vdups_laneq_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vdups_laneq_i32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vdupd_laneq_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vdupd_laneq_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vdupd_laneq_f64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vdupd_laneq_f64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vdups_laneq_f32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vdups_laneq_f32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vdups_laneq_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vdups_laneq_i32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vdupd_laneq_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vdupd_laneq_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x1_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x1_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x1_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x1_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vdup_n_p64(poly64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai poly64x1_t vdup_n_p64(poly64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vdupq_n_p64(poly64_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai poly64x2_t vdupq_n_p64(poly64_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vdupq_n_f64(float64_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai float64x2_t vdupq_n_f64(float64_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vdup_n_f64(float64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai float64x1_t vdup_n_f64(float64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#else
+#define vext_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 38); \
+  __ret; \
+})
+#else
+#define vextq_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 42); \
+  __ret; \
+})
+#else
+#define vextq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#else
+#define vext_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vfmaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vfmaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float64x2_t __noswap_vfmaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vfma_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vfma_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+__ai float64x1_t __noswap_vfma_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmad_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_lane_f64(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmad_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_lane_f64(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmad_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_lane_f64(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmas_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_lane_f32(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmas_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_lane_f32(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmas_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_lane_f32(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 42); \
+  __ret; \
+})
+#else
+#define vfmaq_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__s2, __p3, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 42); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 41); \
+  __ret; \
+})
+#else
+#define vfmaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, __p3, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 41); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 10); \
+  __ret; \
+})
+#else
+#define vfma_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 10); \
+  __ret; \
+})
+#define __noswap_vfma_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 9); \
+  __ret; \
+})
+#else
+#define vfma_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, __p3, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfma_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 9); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmad_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_laneq_f64(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmad_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_laneq_f64(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmad_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_laneq_f64(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmas_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_laneq_f32(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmas_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_laneq_f32(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmas_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_laneq_f32(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 42); \
+  __ret; \
+})
+#else
+#define vfmaq_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 42); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 41); \
+  __ret; \
+})
+#else
+#define vfmaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 41); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 10); \
+  __ret; \
+})
+#else
+#define vfma_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__rev2, __p3, 10); \
+  __ret; \
+})
+#define __noswap_vfma_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 9); \
+  __ret; \
+})
+#else
+#define vfma_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_laneq_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x16_t)__rev2, __p3, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfma_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 9); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vfmaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __ret;
+  __ret = vfmaq_f64(__p0, __p1, (float64x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai float64x2_t vfmaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __noswap_vfmaq_f64(__rev0, __rev1, (float64x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __ret;
+  __ret = vfmaq_f32(__p0, __p1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vfmaq_f32(__rev0, __rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __ret;
+  __ret = vfma_f32(__p0, __p1, (float32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __noswap_vfma_f32(__rev0, __rev1, (float32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vfmsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = vfmaq_f64(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float64x2_t vfmsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = __noswap_vfmaq_f64(__rev0, -__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = vfma_f64(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __noswap_vfma_f64(__p0, -__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsd_lane_f64(__p0_88, __p1_88, __p2_88, __p3_88) __extension__ ({ \
+  float64_t __s0_88 = __p0_88; \
+  float64_t __s1_88 = __p1_88; \
+  float64x1_t __s2_88 = __p2_88; \
+  float64_t __ret_88; \
+  __ret_88 = vfmad_lane_f64(__s0_88, -__s1_88, __s2_88, __p3_88); \
+  __ret_88; \
+})
+#else
+#define vfmsd_lane_f64(__p0_89, __p1_89, __p2_89, __p3_89) __extension__ ({ \
+  float64_t __s0_89 = __p0_89; \
+  float64_t __s1_89 = __p1_89; \
+  float64x1_t __s2_89 = __p2_89; \
+  float64_t __ret_89; \
+  __ret_89 = __noswap_vfmad_lane_f64(__s0_89, -__s1_89, __s2_89, __p3_89); \
+  __ret_89; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmss_lane_f32(__p0_90, __p1_90, __p2_90, __p3_90) __extension__ ({ \
+  float32_t __s0_90 = __p0_90; \
+  float32_t __s1_90 = __p1_90; \
+  float32x2_t __s2_90 = __p2_90; \
+  float32_t __ret_90; \
+  __ret_90 = vfmas_lane_f32(__s0_90, -__s1_90, __s2_90, __p3_90); \
+  __ret_90; \
+})
+#else
+#define vfmss_lane_f32(__p0_91, __p1_91, __p2_91, __p3_91) __extension__ ({ \
+  float32_t __s0_91 = __p0_91; \
+  float32_t __s1_91 = __p1_91; \
+  float32x2_t __s2_91 = __p2_91; \
+  float32x2_t __rev2_91;  __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 1, 0); \
+  float32_t __ret_91; \
+  __ret_91 = __noswap_vfmas_lane_f32(__s0_91, -__s1_91, __rev2_91, __p3_91); \
+  __ret_91; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_lane_f64(__p0_92, __p1_92, __p2_92, __p3_92) __extension__ ({ \
+  float64x2_t __s0_92 = __p0_92; \
+  float64x2_t __s1_92 = __p1_92; \
+  float64x1_t __s2_92 = __p2_92; \
+  float64x2_t __ret_92; \
+  __ret_92 = vfmaq_lane_f64(__s0_92, -__s1_92, __s2_92, __p3_92); \
+  __ret_92; \
+})
+#else
+#define vfmsq_lane_f64(__p0_93, __p1_93, __p2_93, __p3_93) __extension__ ({ \
+  float64x2_t __s0_93 = __p0_93; \
+  float64x2_t __s1_93 = __p1_93; \
+  float64x1_t __s2_93 = __p2_93; \
+  float64x2_t __rev0_93;  __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 1, 0); \
+  float64x2_t __rev1_93;  __rev1_93 = __builtin_shufflevector(__s1_93, __s1_93, 1, 0); \
+  float64x2_t __ret_93; \
+  __ret_93 = __noswap_vfmaq_lane_f64(__rev0_93, -__rev1_93, __s2_93, __p3_93); \
+  __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 1, 0); \
+  __ret_93; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_lane_f32(__p0_94, __p1_94, __p2_94, __p3_94) __extension__ ({ \
+  float32x4_t __s0_94 = __p0_94; \
+  float32x4_t __s1_94 = __p1_94; \
+  float32x2_t __s2_94 = __p2_94; \
+  float32x4_t __ret_94; \
+  __ret_94 = vfmaq_lane_f32(__s0_94, -__s1_94, __s2_94, __p3_94); \
+  __ret_94; \
+})
+#else
+#define vfmsq_lane_f32(__p0_95, __p1_95, __p2_95, __p3_95) __extension__ ({ \
+  float32x4_t __s0_95 = __p0_95; \
+  float32x4_t __s1_95 = __p1_95; \
+  float32x2_t __s2_95 = __p2_95; \
+  float32x4_t __rev0_95;  __rev0_95 = __builtin_shufflevector(__s0_95, __s0_95, 3, 2, 1, 0); \
+  float32x4_t __rev1_95;  __rev1_95 = __builtin_shufflevector(__s1_95, __s1_95, 3, 2, 1, 0); \
+  float32x2_t __rev2_95;  __rev2_95 = __builtin_shufflevector(__s2_95, __s2_95, 1, 0); \
+  float32x4_t __ret_95; \
+  __ret_95 = __noswap_vfmaq_lane_f32(__rev0_95, -__rev1_95, __rev2_95, __p3_95); \
+  __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 3, 2, 1, 0); \
+  __ret_95; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_lane_f64(__p0_96, __p1_96, __p2_96, __p3_96) __extension__ ({ \
+  float64x1_t __s0_96 = __p0_96; \
+  float64x1_t __s1_96 = __p1_96; \
+  float64x1_t __s2_96 = __p2_96; \
+  float64x1_t __ret_96; \
+  __ret_96 = vfma_lane_f64(__s0_96, -__s1_96, __s2_96, __p3_96); \
+  __ret_96; \
+})
+#else
+#define vfms_lane_f64(__p0_97, __p1_97, __p2_97, __p3_97) __extension__ ({ \
+  float64x1_t __s0_97 = __p0_97; \
+  float64x1_t __s1_97 = __p1_97; \
+  float64x1_t __s2_97 = __p2_97; \
+  float64x1_t __ret_97; \
+  __ret_97 = __noswap_vfma_lane_f64(__s0_97, -__s1_97, __s2_97, __p3_97); \
+  __ret_97; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_lane_f32(__p0_98, __p1_98, __p2_98, __p3_98) __extension__ ({ \
+  float32x2_t __s0_98 = __p0_98; \
+  float32x2_t __s1_98 = __p1_98; \
+  float32x2_t __s2_98 = __p2_98; \
+  float32x2_t __ret_98; \
+  __ret_98 = vfma_lane_f32(__s0_98, -__s1_98, __s2_98, __p3_98); \
+  __ret_98; \
+})
+#else
+#define vfms_lane_f32(__p0_99, __p1_99, __p2_99, __p3_99) __extension__ ({ \
+  float32x2_t __s0_99 = __p0_99; \
+  float32x2_t __s1_99 = __p1_99; \
+  float32x2_t __s2_99 = __p2_99; \
+  float32x2_t __rev0_99;  __rev0_99 = __builtin_shufflevector(__s0_99, __s0_99, 1, 0); \
+  float32x2_t __rev1_99;  __rev1_99 = __builtin_shufflevector(__s1_99, __s1_99, 1, 0); \
+  float32x2_t __rev2_99;  __rev2_99 = __builtin_shufflevector(__s2_99, __s2_99, 1, 0); \
+  float32x2_t __ret_99; \
+  __ret_99 = __noswap_vfma_lane_f32(__rev0_99, -__rev1_99, __rev2_99, __p3_99); \
+  __ret_99 = __builtin_shufflevector(__ret_99, __ret_99, 1, 0); \
+  __ret_99; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsd_laneq_f64(__p0_100, __p1_100, __p2_100, __p3_100) __extension__ ({ \
+  float64_t __s0_100 = __p0_100; \
+  float64_t __s1_100 = __p1_100; \
+  float64x2_t __s2_100 = __p2_100; \
+  float64_t __ret_100; \
+  __ret_100 = vfmad_laneq_f64(__s0_100, -__s1_100, __s2_100, __p3_100); \
+  __ret_100; \
+})
+#else
+#define vfmsd_laneq_f64(__p0_101, __p1_101, __p2_101, __p3_101) __extension__ ({ \
+  float64_t __s0_101 = __p0_101; \
+  float64_t __s1_101 = __p1_101; \
+  float64x2_t __s2_101 = __p2_101; \
+  float64x2_t __rev2_101;  __rev2_101 = __builtin_shufflevector(__s2_101, __s2_101, 1, 0); \
+  float64_t __ret_101; \
+  __ret_101 = __noswap_vfmad_laneq_f64(__s0_101, -__s1_101, __rev2_101, __p3_101); \
+  __ret_101; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmss_laneq_f32(__p0_102, __p1_102, __p2_102, __p3_102) __extension__ ({ \
+  float32_t __s0_102 = __p0_102; \
+  float32_t __s1_102 = __p1_102; \
+  float32x4_t __s2_102 = __p2_102; \
+  float32_t __ret_102; \
+  __ret_102 = vfmas_laneq_f32(__s0_102, -__s1_102, __s2_102, __p3_102); \
+  __ret_102; \
+})
+#else
+#define vfmss_laneq_f32(__p0_103, __p1_103, __p2_103, __p3_103) __extension__ ({ \
+  float32_t __s0_103 = __p0_103; \
+  float32_t __s1_103 = __p1_103; \
+  float32x4_t __s2_103 = __p2_103; \
+  float32x4_t __rev2_103;  __rev2_103 = __builtin_shufflevector(__s2_103, __s2_103, 3, 2, 1, 0); \
+  float32_t __ret_103; \
+  __ret_103 = __noswap_vfmas_laneq_f32(__s0_103, -__s1_103, __rev2_103, __p3_103); \
+  __ret_103; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_laneq_f64(__p0_104, __p1_104, __p2_104, __p3_104) __extension__ ({ \
+  float64x2_t __s0_104 = __p0_104; \
+  float64x2_t __s1_104 = __p1_104; \
+  float64x2_t __s2_104 = __p2_104; \
+  float64x2_t __ret_104; \
+  __ret_104 = vfmaq_laneq_f64(__s0_104, -__s1_104, __s2_104, __p3_104); \
+  __ret_104; \
+})
+#else
+#define vfmsq_laneq_f64(__p0_105, __p1_105, __p2_105, __p3_105) __extension__ ({ \
+  float64x2_t __s0_105 = __p0_105; \
+  float64x2_t __s1_105 = __p1_105; \
+  float64x2_t __s2_105 = __p2_105; \
+  float64x2_t __rev0_105;  __rev0_105 = __builtin_shufflevector(__s0_105, __s0_105, 1, 0); \
+  float64x2_t __rev1_105;  __rev1_105 = __builtin_shufflevector(__s1_105, __s1_105, 1, 0); \
+  float64x2_t __rev2_105;  __rev2_105 = __builtin_shufflevector(__s2_105, __s2_105, 1, 0); \
+  float64x2_t __ret_105; \
+  __ret_105 = __noswap_vfmaq_laneq_f64(__rev0_105, -__rev1_105, __rev2_105, __p3_105); \
+  __ret_105 = __builtin_shufflevector(__ret_105, __ret_105, 1, 0); \
+  __ret_105; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_laneq_f32(__p0_106, __p1_106, __p2_106, __p3_106) __extension__ ({ \
+  float32x4_t __s0_106 = __p0_106; \
+  float32x4_t __s1_106 = __p1_106; \
+  float32x4_t __s2_106 = __p2_106; \
+  float32x4_t __ret_106; \
+  __ret_106 = vfmaq_laneq_f32(__s0_106, -__s1_106, __s2_106, __p3_106); \
+  __ret_106; \
+})
+#else
+#define vfmsq_laneq_f32(__p0_107, __p1_107, __p2_107, __p3_107) __extension__ ({ \
+  float32x4_t __s0_107 = __p0_107; \
+  float32x4_t __s1_107 = __p1_107; \
+  float32x4_t __s2_107 = __p2_107; \
+  float32x4_t __rev0_107;  __rev0_107 = __builtin_shufflevector(__s0_107, __s0_107, 3, 2, 1, 0); \
+  float32x4_t __rev1_107;  __rev1_107 = __builtin_shufflevector(__s1_107, __s1_107, 3, 2, 1, 0); \
+  float32x4_t __rev2_107;  __rev2_107 = __builtin_shufflevector(__s2_107, __s2_107, 3, 2, 1, 0); \
+  float32x4_t __ret_107; \
+  __ret_107 = __noswap_vfmaq_laneq_f32(__rev0_107, -__rev1_107, __rev2_107, __p3_107); \
+  __ret_107 = __builtin_shufflevector(__ret_107, __ret_107, 3, 2, 1, 0); \
+  __ret_107; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_laneq_f64(__p0_108, __p1_108, __p2_108, __p3_108) __extension__ ({ \
+  float64x1_t __s0_108 = __p0_108; \
+  float64x1_t __s1_108 = __p1_108; \
+  float64x2_t __s2_108 = __p2_108; \
+  float64x1_t __ret_108; \
+  __ret_108 = vfma_laneq_f64(__s0_108, -__s1_108, __s2_108, __p3_108); \
+  __ret_108; \
+})
+#else
+#define vfms_laneq_f64(__p0_109, __p1_109, __p2_109, __p3_109) __extension__ ({ \
+  float64x1_t __s0_109 = __p0_109; \
+  float64x1_t __s1_109 = __p1_109; \
+  float64x2_t __s2_109 = __p2_109; \
+  float64x2_t __rev2_109;  __rev2_109 = __builtin_shufflevector(__s2_109, __s2_109, 1, 0); \
+  float64x1_t __ret_109; \
+  __ret_109 = __noswap_vfma_laneq_f64(__s0_109, -__s1_109, __rev2_109, __p3_109); \
+  __ret_109; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_laneq_f32(__p0_110, __p1_110, __p2_110, __p3_110) __extension__ ({ \
+  float32x2_t __s0_110 = __p0_110; \
+  float32x2_t __s1_110 = __p1_110; \
+  float32x4_t __s2_110 = __p2_110; \
+  float32x2_t __ret_110; \
+  __ret_110 = vfma_laneq_f32(__s0_110, -__s1_110, __s2_110, __p3_110); \
+  __ret_110; \
+})
+#else
+#define vfms_laneq_f32(__p0_111, __p1_111, __p2_111, __p3_111) __extension__ ({ \
+  float32x2_t __s0_111 = __p0_111; \
+  float32x2_t __s1_111 = __p1_111; \
+  float32x4_t __s2_111 = __p2_111; \
+  float32x2_t __rev0_111;  __rev0_111 = __builtin_shufflevector(__s0_111, __s0_111, 1, 0); \
+  float32x2_t __rev1_111;  __rev1_111 = __builtin_shufflevector(__s1_111, __s1_111, 1, 0); \
+  float32x4_t __rev2_111;  __rev2_111 = __builtin_shufflevector(__s2_111, __s2_111, 3, 2, 1, 0); \
+  float32x2_t __ret_111; \
+  __ret_111 = __noswap_vfma_laneq_f32(__rev0_111, -__rev1_111, __rev2_111, __p3_111); \
+  __ret_111 = __builtin_shufflevector(__ret_111, __ret_111, 1, 0); \
+  __ret_111; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __ret;
+  __ret = vfmaq_f64(__p0, -__p1, (float64x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __noswap_vfmaq_f64(__rev0, -__rev1, (float64x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vfmsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __ret;
+  __ret = vfmaq_f32(__p0, -__p1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x4_t vfmsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vfmaq_f32(__rev0, -__rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vfms_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __ret;
+  __ret = vfma_f32(__p0, -__p1, (float32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x2_t vfms_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __noswap_vfma_f32(__rev0, -__rev1, (float32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vget_high_p64(poly64x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#else
+__ai poly64x1_t vget_high_p64(poly64x2_t __p0) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
+  return __ret;
+}
+__ai poly64x1_t __noswap_vget_high_p64(poly64x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vget_high_f64(float64x2_t __p0) {
+  float64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#else
+__ai float64x1_t vget_high_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vgetq_lane_f64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vgetq_lane_f64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vgetq_lane_f64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vget_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vget_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vget_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vget_low_p64(poly64x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vget_low_p64(poly64x2_t __p0) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vget_low_f64(float64x2_t __p0) {
+  float64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0);
+  return __ret;
+}
+#else
+__ai float64x1_t vget_low_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p64(__p0) __extension__ ({ \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_v(__p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_p64(__p0) __extension__ ({ \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_v(__p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p64(__p0) __extension__ ({ \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_v(__p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_p64(__p0) __extension__ ({ \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_v(__p0, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f64(__p0) __extension__ ({ \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_v(__p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_f64(__p0) __extension__ ({ \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_v(__p0, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f64(__p0) __extension__ ({ \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_v(__p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_f64(__p0) __extension__ ({ \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_v(__p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_p64(__p0) __extension__ ({ \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_dup_v(__p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_dup_p64(__p0) __extension__ ({ \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_dup_v(__p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_p64(__p0) __extension__ ({ \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_dup_v(__p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_dup_p64(__p0) __extension__ ({ \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_dup_v(__p0, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_f64(__p0) __extension__ ({ \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_dup_v(__p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_dup_f64(__p0) __extension__ ({ \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_dup_v(__p0, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_f64(__p0) __extension__ ({ \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_dup_v(__p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_dup_f64(__p0) __extension__ ({ \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_dup_v(__p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#else
+#define vld1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 38); \
+  __ret; \
+})
+#else
+#define vld1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 42); \
+  __ret; \
+})
+#else
+#define vld1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#else
+#define vld1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8_x2(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8_x2(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p64_x2(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_p64_x2(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16_x2(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16_x2(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8_x2(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8_x2(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p64_x2(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_p64_x2(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16_x2(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16_x2(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8_x2(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8_x2(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32_x2(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32_x2(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64_x2(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64_x2(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16_x2(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16_x2(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8_x2(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8_x2(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f64_x2(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_f64_x2(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32_x2(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32_x2(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16_x2(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16_x2(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32_x2(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32_x2(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64_x2(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64_x2(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16_x2(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16_x2(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8_x2(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8_x2(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32_x2(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32_x2(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64_x2(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64_x2(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16_x2(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16_x2(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8_x2(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8_x2(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f64_x2(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_f64_x2(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32_x2(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32_x2(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16_x2(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16_x2(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32_x2(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32_x2(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64_x2(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64_x2(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16_x2(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16_x2(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8_x3(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8_x3(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p64_x3(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_p64_x3(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16_x3(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16_x3(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8_x3(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8_x3(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p64_x3(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_p64_x3(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16_x3(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16_x3(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8_x3(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8_x3(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32_x3(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32_x3(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64_x3(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64_x3(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16_x3(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16_x3(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8_x3(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8_x3(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f64_x3(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_f64_x3(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32_x3(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32_x3(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16_x3(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16_x3(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32_x3(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32_x3(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64_x3(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64_x3(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16_x3(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16_x3(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8_x3(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8_x3(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32_x3(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32_x3(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64_x3(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64_x3(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16_x3(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16_x3(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8_x3(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8_x3(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f64_x3(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_f64_x3(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32_x3(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32_x3(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16_x3(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16_x3(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32_x3(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32_x3(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64_x3(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64_x3(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16_x3(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16_x3(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8_x4(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8_x4(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p64_x4(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_p64_x4(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16_x4(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16_x4(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8_x4(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8_x4(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p64_x4(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_p64_x4(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16_x4(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16_x4(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8_x4(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8_x4(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32_x4(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32_x4(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64_x4(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64_x4(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16_x4(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16_x4(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8_x4(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8_x4(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f64_x4(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_f64_x4(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32_x4(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32_x4(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16_x4(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16_x4(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32_x4(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32_x4(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64_x4(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64_x4(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16_x4(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16_x4(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8_x4(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8_x4(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32_x4(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32_x4(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64_x4(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64_x4(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16_x4(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16_x4(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8_x4(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8_x4(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f64_x4(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_f64_x4(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32_x4(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32_x4(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16_x4(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16_x4(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32_x4(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32_x4(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64_x4(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64_x4(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16_x4(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16_x4(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_p64(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld2_p64(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_p64(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld2q_p64(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld2q_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_f64(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld2q_f64(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld2q_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_f64(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld2_f64(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_p64(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld2_dup_p64(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld2q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld2q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld2q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_f64(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld2q_dup_f64(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld2q_dup_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld2q_dup_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_f64(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld2_dup_f64(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
+  __ret; \
+})
+#else
+#define vld2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 36); \
+  __ret; \
+})
+#else
+#define vld2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 38); \
+  __ret; \
+})
+#else
+#define vld2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 48); \
+  __ret; \
+})
+#else
+#define vld2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 51); \
+  __ret; \
+})
+#else
+#define vld2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 32); \
+  __ret; \
+})
+#else
+#define vld2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 42); \
+  __ret; \
+})
+#else
+#define vld2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 35); \
+  __ret; \
+})
+#else
+#define vld2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
+  __ret; \
+})
+#else
+#define vld2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 10); \
+  __ret; \
+})
+#else
+#define vld2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 3); \
+  __ret; \
+})
+#else
+#define vld2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_p64(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld3_p64(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_p64(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld3q_p64(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld3q_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_f64(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld3q_f64(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld3q_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_f64(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld3_f64(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_p64(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld3_dup_p64(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld3q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld3q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld3q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_f64(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld3q_dup_f64(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld3q_dup_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld3q_dup_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_f64(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld3_dup_f64(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
+  __ret; \
+})
+#else
+#define vld3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 36); \
+  __ret; \
+})
+#else
+#define vld3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 38); \
+  __ret; \
+})
+#else
+#define vld3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 48); \
+  __ret; \
+})
+#else
+#define vld3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 51); \
+  __ret; \
+})
+#else
+#define vld3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 32); \
+  __ret; \
+})
+#else
+#define vld3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 42); \
+  __ret; \
+})
+#else
+#define vld3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 35); \
+  __ret; \
+})
+#else
+#define vld3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
+  __ret; \
+})
+#else
+#define vld3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 10); \
+  __ret; \
+})
+#else
+#define vld3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 3); \
+  __ret; \
+})
+#else
+#define vld3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_p64(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld4_p64(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_p64(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld4q_p64(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld4q_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_f64(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld4q_f64(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld4q_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_f64(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld4_f64(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_p64(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld4_dup_p64(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld4q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld4q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld4q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_f64(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld4q_dup_f64(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld4q_dup_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld4q_dup_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_f64(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld4_dup_f64(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
+  __ret; \
+})
+#else
+#define vld4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 36); \
+  __ret; \
+})
+#else
+#define vld4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 38); \
+  __ret; \
+})
+#else
+#define vld4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 48); \
+  __ret; \
+})
+#else
+#define vld4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 51); \
+  __ret; \
+})
+#else
+#define vld4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 32); \
+  __ret; \
+})
+#else
+#define vld4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 42); \
+  __ret; \
+})
+#else
+#define vld4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 35); \
+  __ret; \
+})
+#else
+#define vld4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
+  __ret; \
+})
+#else
+#define vld4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 10); \
+  __ret; \
+})
+#else
+#define vld4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 3); \
+  __ret; \
+})
+#else
+#define vld4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vldrq_p128(__p0) __extension__ ({ \
+  poly128_t __ret; \
+  __ret = (poly128_t) __builtin_neon_vldrq_p128(__p0); \
+  __ret; \
+})
+#else
+#define vldrq_p128(__p0) __extension__ ({ \
+  poly128_t __ret; \
+  __ret = (poly128_t) __builtin_neon_vldrq_p128(__p0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmax_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vmax_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vmaxnmvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmaxnmvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vmaxnmvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmaxnmvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmaxnmvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxnmvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vmaxnmvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxnmvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmaxnmv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxnmv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vmaxnmv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxnmv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vmaxvq_u8(uint8x16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vmaxvq_u8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vmaxvq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vmaxvq_u8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vmaxvq_u32(uint32x4_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vmaxvq_u32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vmaxvq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vmaxvq_u32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vmaxvq_u16(uint16x8_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vmaxvq_u16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vmaxvq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vmaxvq_u16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vmaxvq_s8(int8x16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vmaxvq_s8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vmaxvq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vmaxvq_s8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vmaxvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmaxvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vmaxvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmaxvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmaxvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vmaxvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vmaxvq_s32(int32x4_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vmaxvq_s32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vmaxvq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vmaxvq_s32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vmaxvq_s16(int16x8_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vmaxvq_s16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vmaxvq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vmaxvq_s16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vmaxv_u8(uint8x8_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vmaxv_u8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vmaxv_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vmaxv_u8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vmaxv_u32(uint32x2_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vmaxv_u32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vmaxv_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vmaxv_u32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vmaxv_u16(uint16x4_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vmaxv_u16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vmaxv_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vmaxv_u16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vmaxv_s8(int8x8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vmaxv_s8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vmaxv_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vmaxv_s8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmaxv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vmaxv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vmaxv_s32(int32x2_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vmaxv_s32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vmaxv_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vmaxv_s32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vmaxv_s16(int16x4_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vmaxv_s16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vmaxv_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vmaxv_s16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vminq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vminq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmin_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vmin_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vminnmvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vminnmvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vminnmvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vminnmvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vminnmvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminnmvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vminnmvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminnmvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vminnmv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminnmv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vminnmv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminnmv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vminvq_u8(uint8x16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vminvq_u8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vminvq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vminvq_u8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vminvq_u32(uint32x4_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vminvq_u32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vminvq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vminvq_u32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vminvq_u16(uint16x8_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vminvq_u16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vminvq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vminvq_u16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vminvq_s8(int8x16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vminvq_s8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vminvq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vminvq_s8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vminvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vminvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vminvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vminvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vminvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vminvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vminvq_s32(int32x4_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vminvq_s32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vminvq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vminvq_s32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vminvq_s16(int16x8_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vminvq_s16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vminvq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vminvq_s16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vminv_u8(uint8x8_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vminv_u8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vminv_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vminv_u8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vminv_u32(uint32x2_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vminv_u32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vminv_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vminv_u32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vminv_u16(uint16x4_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vminv_u16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vminv_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vminv_u16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vminv_s8(int8x8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vminv_s8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vminv_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vminv_s8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vminv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vminv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vminv_s32(int32x2_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vminv_s32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vminv_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vminv_s32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vminv_s16(int16x4_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vminv_s16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vminv_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vminv_s16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float64x2_t vmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float64x1_t vmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x8_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __ret;
+  __ret = __p0 + __p1 * (float64x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 + __rev1 * (float64x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 + vmull_u32(vget_high_u32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + vmull_u16(vget_high_u16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 + vmull_s32(vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + vmull_s16(vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 + vmull_u32(vget_high_u32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + vmull_u16(vget_high_u16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 + vmull_s32(vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + vmull_s16(vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 + vmull_u32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + vmull_u16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 + vmull_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + vmull_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmlsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float64x2_t vmlsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmls_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float64x1_t vmls_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x8_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __ret;
+  __ret = __p0 - __p1 * (float64x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 - __rev1 * (float64x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 - vmull_u32(vget_high_u32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - vmull_u16(vget_high_u16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 - vmull_s32(vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - vmull_s16(vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 - vmull_u32(vget_high_u32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - vmull_u16(vget_high_u16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 - vmull_s32(vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - vmull_s16(vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 - vmull_u32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - vmull_u16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 - vmull_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - vmull_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vmov_n_p64(poly64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai poly64x1_t vmov_n_p64(poly64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vmovq_n_p64(poly64_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai poly64x2_t vmovq_n_p64(poly64_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmovq_n_f64(float64_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai float64x2_t vmovq_n_f64(float64_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmov_n_f64(float64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai float64x1_t vmov_n_f64(float64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_112) {
+  uint16x8_t __ret_112;
+  uint8x8_t __a1_112 = vget_high_u8(__p0_112);
+  __ret_112 = (uint16x8_t)(vshll_n_u8(__a1_112, 0));
+  return __ret_112;
+}
+#else
+__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_113) {
+  uint8x16_t __rev0_113;  __rev0_113 = __builtin_shufflevector(__p0_113, __p0_113, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret_113;
+  uint8x8_t __a1_113 = __noswap_vget_high_u8(__rev0_113);
+  __ret_113 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_113, 0));
+  __ret_113 = __builtin_shufflevector(__ret_113, __ret_113, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret_113;
+}
+__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_114) {
+  uint16x8_t __ret_114;
+  uint8x8_t __a1_114 = __noswap_vget_high_u8(__p0_114);
+  __ret_114 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_114, 0));
+  return __ret_114;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_115) {
+  uint64x2_t __ret_115;
+  uint32x2_t __a1_115 = vget_high_u32(__p0_115);
+  __ret_115 = (uint64x2_t)(vshll_n_u32(__a1_115, 0));
+  return __ret_115;
+}
+#else
+__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_116) {
+  uint32x4_t __rev0_116;  __rev0_116 = __builtin_shufflevector(__p0_116, __p0_116, 3, 2, 1, 0);
+  uint64x2_t __ret_116;
+  uint32x2_t __a1_116 = __noswap_vget_high_u32(__rev0_116);
+  __ret_116 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_116, 0));
+  __ret_116 = __builtin_shufflevector(__ret_116, __ret_116, 1, 0);
+  return __ret_116;
+}
+__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_117) {
+  uint64x2_t __ret_117;
+  uint32x2_t __a1_117 = __noswap_vget_high_u32(__p0_117);
+  __ret_117 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_117, 0));
+  return __ret_117;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_118) {
+  uint32x4_t __ret_118;
+  uint16x4_t __a1_118 = vget_high_u16(__p0_118);
+  __ret_118 = (uint32x4_t)(vshll_n_u16(__a1_118, 0));
+  return __ret_118;
+}
+#else
+__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_119) {
+  uint16x8_t __rev0_119;  __rev0_119 = __builtin_shufflevector(__p0_119, __p0_119, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret_119;
+  uint16x4_t __a1_119 = __noswap_vget_high_u16(__rev0_119);
+  __ret_119 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_119, 0));
+  __ret_119 = __builtin_shufflevector(__ret_119, __ret_119, 3, 2, 1, 0);
+  return __ret_119;
+}
+__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_120) {
+  uint32x4_t __ret_120;
+  uint16x4_t __a1_120 = __noswap_vget_high_u16(__p0_120);
+  __ret_120 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_120, 0));
+  return __ret_120;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmovl_high_s8(int8x16_t __p0_121) {
+  int16x8_t __ret_121;
+  int8x8_t __a1_121 = vget_high_s8(__p0_121);
+  __ret_121 = (int16x8_t)(vshll_n_s8(__a1_121, 0));
+  return __ret_121;
+}
+#else
+__ai int16x8_t vmovl_high_s8(int8x16_t __p0_122) {
+  int8x16_t __rev0_122;  __rev0_122 = __builtin_shufflevector(__p0_122, __p0_122, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret_122;
+  int8x8_t __a1_122 = __noswap_vget_high_s8(__rev0_122);
+  __ret_122 = (int16x8_t)(__noswap_vshll_n_s8(__a1_122, 0));
+  __ret_122 = __builtin_shufflevector(__ret_122, __ret_122, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret_122;
+}
+__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_123) {
+  int16x8_t __ret_123;
+  int8x8_t __a1_123 = __noswap_vget_high_s8(__p0_123);
+  __ret_123 = (int16x8_t)(__noswap_vshll_n_s8(__a1_123, 0));
+  return __ret_123;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmovl_high_s32(int32x4_t __p0_124) {
+  int64x2_t __ret_124;
+  int32x2_t __a1_124 = vget_high_s32(__p0_124);
+  __ret_124 = (int64x2_t)(vshll_n_s32(__a1_124, 0));
+  return __ret_124;
+}
+#else
+__ai int64x2_t vmovl_high_s32(int32x4_t __p0_125) {
+  int32x4_t __rev0_125;  __rev0_125 = __builtin_shufflevector(__p0_125, __p0_125, 3, 2, 1, 0);
+  int64x2_t __ret_125;
+  int32x2_t __a1_125 = __noswap_vget_high_s32(__rev0_125);
+  __ret_125 = (int64x2_t)(__noswap_vshll_n_s32(__a1_125, 0));
+  __ret_125 = __builtin_shufflevector(__ret_125, __ret_125, 1, 0);
+  return __ret_125;
+}
+__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_126) {
+  int64x2_t __ret_126;
+  int32x2_t __a1_126 = __noswap_vget_high_s32(__p0_126);
+  __ret_126 = (int64x2_t)(__noswap_vshll_n_s32(__a1_126, 0));
+  return __ret_126;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmovl_high_s16(int16x8_t __p0_127) {
+  int32x4_t __ret_127;
+  int16x4_t __a1_127 = vget_high_s16(__p0_127);
+  __ret_127 = (int32x4_t)(vshll_n_s16(__a1_127, 0));
+  return __ret_127;
+}
+#else
+__ai int32x4_t vmovl_high_s16(int16x8_t __p0_128) {
+  int16x8_t __rev0_128;  __rev0_128 = __builtin_shufflevector(__p0_128, __p0_128, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret_128;
+  int16x4_t __a1_128 = __noswap_vget_high_s16(__rev0_128);
+  __ret_128 = (int32x4_t)(__noswap_vshll_n_s16(__a1_128, 0));
+  __ret_128 = __builtin_shufflevector(__ret_128, __ret_128, 3, 2, 1, 0);
+  return __ret_128;
+}
+__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_129) {
+  int32x4_t __ret_129;
+  int16x4_t __a1_129 = __noswap_vget_high_s16(__p0_129);
+  __ret_129 = (int32x4_t)(__noswap_vshll_n_s16(__a1_129, 0));
+  return __ret_129;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vmovn_u32(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vmovn_u32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vmovn_u64(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vmovn_u64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vmovn_u16(__p1));
+  return __ret;
+}
+#else
+__ai uint8x16_t vmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vmovn_u16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vmovn_s32(__p1));
+  return __ret;
+}
+#else
+__ai int16x8_t vmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vmovn_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vmovn_s64(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vmovn_s64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vmovn_s16(__p1));
+  return __ret;
+}
+#else
+__ai int8x16_t vmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vmovn_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmulq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float64x2_t vmulq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmuld_lane_f64(__p0_130, __p1_130, __p2_130) __extension__ ({ \
+  float64_t __s0_130 = __p0_130; \
+  float64x1_t __s1_130 = __p1_130; \
+  float64_t __ret_130; \
+  __ret_130 = __s0_130 * vget_lane_f64(__s1_130, __p2_130); \
+  __ret_130; \
+})
+#else
+#define vmuld_lane_f64(__p0_131, __p1_131, __p2_131) __extension__ ({ \
+  float64_t __s0_131 = __p0_131; \
+  float64x1_t __s1_131 = __p1_131; \
+  float64_t __ret_131; \
+  __ret_131 = __s0_131 * __noswap_vget_lane_f64(__s1_131, __p2_131); \
+  __ret_131; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmuls_lane_f32(__p0_132, __p1_132, __p2_132) __extension__ ({ \
+  float32_t __s0_132 = __p0_132; \
+  float32x2_t __s1_132 = __p1_132; \
+  float32_t __ret_132; \
+  __ret_132 = __s0_132 * vget_lane_f32(__s1_132, __p2_132); \
+  __ret_132; \
+})
+#else
+#define vmuls_lane_f32(__p0_133, __p1_133, __p2_133) __extension__ ({ \
+  float32_t __s0_133 = __p0_133; \
+  float32x2_t __s1_133 = __p1_133; \
+  float32x2_t __rev1_133;  __rev1_133 = __builtin_shufflevector(__s1_133, __s1_133, 1, 0); \
+  float32_t __ret_133; \
+  __ret_133 = __s0_133 * __noswap_vget_lane_f32(__rev1_133, __p2_133); \
+  __ret_133; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vmul_lane_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#else
+#define vmul_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vmul_lane_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmuld_laneq_f64(__p0_134, __p1_134, __p2_134) __extension__ ({ \
+  float64_t __s0_134 = __p0_134; \
+  float64x2_t __s1_134 = __p1_134; \
+  float64_t __ret_134; \
+  __ret_134 = __s0_134 * vgetq_lane_f64(__s1_134, __p2_134); \
+  __ret_134; \
+})
+#else
+#define vmuld_laneq_f64(__p0_135, __p1_135, __p2_135) __extension__ ({ \
+  float64_t __s0_135 = __p0_135; \
+  float64x2_t __s1_135 = __p1_135; \
+  float64x2_t __rev1_135;  __rev1_135 = __builtin_shufflevector(__s1_135, __s1_135, 1, 0); \
+  float64_t __ret_135; \
+  __ret_135 = __s0_135 * __noswap_vgetq_lane_f64(__rev1_135, __p2_135); \
+  __ret_135; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmuls_laneq_f32(__p0_136, __p1_136, __p2_136) __extension__ ({ \
+  float32_t __s0_136 = __p0_136; \
+  float32x4_t __s1_136 = __p1_136; \
+  float32_t __ret_136; \
+  __ret_136 = __s0_136 * vgetq_lane_f32(__s1_136, __p2_136); \
+  __ret_136; \
+})
+#else
+#define vmuls_laneq_f32(__p0_137, __p1_137, __p2_137) __extension__ ({ \
+  float32_t __s0_137 = __p0_137; \
+  float32x4_t __s1_137 = __p1_137; \
+  float32x4_t __rev1_137;  __rev1_137 = __builtin_shufflevector(__s1_137, __s1_137, 3, 2, 1, 0); \
+  float32_t __ret_137; \
+  __ret_137 = __s0_137 * __noswap_vgetq_lane_f32(__rev1_137, __p2_137); \
+  __ret_137; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vmul_laneq_v((int8x8_t)__s0, (int8x16_t)__s1, __p2, 10); \
+  __ret; \
+})
+#else
+#define vmul_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vmul_laneq_v((int8x8_t)__s0, (int8x16_t)__rev1, __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmul_n_f64(float64x1_t __p0, float64_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmul_n_f64((int8x8_t)__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64x1_t vmul_n_f64(float64x1_t __p0, float64_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmul_n_f64((int8x8_t)__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmulq_n_f64(float64x2_t __p0, float64_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 * (float64x2_t) {__p1, __p1};
+  return __ret;
+}
+#else
+__ai float64x2_t vmulq_n_f64(float64x2_t __p0, float64_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 * (float64x2_t) {__p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vmull_p64(poly64_t __p0, poly64_t __p1) {
+  poly128_t __ret;
+  __ret = (poly128_t) __builtin_neon_vmull_p64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai poly128_t vmull_p64(poly64_t __p0, poly64_t __p1) {
+  poly128_t __ret;
+  __ret = (poly128_t) __builtin_neon_vmull_p64(__p0, __p1);
+  return __ret;
+}
+__ai poly128_t __noswap_vmull_p64(poly64_t __p0, poly64_t __p1) {
+  poly128_t __ret;
+  __ret = (poly128_t) __builtin_neon_vmull_p64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vmull_high_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly16x8_t __ret;
+  __ret = vmull_p8(vget_high_p8(__p0), vget_high_p8(__p1));
+  return __ret;
+}
+#else
+__ai poly16x8_t vmull_high_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __noswap_vmull_p8(__noswap_vget_high_p8(__rev0), __noswap_vget_high_p8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmull_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmull_u8(vget_high_u8(__p0), vget_high_u8(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vmull_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmull_u8(__noswap_vget_high_u8(__rev0), __noswap_vget_high_u8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmull_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmull_u32(vget_high_u32(__p0), vget_high_u32(__p1));
+  return __ret;
+}
+#else
+__ai uint64x2_t vmull_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0), __noswap_vget_high_u32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmull_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmull_u16(vget_high_u16(__p0), vget_high_u16(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vmull_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0), __noswap_vget_high_u16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmull_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = vmull_s8(vget_high_s8(__p0), vget_high_s8(__p1));
+  return __ret;
+}
+#else
+__ai int16x8_t vmull_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmull_s8(__noswap_vget_high_s8(__rev0), __noswap_vget_high_s8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vmull_s32(vget_high_s32(__p0), vget_high_s32(__p1));
+  return __ret;
+}
+#else
+__ai int64x2_t vmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0), __noswap_vget_high_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vmull_s16(vget_high_s16(__p0), vget_high_s16(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0), __noswap_vget_high_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vmull_high_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly128_t __ret;
+  __ret = vmull_p64((poly64_t)(vget_high_p64(__p0)), (poly64_t)(vget_high_p64(__p1)));
+  return __ret;
+}
+#else
+__ai poly128_t vmull_high_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly128_t __ret;
+  __ret = __noswap_vmull_p64((poly64_t)(__noswap_vget_high_p64(__rev0)), (poly64_t)(__noswap_vget_high_p64(__rev1)));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = vmull_u32(vget_high_u32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = vmull_u16(vget_high_u16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vmull_s32(vget_high_s32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vmull_s16(vget_high_s16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = vmull_u32(vget_high_u32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = vmull_u16(vget_high_u16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vmull_s32(vget_high_s32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vmull_s16(vget_high_s16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmull_high_n_u32(uint32x4_t __p0, uint32_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmull_n_u32(vget_high_u32(__p0), __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmull_high_n_u32(uint32x4_t __p0, uint32_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmull_n_u32(__noswap_vget_high_u32(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmull_high_n_u16(uint16x8_t __p0, uint16_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmull_n_u16(vget_high_u16(__p0), __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmull_high_n_u16(uint16x8_t __p0, uint16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmull_n_u16(__noswap_vget_high_u16(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = vmull_n_s32(vget_high_s32(__p0), __p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmull_n_s32(__noswap_vget_high_s32(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = vmull_n_s16(vget_high_s16(__p0), __p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmull_n_s16(__noswap_vget_high_s16(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = vmull_u32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __noswap_vmull_u32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = vmull_u16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __noswap_vmull_u16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vmull_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vmull_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vmull_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vmull_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmulxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vmulxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmulxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float64x2_t __noswap_vmulxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmulxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vmulxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmulxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float32x4_t __noswap_vmulxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmulx_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vmulx_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmulx_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vmulx_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmulx_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vmulx_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vmulxd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmulxd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64_t vmulxd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmulxd_f64(__p0, __p1);
+  return __ret;
+}
+__ai float64_t __noswap_vmulxd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmulxd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmulxs_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmulxs_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float32_t vmulxs_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmulxs_f32(__p0, __p1);
+  return __ret;
+}
+__ai float32_t __noswap_vmulxs_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmulxs_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxd_lane_f64(__p0_138, __p1_138, __p2_138) __extension__ ({ \
+  float64_t __s0_138 = __p0_138; \
+  float64x1_t __s1_138 = __p1_138; \
+  float64_t __ret_138; \
+  __ret_138 = vmulxd_f64(__s0_138, vget_lane_f64(__s1_138, __p2_138)); \
+  __ret_138; \
+})
+#else
+#define vmulxd_lane_f64(__p0_139, __p1_139, __p2_139) __extension__ ({ \
+  float64_t __s0_139 = __p0_139; \
+  float64x1_t __s1_139 = __p1_139; \
+  float64_t __ret_139; \
+  __ret_139 = __noswap_vmulxd_f64(__s0_139, __noswap_vget_lane_f64(__s1_139, __p2_139)); \
+  __ret_139; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxs_lane_f32(__p0_140, __p1_140, __p2_140) __extension__ ({ \
+  float32_t __s0_140 = __p0_140; \
+  float32x2_t __s1_140 = __p1_140; \
+  float32_t __ret_140; \
+  __ret_140 = vmulxs_f32(__s0_140, vget_lane_f32(__s1_140, __p2_140)); \
+  __ret_140; \
+})
+#else
+#define vmulxs_lane_f32(__p0_141, __p1_141, __p2_141) __extension__ ({ \
+  float32_t __s0_141 = __p0_141; \
+  float32x2_t __s1_141 = __p1_141; \
+  float32x2_t __rev1_141;  __rev1_141 = __builtin_shufflevector(__s1_141, __s1_141, 1, 0); \
+  float32_t __ret_141; \
+  __ret_141 = __noswap_vmulxs_f32(__s0_141, __noswap_vget_lane_f32(__rev1_141, __p2_141)); \
+  __ret_141; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = vmulxq_f64(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __noswap_vmulxq_f64(__rev0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = vmulxq_f32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __noswap_vmulxq_f32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = vmulx_f32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulx_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __noswap_vmulx_f32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxd_laneq_f64(__p0_142, __p1_142, __p2_142) __extension__ ({ \
+  float64_t __s0_142 = __p0_142; \
+  float64x2_t __s1_142 = __p1_142; \
+  float64_t __ret_142; \
+  __ret_142 = vmulxd_f64(__s0_142, vgetq_lane_f64(__s1_142, __p2_142)); \
+  __ret_142; \
+})
+#else
+#define vmulxd_laneq_f64(__p0_143, __p1_143, __p2_143) __extension__ ({ \
+  float64_t __s0_143 = __p0_143; \
+  float64x2_t __s1_143 = __p1_143; \
+  float64x2_t __rev1_143;  __rev1_143 = __builtin_shufflevector(__s1_143, __s1_143, 1, 0); \
+  float64_t __ret_143; \
+  __ret_143 = __noswap_vmulxd_f64(__s0_143, __noswap_vgetq_lane_f64(__rev1_143, __p2_143)); \
+  __ret_143; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxs_laneq_f32(__p0_144, __p1_144, __p2_144) __extension__ ({ \
+  float32_t __s0_144 = __p0_144; \
+  float32x4_t __s1_144 = __p1_144; \
+  float32_t __ret_144; \
+  __ret_144 = vmulxs_f32(__s0_144, vgetq_lane_f32(__s1_144, __p2_144)); \
+  __ret_144; \
+})
+#else
+#define vmulxs_laneq_f32(__p0_145, __p1_145, __p2_145) __extension__ ({ \
+  float32_t __s0_145 = __p0_145; \
+  float32x4_t __s1_145 = __p1_145; \
+  float32x4_t __rev1_145;  __rev1_145 = __builtin_shufflevector(__s1_145, __s1_145, 3, 2, 1, 0); \
+  float32_t __ret_145; \
+  __ret_145 = __noswap_vmulxs_f32(__s0_145, __noswap_vgetq_lane_f32(__rev1_145, __p2_145)); \
+  __ret_145; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = vmulxq_f64(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __noswap_vmulxq_f64(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = vmulxq_f32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __noswap_vmulxq_f32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = vmulx_f32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulx_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __noswap_vmulx_f32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vnegq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float64x2_t vnegq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vnegq_s64(int64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int64x2_t vnegq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vneg_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float64x1_t vneg_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vneg_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int64x1_t vneg_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vnegd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vnegd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vnegd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vnegd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vpaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vpaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vpaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vpaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vpaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vpaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpaddq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpaddq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpaddq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpaddq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vpaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vpaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vpaddd_u64(uint64x2_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vpaddd_u64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vpaddd_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vpaddd_u64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpaddd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpaddd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpaddd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpaddd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vpaddd_s64(int64x2_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vpaddd_s64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int64_t vpaddd_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vpaddd_s64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpadds_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpadds_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpadds_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpadds_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vpmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vpmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vpmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vpmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpmaxqd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpmaxqd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpmaxqd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpmaxqd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpmaxs_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmaxs_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpmaxs_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmaxs_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpmaxnmqd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpmaxnmqd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpmaxnmqd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpmaxnmqd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpmaxnms_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmaxnms_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpmaxnms_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmaxnms_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vpminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vpminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vpminq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vpminq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpminq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpminq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpminq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpminq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpminq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpminq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpminq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpminq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpminqd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpminqd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpminqd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpminqd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpmins_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmins_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpmins_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmins_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpminnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpminnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpminnmqd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpminnmqd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpminnmqd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpminnmqd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpminnms_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpminnms_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpminnms_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpminnms_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqabsq_s64(int64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqabsq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqabs_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqabs_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqabsb_s8(int8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqabsb_s8(__p0);
+  return __ret;
+}
+#else
+__ai int8_t vqabsb_s8(int8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqabsb_s8(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqabss_s32(int32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqabss_s32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vqabss_s32(int32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqabss_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqabsd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqabsd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vqabsd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqabsd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqabsh_s16(int16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqabsh_s16(__p0);
+  return __ret;
+}
+#else
+__ai int16_t vqabsh_s16(int16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqabsh_s16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqaddb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqaddb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vqaddb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqaddb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqadds_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqadds_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vqadds_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqadds_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vqaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqaddd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vqaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqaddd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqaddh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqaddh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vqaddh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqaddh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqaddb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqaddb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vqaddb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqaddb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqadds_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqadds_s32(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqadds_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqaddd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqaddd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqaddh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqaddh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int16_t __noswap_vqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqaddh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqdmlals_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmlals_s32(__p0, __p1, __p2);
+  return __ret;
+}
+#else
+__ai int64_t vqdmlals_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmlals_s32(__p0, __p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqdmlalh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmlalh_s16(__p0, __p1, __p2);
+  return __ret;
+}
+#else
+__ai int32_t vqdmlalh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmlalh_s16(__p0, __p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vqdmlal_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vqdmlal_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlal_s32(__s0, vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlal_s16(__s0, vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlal_s32(__s0, vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlal_s16(__s0, vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = vqdmlal_n_s32(__p0, vget_high_s32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmlal_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = vqdmlal_n_s16(__p0, vget_high_s16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmlal_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlals_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlals_lane_s32(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlals_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlals_lane_s32(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlalh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlalh_lane_s16(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlalh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlalh_lane_s16(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlals_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlals_laneq_s32(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlals_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlals_laneq_s32(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlalh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlalh_laneq_s16(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlalh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlalh_laneq_s16(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlal_s32(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlal_s32(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlal_s16(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlal_s16(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqdmlsls_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmlsls_s32(__p0, __p1, __p2);
+  return __ret;
+}
+#else
+__ai int64_t vqdmlsls_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmlsls_s32(__p0, __p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqdmlslh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmlslh_s16(__p0, __p1, __p2);
+  return __ret;
+}
+#else
+__ai int32_t vqdmlslh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmlslh_s16(__p0, __p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vqdmlsl_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vqdmlsl_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlsl_s32(__s0, vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlsl_s16(__s0, vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlsl_s32(__s0, vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlsl_s16(__s0, vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = vqdmlsl_n_s32(__p0, vget_high_s32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmlsl_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = vqdmlsl_n_s16(__p0, vget_high_s16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmlsl_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlsls_lane_s32(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlsls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlsls_lane_s32(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlslh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlslh_lane_s16(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlslh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlslh_lane_s16(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlsls_laneq_s32(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlsls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlsls_laneq_s32(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlslh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlslh_laneq_s16(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlslh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlslh_laneq_s16(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlsl_s32(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlsl_s32(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlsl_s16(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlsl_s16(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int16_t __noswap_vqdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhs_lane_s32(__p0_146, __p1_146, __p2_146) __extension__ ({ \
+  int32_t __s0_146 = __p0_146; \
+  int32x2_t __s1_146 = __p1_146; \
+  int32_t __ret_146; \
+  __ret_146 = vqdmulhs_s32(__s0_146, vget_lane_s32(__s1_146, __p2_146)); \
+  __ret_146; \
+})
+#else
+#define vqdmulhs_lane_s32(__p0_147, __p1_147, __p2_147) __extension__ ({ \
+  int32_t __s0_147 = __p0_147; \
+  int32x2_t __s1_147 = __p1_147; \
+  int32x2_t __rev1_147;  __rev1_147 = __builtin_shufflevector(__s1_147, __s1_147, 1, 0); \
+  int32_t __ret_147; \
+  __ret_147 = __noswap_vqdmulhs_s32(__s0_147, __noswap_vget_lane_s32(__rev1_147, __p2_147)); \
+  __ret_147; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhh_lane_s16(__p0_148, __p1_148, __p2_148) __extension__ ({ \
+  int16_t __s0_148 = __p0_148; \
+  int16x4_t __s1_148 = __p1_148; \
+  int16_t __ret_148; \
+  __ret_148 = vqdmulhh_s16(__s0_148, vget_lane_s16(__s1_148, __p2_148)); \
+  __ret_148; \
+})
+#else
+#define vqdmulhh_lane_s16(__p0_149, __p1_149, __p2_149) __extension__ ({ \
+  int16_t __s0_149 = __p0_149; \
+  int16x4_t __s1_149 = __p1_149; \
+  int16x4_t __rev1_149;  __rev1_149 = __builtin_shufflevector(__s1_149, __s1_149, 3, 2, 1, 0); \
+  int16_t __ret_149; \
+  __ret_149 = __noswap_vqdmulhh_s16(__s0_149, __noswap_vget_lane_s16(__rev1_149, __p2_149)); \
+  __ret_149; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhs_laneq_s32(__p0_150, __p1_150, __p2_150) __extension__ ({ \
+  int32_t __s0_150 = __p0_150; \
+  int32x4_t __s1_150 = __p1_150; \
+  int32_t __ret_150; \
+  __ret_150 = vqdmulhs_s32(__s0_150, vgetq_lane_s32(__s1_150, __p2_150)); \
+  __ret_150; \
+})
+#else
+#define vqdmulhs_laneq_s32(__p0_151, __p1_151, __p2_151) __extension__ ({ \
+  int32_t __s0_151 = __p0_151; \
+  int32x4_t __s1_151 = __p1_151; \
+  int32x4_t __rev1_151;  __rev1_151 = __builtin_shufflevector(__s1_151, __s1_151, 3, 2, 1, 0); \
+  int32_t __ret_151; \
+  __ret_151 = __noswap_vqdmulhs_s32(__s0_151, __noswap_vgetq_lane_s32(__rev1_151, __p2_151)); \
+  __ret_151; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhh_laneq_s16(__p0_152, __p1_152, __p2_152) __extension__ ({ \
+  int16_t __s0_152 = __p0_152; \
+  int16x8_t __s1_152 = __p1_152; \
+  int16_t __ret_152; \
+  __ret_152 = vqdmulhh_s16(__s0_152, vgetq_lane_s16(__s1_152, __p2_152)); \
+  __ret_152; \
+})
+#else
+#define vqdmulhh_laneq_s16(__p0_153, __p1_153, __p2_153) __extension__ ({ \
+  int16_t __s0_153 = __p0_153; \
+  int16x8_t __s1_153 = __p1_153; \
+  int16x8_t __rev1_153;  __rev1_153 = __builtin_shufflevector(__s1_153, __s1_153, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_153; \
+  __ret_153 = __noswap_vqdmulhh_s16(__s0_153, __noswap_vgetq_lane_s16(__rev1_153, __p2_153)); \
+  __ret_153; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmulhq_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmulhq_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = vqdmulhq_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqdmulhq_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = vqdmulh_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqdmulh_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = vqdmulh_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqdmulh_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqdmulls_s32(int32_t __p0, int32_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmulls_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqdmulls_s32(int32_t __p0, int32_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmulls_s32(__p0, __p1);
+  return __ret;
+}
+__ai int64_t __noswap_vqdmulls_s32(int32_t __p0, int32_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmulls_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqdmullh_s16(int16_t __p0, int16_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmullh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqdmullh_s16(int16_t __p0, int16_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmullh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqdmullh_s16(int16_t __p0, int16_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmullh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vqdmull_s32(vget_high_s32(__p0), vget_high_s32(__p1));
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0), __noswap_vget_high_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vqdmull_s16(vget_high_s16(__p0), vget_high_s16(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0), __noswap_vget_high_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_high_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vqdmull_s32(vget_high_s32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_high_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_high_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmull_s16(vget_high_s16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_high_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_high_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vqdmull_s32(vget_high_s32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_high_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_high_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmull_s16(vget_high_s16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_high_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = vqdmull_n_s32(vget_high_s32(__p0), __p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmull_n_s32(__noswap_vget_high_s32(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = vqdmull_n_s16(vget_high_s16(__p0), __p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmull_n_s16(__noswap_vget_high_s16(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulls_lane_s32(__p0_154, __p1_154, __p2_154) __extension__ ({ \
+  int32_t __s0_154 = __p0_154; \
+  int32x2_t __s1_154 = __p1_154; \
+  int64_t __ret_154; \
+  __ret_154 = vqdmulls_s32(__s0_154, vget_lane_s32(__s1_154, __p2_154)); \
+  __ret_154; \
+})
+#else
+#define vqdmulls_lane_s32(__p0_155, __p1_155, __p2_155) __extension__ ({ \
+  int32_t __s0_155 = __p0_155; \
+  int32x2_t __s1_155 = __p1_155; \
+  int32x2_t __rev1_155;  __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 1, 0); \
+  int64_t __ret_155; \
+  __ret_155 = __noswap_vqdmulls_s32(__s0_155, __noswap_vget_lane_s32(__rev1_155, __p2_155)); \
+  __ret_155; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmullh_lane_s16(__p0_156, __p1_156, __p2_156) __extension__ ({ \
+  int16_t __s0_156 = __p0_156; \
+  int16x4_t __s1_156 = __p1_156; \
+  int32_t __ret_156; \
+  __ret_156 = vqdmullh_s16(__s0_156, vget_lane_s16(__s1_156, __p2_156)); \
+  __ret_156; \
+})
+#else
+#define vqdmullh_lane_s16(__p0_157, __p1_157, __p2_157) __extension__ ({ \
+  int16_t __s0_157 = __p0_157; \
+  int16x4_t __s1_157 = __p1_157; \
+  int16x4_t __rev1_157;  __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 3, 2, 1, 0); \
+  int32_t __ret_157; \
+  __ret_157 = __noswap_vqdmullh_s16(__s0_157, __noswap_vget_lane_s16(__rev1_157, __p2_157)); \
+  __ret_157; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulls_laneq_s32(__p0_158, __p1_158, __p2_158) __extension__ ({ \
+  int32_t __s0_158 = __p0_158; \
+  int32x4_t __s1_158 = __p1_158; \
+  int64_t __ret_158; \
+  __ret_158 = vqdmulls_s32(__s0_158, vgetq_lane_s32(__s1_158, __p2_158)); \
+  __ret_158; \
+})
+#else
+#define vqdmulls_laneq_s32(__p0_159, __p1_159, __p2_159) __extension__ ({ \
+  int32_t __s0_159 = __p0_159; \
+  int32x4_t __s1_159 = __p1_159; \
+  int32x4_t __rev1_159;  __rev1_159 = __builtin_shufflevector(__s1_159, __s1_159, 3, 2, 1, 0); \
+  int64_t __ret_159; \
+  __ret_159 = __noswap_vqdmulls_s32(__s0_159, __noswap_vgetq_lane_s32(__rev1_159, __p2_159)); \
+  __ret_159; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmullh_laneq_s16(__p0_160, __p1_160, __p2_160) __extension__ ({ \
+  int16_t __s0_160 = __p0_160; \
+  int16x8_t __s1_160 = __p1_160; \
+  int32_t __ret_160; \
+  __ret_160 = vqdmullh_s16(__s0_160, vgetq_lane_s16(__s1_160, __p2_160)); \
+  __ret_160; \
+})
+#else
+#define vqdmullh_laneq_s16(__p0_161, __p1_161, __p2_161) __extension__ ({ \
+  int16_t __s0_161 = __p0_161; \
+  int16x8_t __s1_161 = __p1_161; \
+  int16x8_t __rev1_161;  __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32_t __ret_161; \
+  __ret_161 = __noswap_vqdmullh_s16(__s0_161, __noswap_vgetq_lane_s16(__rev1_161, __p2_161)); \
+  __ret_161; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vqdmull_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmull_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmull_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmull_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqmovns_s32(int32_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqmovns_s32(__p0);
+  return __ret;
+}
+#else
+__ai int16_t vqmovns_s32(int32_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqmovns_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqmovnd_s64(int64_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqmovnd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vqmovnd_s64(int64_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqmovnd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqmovnh_s16(int16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqmovnh_s16(__p0);
+  return __ret;
+}
+#else
+__ai int8_t vqmovnh_s16(int16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqmovnh_s16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqmovns_u32(uint32_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqmovns_u32(__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vqmovns_u32(uint32_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqmovns_u32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqmovnd_u64(uint64_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqmovnd_u64(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vqmovnd_u64(uint64_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqmovnd_u64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqmovnh_u16(uint16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqmovnh_u16(__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vqmovnh_u16(uint16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqmovnh_u16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vqmovn_u32(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vqmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vqmovn_u32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vqmovn_u64(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vqmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vqmovn_u64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vqmovn_u16(__p1));
+  return __ret;
+}
+#else
+__ai uint8x16_t vqmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vqmovn_u16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vqmovn_s32(__p1));
+  return __ret;
+}
+#else
+__ai int16x8_t vqmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vqmovn_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vqmovn_s64(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vqmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vqmovn_s64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vqmovn_s16(__p1));
+  return __ret;
+}
+#else
+__ai int8x16_t vqmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vqmovn_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqmovuns_s32(int32_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqmovuns_s32(__p0);
+  return __ret;
+}
+#else
+__ai int16_t vqmovuns_s32(int32_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqmovuns_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqmovund_s64(int64_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqmovund_s64(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vqmovund_s64(int64_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqmovund_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqmovunh_s16(int16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqmovunh_s16(__p0);
+  return __ret;
+}
+#else
+__ai int8_t vqmovunh_s16(int16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqmovunh_s16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16((uint16x4_t)(__p0), vqmovun_s32(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16((uint16x4_t)(__rev0), __noswap_vqmovun_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32((uint32x2_t)(__p0), vqmovun_s64(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32((uint32x2_t)(__rev0), __noswap_vqmovun_s64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8((uint8x8_t)(__p0), vqmovun_s16(__p1));
+  return __ret;
+}
+#else
+__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8((uint8x8_t)(__rev0), __noswap_vqmovun_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqnegq_s64(int64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqnegq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqneg_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqneg_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqnegb_s8(int8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqnegb_s8(__p0);
+  return __ret;
+}
+#else
+__ai int8_t vqnegb_s8(int8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqnegb_s8(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqnegs_s32(int32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqnegs_s32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vqnegs_s32(int32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqnegs_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqnegd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqnegd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vqnegd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqnegd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqnegh_s16(int16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqnegh_s16(__p0);
+  return __ret;
+}
+#else
+__ai int16_t vqnegh_s16(int16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqnegh_s16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqrdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqrdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqrdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqrdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqrdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int16_t __noswap_vqrdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhs_lane_s32(__p0_162, __p1_162, __p2_162) __extension__ ({ \
+  int32_t __s0_162 = __p0_162; \
+  int32x2_t __s1_162 = __p1_162; \
+  int32_t __ret_162; \
+  __ret_162 = vqrdmulhs_s32(__s0_162, vget_lane_s32(__s1_162, __p2_162)); \
+  __ret_162; \
+})
+#else
+#define vqrdmulhs_lane_s32(__p0_163, __p1_163, __p2_163) __extension__ ({ \
+  int32_t __s0_163 = __p0_163; \
+  int32x2_t __s1_163 = __p1_163; \
+  int32x2_t __rev1_163;  __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 1, 0); \
+  int32_t __ret_163; \
+  __ret_163 = __noswap_vqrdmulhs_s32(__s0_163, __noswap_vget_lane_s32(__rev1_163, __p2_163)); \
+  __ret_163; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhh_lane_s16(__p0_164, __p1_164, __p2_164) __extension__ ({ \
+  int16_t __s0_164 = __p0_164; \
+  int16x4_t __s1_164 = __p1_164; \
+  int16_t __ret_164; \
+  __ret_164 = vqrdmulhh_s16(__s0_164, vget_lane_s16(__s1_164, __p2_164)); \
+  __ret_164; \
+})
+#else
+#define vqrdmulhh_lane_s16(__p0_165, __p1_165, __p2_165) __extension__ ({ \
+  int16_t __s0_165 = __p0_165; \
+  int16x4_t __s1_165 = __p1_165; \
+  int16x4_t __rev1_165;  __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 3, 2, 1, 0); \
+  int16_t __ret_165; \
+  __ret_165 = __noswap_vqrdmulhh_s16(__s0_165, __noswap_vget_lane_s16(__rev1_165, __p2_165)); \
+  __ret_165; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhs_laneq_s32(__p0_166, __p1_166, __p2_166) __extension__ ({ \
+  int32_t __s0_166 = __p0_166; \
+  int32x4_t __s1_166 = __p1_166; \
+  int32_t __ret_166; \
+  __ret_166 = vqrdmulhs_s32(__s0_166, vgetq_lane_s32(__s1_166, __p2_166)); \
+  __ret_166; \
+})
+#else
+#define vqrdmulhs_laneq_s32(__p0_167, __p1_167, __p2_167) __extension__ ({ \
+  int32_t __s0_167 = __p0_167; \
+  int32x4_t __s1_167 = __p1_167; \
+  int32x4_t __rev1_167;  __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 3, 2, 1, 0); \
+  int32_t __ret_167; \
+  __ret_167 = __noswap_vqrdmulhs_s32(__s0_167, __noswap_vgetq_lane_s32(__rev1_167, __p2_167)); \
+  __ret_167; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhh_laneq_s16(__p0_168, __p1_168, __p2_168) __extension__ ({ \
+  int16_t __s0_168 = __p0_168; \
+  int16x8_t __s1_168 = __p1_168; \
+  int16_t __ret_168; \
+  __ret_168 = vqrdmulhh_s16(__s0_168, vgetq_lane_s16(__s1_168, __p2_168)); \
+  __ret_168; \
+})
+#else
+#define vqrdmulhh_laneq_s16(__p0_169, __p1_169, __p2_169) __extension__ ({ \
+  int16_t __s0_169 = __p0_169; \
+  int16x8_t __s1_169 = __p1_169; \
+  int16x8_t __rev1_169;  __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_169; \
+  __ret_169 = __noswap_vqrdmulhh_s16(__s0_169, __noswap_vgetq_lane_s16(__rev1_169, __p2_169)); \
+  __ret_169; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqrdmulhq_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqrdmulhq_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = vqrdmulhq_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqrdmulhq_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = vqrdmulh_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqrdmulh_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = vqrdmulh_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqrdmulh_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqrshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqrshlb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vqrshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqrshlb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqrshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqrshls_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vqrshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqrshls_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vqrshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqrshld_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vqrshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqrshld_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqrshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqrshlh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vqrshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqrshlh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqrshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqrshlb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vqrshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqrshlb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqrshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrshls_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqrshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrshls_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqrshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqrshld_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqrshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqrshld_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrshlh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrshlh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_u32(__p0_170, __p1_170, __p2_170) __extension__ ({ \
+  uint16x4_t __s0_170 = __p0_170; \
+  uint32x4_t __s1_170 = __p1_170; \
+  uint16x8_t __ret_170; \
+  __ret_170 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_170), (uint16x4_t)(vqrshrn_n_u32(__s1_170, __p2_170)))); \
+  __ret_170; \
+})
+#else
+#define vqrshrn_high_n_u32(__p0_171, __p1_171, __p2_171) __extension__ ({ \
+  uint16x4_t __s0_171 = __p0_171; \
+  uint32x4_t __s1_171 = __p1_171; \
+  uint16x4_t __rev0_171;  __rev0_171 = __builtin_shufflevector(__s0_171, __s0_171, 3, 2, 1, 0); \
+  uint32x4_t __rev1_171;  __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \
+  uint16x8_t __ret_171; \
+  __ret_171 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_171), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_171, __p2_171)))); \
+  __ret_171 = __builtin_shufflevector(__ret_171, __ret_171, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_171; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_u64(__p0_172, __p1_172, __p2_172) __extension__ ({ \
+  uint32x2_t __s0_172 = __p0_172; \
+  uint64x2_t __s1_172 = __p1_172; \
+  uint32x4_t __ret_172; \
+  __ret_172 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_172), (uint32x2_t)(vqrshrn_n_u64(__s1_172, __p2_172)))); \
+  __ret_172; \
+})
+#else
+#define vqrshrn_high_n_u64(__p0_173, __p1_173, __p2_173) __extension__ ({ \
+  uint32x2_t __s0_173 = __p0_173; \
+  uint64x2_t __s1_173 = __p1_173; \
+  uint32x2_t __rev0_173;  __rev0_173 = __builtin_shufflevector(__s0_173, __s0_173, 1, 0); \
+  uint64x2_t __rev1_173;  __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 1, 0); \
+  uint32x4_t __ret_173; \
+  __ret_173 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_173), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_173, __p2_173)))); \
+  __ret_173 = __builtin_shufflevector(__ret_173, __ret_173, 3, 2, 1, 0); \
+  __ret_173; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_u16(__p0_174, __p1_174, __p2_174) __extension__ ({ \
+  uint8x8_t __s0_174 = __p0_174; \
+  uint16x8_t __s1_174 = __p1_174; \
+  uint8x16_t __ret_174; \
+  __ret_174 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_174), (uint8x8_t)(vqrshrn_n_u16(__s1_174, __p2_174)))); \
+  __ret_174; \
+})
+#else
+#define vqrshrn_high_n_u16(__p0_175, __p1_175, __p2_175) __extension__ ({ \
+  uint8x8_t __s0_175 = __p0_175; \
+  uint16x8_t __s1_175 = __p1_175; \
+  uint8x8_t __rev0_175;  __rev0_175 = __builtin_shufflevector(__s0_175, __s0_175, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_175;  __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_175; \
+  __ret_175 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_175), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_175, __p2_175)))); \
+  __ret_175 = __builtin_shufflevector(__ret_175, __ret_175, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_175; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_s32(__p0_176, __p1_176, __p2_176) __extension__ ({ \
+  int16x4_t __s0_176 = __p0_176; \
+  int32x4_t __s1_176 = __p1_176; \
+  int16x8_t __ret_176; \
+  __ret_176 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_176), (int16x4_t)(vqrshrn_n_s32(__s1_176, __p2_176)))); \
+  __ret_176; \
+})
+#else
+#define vqrshrn_high_n_s32(__p0_177, __p1_177, __p2_177) __extension__ ({ \
+  int16x4_t __s0_177 = __p0_177; \
+  int32x4_t __s1_177 = __p1_177; \
+  int16x4_t __rev0_177;  __rev0_177 = __builtin_shufflevector(__s0_177, __s0_177, 3, 2, 1, 0); \
+  int32x4_t __rev1_177;  __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 3, 2, 1, 0); \
+  int16x8_t __ret_177; \
+  __ret_177 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_177), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_177, __p2_177)))); \
+  __ret_177 = __builtin_shufflevector(__ret_177, __ret_177, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_177; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_s64(__p0_178, __p1_178, __p2_178) __extension__ ({ \
+  int32x2_t __s0_178 = __p0_178; \
+  int64x2_t __s1_178 = __p1_178; \
+  int32x4_t __ret_178; \
+  __ret_178 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_178), (int32x2_t)(vqrshrn_n_s64(__s1_178, __p2_178)))); \
+  __ret_178; \
+})
+#else
+#define vqrshrn_high_n_s64(__p0_179, __p1_179, __p2_179) __extension__ ({ \
+  int32x2_t __s0_179 = __p0_179; \
+  int64x2_t __s1_179 = __p1_179; \
+  int32x2_t __rev0_179;  __rev0_179 = __builtin_shufflevector(__s0_179, __s0_179, 1, 0); \
+  int64x2_t __rev1_179;  __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 1, 0); \
+  int32x4_t __ret_179; \
+  __ret_179 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_179), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_179, __p2_179)))); \
+  __ret_179 = __builtin_shufflevector(__ret_179, __ret_179, 3, 2, 1, 0); \
+  __ret_179; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_s16(__p0_180, __p1_180, __p2_180) __extension__ ({ \
+  int8x8_t __s0_180 = __p0_180; \
+  int16x8_t __s1_180 = __p1_180; \
+  int8x16_t __ret_180; \
+  __ret_180 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_180), (int8x8_t)(vqrshrn_n_s16(__s1_180, __p2_180)))); \
+  __ret_180; \
+})
+#else
+#define vqrshrn_high_n_s16(__p0_181, __p1_181, __p2_181) __extension__ ({ \
+  int8x8_t __s0_181 = __p0_181; \
+  int16x8_t __s1_181 = __p1_181; \
+  int8x8_t __rev0_181;  __rev0_181 = __builtin_shufflevector(__s0_181, __s0_181, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_181;  __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_181; \
+  __ret_181 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_181), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_181, __p2_181)))); \
+  __ret_181 = __builtin_shufflevector(__ret_181, __ret_181, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_181; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_high_n_s32(__p0_182, __p1_182, __p2_182) __extension__ ({ \
+  int16x4_t __s0_182 = __p0_182; \
+  int32x4_t __s1_182 = __p1_182; \
+  int16x8_t __ret_182; \
+  __ret_182 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_182), (int16x4_t)(vqrshrun_n_s32(__s1_182, __p2_182)))); \
+  __ret_182; \
+})
+#else
+#define vqrshrun_high_n_s32(__p0_183, __p1_183, __p2_183) __extension__ ({ \
+  int16x4_t __s0_183 = __p0_183; \
+  int32x4_t __s1_183 = __p1_183; \
+  int16x4_t __rev0_183;  __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 3, 2, 1, 0); \
+  int32x4_t __rev1_183;  __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 3, 2, 1, 0); \
+  int16x8_t __ret_183; \
+  __ret_183 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_183), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_183, __p2_183)))); \
+  __ret_183 = __builtin_shufflevector(__ret_183, __ret_183, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_183; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_high_n_s64(__p0_184, __p1_184, __p2_184) __extension__ ({ \
+  int32x2_t __s0_184 = __p0_184; \
+  int64x2_t __s1_184 = __p1_184; \
+  int32x4_t __ret_184; \
+  __ret_184 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_184), (int32x2_t)(vqrshrun_n_s64(__s1_184, __p2_184)))); \
+  __ret_184; \
+})
+#else
+#define vqrshrun_high_n_s64(__p0_185, __p1_185, __p2_185) __extension__ ({ \
+  int32x2_t __s0_185 = __p0_185; \
+  int64x2_t __s1_185 = __p1_185; \
+  int32x2_t __rev0_185;  __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 1, 0); \
+  int64x2_t __rev1_185;  __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 1, 0); \
+  int32x4_t __ret_185; \
+  __ret_185 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_185), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_185, __p2_185)))); \
+  __ret_185 = __builtin_shufflevector(__ret_185, __ret_185, 3, 2, 1, 0); \
+  __ret_185; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_high_n_s16(__p0_186, __p1_186, __p2_186) __extension__ ({ \
+  int8x8_t __s0_186 = __p0_186; \
+  int16x8_t __s1_186 = __p1_186; \
+  int8x16_t __ret_186; \
+  __ret_186 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_186), (int8x8_t)(vqrshrun_n_s16(__s1_186, __p2_186)))); \
+  __ret_186; \
+})
+#else
+#define vqrshrun_high_n_s16(__p0_187, __p1_187, __p2_187) __extension__ ({ \
+  int8x8_t __s0_187 = __p0_187; \
+  int16x8_t __s1_187 = __p1_187; \
+  int8x8_t __rev0_187;  __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_187;  __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_187; \
+  __ret_187 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_187), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_187, __p2_187)))); \
+  __ret_187 = __builtin_shufflevector(__ret_187, __ret_187, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_187; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshls_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshls_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshls_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshls_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_u32(__p0_188, __p1_188, __p2_188) __extension__ ({ \
+  uint16x4_t __s0_188 = __p0_188; \
+  uint32x4_t __s1_188 = __p1_188; \
+  uint16x8_t __ret_188; \
+  __ret_188 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_188), (uint16x4_t)(vqshrn_n_u32(__s1_188, __p2_188)))); \
+  __ret_188; \
+})
+#else
+#define vqshrn_high_n_u32(__p0_189, __p1_189, __p2_189) __extension__ ({ \
+  uint16x4_t __s0_189 = __p0_189; \
+  uint32x4_t __s1_189 = __p1_189; \
+  uint16x4_t __rev0_189;  __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \
+  uint32x4_t __rev1_189;  __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \
+  uint16x8_t __ret_189; \
+  __ret_189 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_189), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_189, __p2_189)))); \
+  __ret_189 = __builtin_shufflevector(__ret_189, __ret_189, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_189; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_u64(__p0_190, __p1_190, __p2_190) __extension__ ({ \
+  uint32x2_t __s0_190 = __p0_190; \
+  uint64x2_t __s1_190 = __p1_190; \
+  uint32x4_t __ret_190; \
+  __ret_190 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_190), (uint32x2_t)(vqshrn_n_u64(__s1_190, __p2_190)))); \
+  __ret_190; \
+})
+#else
+#define vqshrn_high_n_u64(__p0_191, __p1_191, __p2_191) __extension__ ({ \
+  uint32x2_t __s0_191 = __p0_191; \
+  uint64x2_t __s1_191 = __p1_191; \
+  uint32x2_t __rev0_191;  __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \
+  uint64x2_t __rev1_191;  __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \
+  uint32x4_t __ret_191; \
+  __ret_191 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_191), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_191, __p2_191)))); \
+  __ret_191 = __builtin_shufflevector(__ret_191, __ret_191, 3, 2, 1, 0); \
+  __ret_191; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_u16(__p0_192, __p1_192, __p2_192) __extension__ ({ \
+  uint8x8_t __s0_192 = __p0_192; \
+  uint16x8_t __s1_192 = __p1_192; \
+  uint8x16_t __ret_192; \
+  __ret_192 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_192), (uint8x8_t)(vqshrn_n_u16(__s1_192, __p2_192)))); \
+  __ret_192; \
+})
+#else
+#define vqshrn_high_n_u16(__p0_193, __p1_193, __p2_193) __extension__ ({ \
+  uint8x8_t __s0_193 = __p0_193; \
+  uint16x8_t __s1_193 = __p1_193; \
+  uint8x8_t __rev0_193;  __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_193;  __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_193; \
+  __ret_193 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_193), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_193, __p2_193)))); \
+  __ret_193 = __builtin_shufflevector(__ret_193, __ret_193, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_193; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_s32(__p0_194, __p1_194, __p2_194) __extension__ ({ \
+  int16x4_t __s0_194 = __p0_194; \
+  int32x4_t __s1_194 = __p1_194; \
+  int16x8_t __ret_194; \
+  __ret_194 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_194), (int16x4_t)(vqshrn_n_s32(__s1_194, __p2_194)))); \
+  __ret_194; \
+})
+#else
+#define vqshrn_high_n_s32(__p0_195, __p1_195, __p2_195) __extension__ ({ \
+  int16x4_t __s0_195 = __p0_195; \
+  int32x4_t __s1_195 = __p1_195; \
+  int16x4_t __rev0_195;  __rev0_195 = __builtin_shufflevector(__s0_195, __s0_195, 3, 2, 1, 0); \
+  int32x4_t __rev1_195;  __rev1_195 = __builtin_shufflevector(__s1_195, __s1_195, 3, 2, 1, 0); \
+  int16x8_t __ret_195; \
+  __ret_195 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_195), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_195, __p2_195)))); \
+  __ret_195 = __builtin_shufflevector(__ret_195, __ret_195, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_195; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_s64(__p0_196, __p1_196, __p2_196) __extension__ ({ \
+  int32x2_t __s0_196 = __p0_196; \
+  int64x2_t __s1_196 = __p1_196; \
+  int32x4_t __ret_196; \
+  __ret_196 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_196), (int32x2_t)(vqshrn_n_s64(__s1_196, __p2_196)))); \
+  __ret_196; \
+})
+#else
+#define vqshrn_high_n_s64(__p0_197, __p1_197, __p2_197) __extension__ ({ \
+  int32x2_t __s0_197 = __p0_197; \
+  int64x2_t __s1_197 = __p1_197; \
+  int32x2_t __rev0_197;  __rev0_197 = __builtin_shufflevector(__s0_197, __s0_197, 1, 0); \
+  int64x2_t __rev1_197;  __rev1_197 = __builtin_shufflevector(__s1_197, __s1_197, 1, 0); \
+  int32x4_t __ret_197; \
+  __ret_197 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_197), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_197, __p2_197)))); \
+  __ret_197 = __builtin_shufflevector(__ret_197, __ret_197, 3, 2, 1, 0); \
+  __ret_197; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_s16(__p0_198, __p1_198, __p2_198) __extension__ ({ \
+  int8x8_t __s0_198 = __p0_198; \
+  int16x8_t __s1_198 = __p1_198; \
+  int8x16_t __ret_198; \
+  __ret_198 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_198), (int8x8_t)(vqshrn_n_s16(__s1_198, __p2_198)))); \
+  __ret_198; \
+})
+#else
+#define vqshrn_high_n_s16(__p0_199, __p1_199, __p2_199) __extension__ ({ \
+  int8x8_t __s0_199 = __p0_199; \
+  int16x8_t __s1_199 = __p1_199; \
+  int8x8_t __rev0_199;  __rev0_199 = __builtin_shufflevector(__s0_199, __s0_199, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_199;  __rev1_199 = __builtin_shufflevector(__s1_199, __s1_199, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_199; \
+  __ret_199 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_199), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_199, __p2_199)))); \
+  __ret_199 = __builtin_shufflevector(__ret_199, __ret_199, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_199; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_high_n_s32(__p0_200, __p1_200, __p2_200) __extension__ ({ \
+  int16x4_t __s0_200 = __p0_200; \
+  int32x4_t __s1_200 = __p1_200; \
+  int16x8_t __ret_200; \
+  __ret_200 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_200), (int16x4_t)(vqshrun_n_s32(__s1_200, __p2_200)))); \
+  __ret_200; \
+})
+#else
+#define vqshrun_high_n_s32(__p0_201, __p1_201, __p2_201) __extension__ ({ \
+  int16x4_t __s0_201 = __p0_201; \
+  int32x4_t __s1_201 = __p1_201; \
+  int16x4_t __rev0_201;  __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \
+  int32x4_t __rev1_201;  __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \
+  int16x8_t __ret_201; \
+  __ret_201 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_201), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_201, __p2_201)))); \
+  __ret_201 = __builtin_shufflevector(__ret_201, __ret_201, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_201; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_high_n_s64(__p0_202, __p1_202, __p2_202) __extension__ ({ \
+  int32x2_t __s0_202 = __p0_202; \
+  int64x2_t __s1_202 = __p1_202; \
+  int32x4_t __ret_202; \
+  __ret_202 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_202), (int32x2_t)(vqshrun_n_s64(__s1_202, __p2_202)))); \
+  __ret_202; \
+})
+#else
+#define vqshrun_high_n_s64(__p0_203, __p1_203, __p2_203) __extension__ ({ \
+  int32x2_t __s0_203 = __p0_203; \
+  int64x2_t __s1_203 = __p1_203; \
+  int32x2_t __rev0_203;  __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \
+  int64x2_t __rev1_203;  __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \
+  int32x4_t __ret_203; \
+  __ret_203 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_203), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_203, __p2_203)))); \
+  __ret_203 = __builtin_shufflevector(__ret_203, __ret_203, 3, 2, 1, 0); \
+  __ret_203; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_high_n_s16(__p0_204, __p1_204, __p2_204) __extension__ ({ \
+  int8x8_t __s0_204 = __p0_204; \
+  int16x8_t __s1_204 = __p1_204; \
+  int8x16_t __ret_204; \
+  __ret_204 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_204), (int8x8_t)(vqshrun_n_s16(__s1_204, __p2_204)))); \
+  __ret_204; \
+})
+#else
+#define vqshrun_high_n_s16(__p0_205, __p1_205, __p2_205) __extension__ ({ \
+  int8x8_t __s0_205 = __p0_205; \
+  int16x8_t __s1_205 = __p1_205; \
+  int8x8_t __rev0_205;  __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_205;  __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_205; \
+  __ret_205 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_205), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_205, __p2_205)))); \
+  __ret_205 = __builtin_shufflevector(__ret_205, __ret_205, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_205; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshruns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshruns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshruns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshruns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrund_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrund_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrund_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrund_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrunh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrunh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrunh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrunh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqsubb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqsubb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vqsubb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqsubb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqsubs_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqsubs_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vqsubs_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqsubs_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vqsubd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqsubd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vqsubd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqsubd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqsubh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqsubh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vqsubh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqsubh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqsubb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqsubb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vqsubb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqsubb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqsubs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqsubs_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqsubs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqsubs_s32(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqsubs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqsubs_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqsubd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqsubd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqsubd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqsubd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqsubh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqsubh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqsubh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqsubh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int16_t __noswap_vqsubh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqsubh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbl1_p8(poly8x16_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbl1_p8(poly8x16_t __p0, uint8x8_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbl1q_p8(poly8x16_t __p0, uint8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbl1q_p8(poly8x16_t __p0, uint8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbl1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbl1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbl1_u8(uint8x16_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbl1_u8(uint8x16_t __p0, uint8x8_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbl2_p8(poly8x16x2_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbl2_p8(poly8x16x2_t __p0, uint8x8_t __p1) {
+  poly8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbl2q_p8(poly8x16x2_t __p0, uint8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbl2q_p8(poly8x16x2_t __p0, uint8x16_t __p1) {
+  poly8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbl2q_u8(uint8x16x2_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbl2q_u8(uint8x16x2_t __p0, uint8x16_t __p1) {
+  uint8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) {
+  int8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbl2_u8(uint8x16x2_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbl2_u8(uint8x16x2_t __p0, uint8x8_t __p1) {
+  uint8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) {
+  int8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbl3_p8(poly8x16x3_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbl3_p8(poly8x16x3_t __p0, uint8x8_t __p1) {
+  poly8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbl3q_p8(poly8x16x3_t __p0, uint8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbl3q_p8(poly8x16x3_t __p0, uint8x16_t __p1) {
+  poly8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbl3q_u8(uint8x16x3_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbl3q_u8(uint8x16x3_t __p0, uint8x16_t __p1) {
+  uint8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) {
+  int8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbl3_u8(uint8x16x3_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbl3_u8(uint8x16x3_t __p0, uint8x8_t __p1) {
+  uint8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) {
+  int8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbl4_p8(poly8x16x4_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbl4_p8(poly8x16x4_t __p0, uint8x8_t __p1) {
+  poly8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbl4q_p8(poly8x16x4_t __p0, uint8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbl4q_p8(poly8x16x4_t __p0, uint8x16_t __p1) {
+  poly8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbl4q_u8(uint8x16x4_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbl4q_u8(uint8x16x4_t __p0, uint8x16_t __p1) {
+  uint8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) {
+  int8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbl4_u8(uint8x16x4_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbl4_u8(uint8x16x4_t __p0, uint8x8_t __p1) {
+  uint8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) {
+  int8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbx1_p8(poly8x8_t __p0, poly8x16_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbx1_p8(poly8x8_t __p0, poly8x16_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbx1q_p8(poly8x16_t __p0, poly8x16_t __p1, uint8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbx1q_p8(poly8x16_t __p0, poly8x16_t __p1, uint8x16_t __p2) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbx1q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbx1q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbx1_u8(uint8x8_t __p0, uint8x16_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbx1_u8(uint8x8_t __p0, uint8x16_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbx2_p8(poly8x8_t __p0, poly8x16x2_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbx2_p8(poly8x8_t __p0, poly8x16x2_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbx2q_p8(poly8x16_t __p0, poly8x16x2_t __p1, uint8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbx2q_p8(poly8x16_t __p0, poly8x16x2_t __p1, uint8x16_t __p2) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbx2q_u8(uint8x16_t __p0, uint8x16x2_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbx2q_u8(uint8x16_t __p0, uint8x16x2_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbx2_u8(uint8x8_t __p0, uint8x16x2_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbx2_u8(uint8x8_t __p0, uint8x16x2_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbx3_p8(poly8x8_t __p0, poly8x16x3_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbx3_p8(poly8x8_t __p0, poly8x16x3_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbx3q_p8(poly8x16_t __p0, poly8x16x3_t __p1, uint8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbx3q_p8(poly8x16_t __p0, poly8x16x3_t __p1, uint8x16_t __p2) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbx3q_u8(uint8x16_t __p0, uint8x16x3_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbx3q_u8(uint8x16_t __p0, uint8x16x3_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbx3_u8(uint8x8_t __p0, uint8x16x3_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbx3_u8(uint8x8_t __p0, uint8x16x3_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbx4_p8(poly8x8_t __p0, poly8x16x4_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbx4_p8(poly8x8_t __p0, poly8x16x4_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbx4q_p8(poly8x16_t __p0, poly8x16x4_t __p1, uint8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbx4q_p8(poly8x16_t __p0, poly8x16x4_t __p1, uint8x16_t __p2) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbx4q_u8(uint8x16_t __p0, uint8x16x4_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbx4q_u8(uint8x16_t __p0, uint8x16x4_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbx4_u8(uint8x8_t __p0, uint8x16x4_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbx4_u8(uint8x8_t __p0, uint8x16x4_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vraddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vraddhn_u32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vraddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vraddhn_u32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vraddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vraddhn_u64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vraddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vraddhn_u64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vraddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vraddhn_u16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint8x16_t vraddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vraddhn_u16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vraddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vraddhn_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vraddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vraddhn_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vraddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vraddhn_s64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vraddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vraddhn_s64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vraddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vraddhn_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int8x16_t vraddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vraddhn_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vrbit_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vrbit_v((int8x8_t)__p0, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vrbit_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vrbit_v((int8x8_t)__rev0, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vrbitq_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__p0, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vrbitq_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__rev0, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrbitq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrbitq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrbitq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vrbitq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrbit_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrbit_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrbit_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrbit_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrbit_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrbit_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrbit_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrbit_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrecpeq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrecpeq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrecpe_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrecpe_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrecped_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecped_f64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vrecped_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecped_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrecpes_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpes_f32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vrecpes_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpes_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrecpsq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrecpsq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrecps_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrecps_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrecpsd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecpsd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64_t vrecpsd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecpsd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrecpss_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpss_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float32_t vrecpss_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpss_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrecpxd_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecpxd_f64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vrecpxd_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecpxd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrecpxs_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpxs_f32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vrecpxs_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpxs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vrshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vrshld_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vrshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vrshld_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vrshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vrshld_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vrshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vrshld_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vrshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vrshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vrshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vrshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vrshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vrshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_u32(__p0_206, __p1_206, __p2_206) __extension__ ({ \
+  uint16x4_t __s0_206 = __p0_206; \
+  uint32x4_t __s1_206 = __p1_206; \
+  uint16x8_t __ret_206; \
+  __ret_206 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_206), (uint16x4_t)(vrshrn_n_u32(__s1_206, __p2_206)))); \
+  __ret_206; \
+})
+#else
+#define vrshrn_high_n_u32(__p0_207, __p1_207, __p2_207) __extension__ ({ \
+  uint16x4_t __s0_207 = __p0_207; \
+  uint32x4_t __s1_207 = __p1_207; \
+  uint16x4_t __rev0_207;  __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 3, 2, 1, 0); \
+  uint32x4_t __rev1_207;  __rev1_207 = __builtin_shufflevector(__s1_207, __s1_207, 3, 2, 1, 0); \
+  uint16x8_t __ret_207; \
+  __ret_207 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_207), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_207, __p2_207)))); \
+  __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_207; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_u64(__p0_208, __p1_208, __p2_208) __extension__ ({ \
+  uint32x2_t __s0_208 = __p0_208; \
+  uint64x2_t __s1_208 = __p1_208; \
+  uint32x4_t __ret_208; \
+  __ret_208 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_208), (uint32x2_t)(vrshrn_n_u64(__s1_208, __p2_208)))); \
+  __ret_208; \
+})
+#else
+#define vrshrn_high_n_u64(__p0_209, __p1_209, __p2_209) __extension__ ({ \
+  uint32x2_t __s0_209 = __p0_209; \
+  uint64x2_t __s1_209 = __p1_209; \
+  uint32x2_t __rev0_209;  __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 1, 0); \
+  uint64x2_t __rev1_209;  __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 1, 0); \
+  uint32x4_t __ret_209; \
+  __ret_209 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_209), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_209, __p2_209)))); \
+  __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \
+  __ret_209; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_u16(__p0_210, __p1_210, __p2_210) __extension__ ({ \
+  uint8x8_t __s0_210 = __p0_210; \
+  uint16x8_t __s1_210 = __p1_210; \
+  uint8x16_t __ret_210; \
+  __ret_210 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_210), (uint8x8_t)(vrshrn_n_u16(__s1_210, __p2_210)))); \
+  __ret_210; \
+})
+#else
+#define vrshrn_high_n_u16(__p0_211, __p1_211, __p2_211) __extension__ ({ \
+  uint8x8_t __s0_211 = __p0_211; \
+  uint16x8_t __s1_211 = __p1_211; \
+  uint8x8_t __rev0_211;  __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_211;  __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_211; \
+  __ret_211 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_211), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_211, __p2_211)))); \
+  __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_211; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_s32(__p0_212, __p1_212, __p2_212) __extension__ ({ \
+  int16x4_t __s0_212 = __p0_212; \
+  int32x4_t __s1_212 = __p1_212; \
+  int16x8_t __ret_212; \
+  __ret_212 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_212), (int16x4_t)(vrshrn_n_s32(__s1_212, __p2_212)))); \
+  __ret_212; \
+})
+#else
+#define vrshrn_high_n_s32(__p0_213, __p1_213, __p2_213) __extension__ ({ \
+  int16x4_t __s0_213 = __p0_213; \
+  int32x4_t __s1_213 = __p1_213; \
+  int16x4_t __rev0_213;  __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \
+  int32x4_t __rev1_213;  __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \
+  int16x8_t __ret_213; \
+  __ret_213 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_213), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_213, __p2_213)))); \
+  __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_213; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_s64(__p0_214, __p1_214, __p2_214) __extension__ ({ \
+  int32x2_t __s0_214 = __p0_214; \
+  int64x2_t __s1_214 = __p1_214; \
+  int32x4_t __ret_214; \
+  __ret_214 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_214), (int32x2_t)(vrshrn_n_s64(__s1_214, __p2_214)))); \
+  __ret_214; \
+})
+#else
+#define vrshrn_high_n_s64(__p0_215, __p1_215, __p2_215) __extension__ ({ \
+  int32x2_t __s0_215 = __p0_215; \
+  int64x2_t __s1_215 = __p1_215; \
+  int32x2_t __rev0_215;  __rev0_215 = __builtin_shufflevector(__s0_215, __s0_215, 1, 0); \
+  int64x2_t __rev1_215;  __rev1_215 = __builtin_shufflevector(__s1_215, __s1_215, 1, 0); \
+  int32x4_t __ret_215; \
+  __ret_215 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_215), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_215, __p2_215)))); \
+  __ret_215 = __builtin_shufflevector(__ret_215, __ret_215, 3, 2, 1, 0); \
+  __ret_215; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_s16(__p0_216, __p1_216, __p2_216) __extension__ ({ \
+  int8x8_t __s0_216 = __p0_216; \
+  int16x8_t __s1_216 = __p1_216; \
+  int8x16_t __ret_216; \
+  __ret_216 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_216), (int8x8_t)(vrshrn_n_s16(__s1_216, __p2_216)))); \
+  __ret_216; \
+})
+#else
+#define vrshrn_high_n_s16(__p0_217, __p1_217, __p2_217) __extension__ ({ \
+  int8x8_t __s0_217 = __p0_217; \
+  int16x8_t __s1_217 = __p1_217; \
+  int8x8_t __rev0_217;  __rev0_217 = __builtin_shufflevector(__s0_217, __s0_217, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_217;  __rev1_217 = __builtin_shufflevector(__s1_217, __s1_217, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_217; \
+  __ret_217 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_217), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_217, __p2_217)))); \
+  __ret_217 = __builtin_shufflevector(__ret_217, __ret_217, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_217; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrsqrteq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrsqrteq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrsqrte_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrsqrte_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrsqrted_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrsqrted_f64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vrsqrted_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrsqrted_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrsqrtes_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrsqrtes_f32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vrsqrtes_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrsqrtes_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrsqrtsq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrsqrtsq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrsqrts_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrsqrts_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrsqrtsd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrsqrtsd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64_t vrsqrtsd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrsqrtsd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrsqrtss_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrsqrtss_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float32_t vrsqrtss_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrsqrtss_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vrsrad_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vrsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vrsrad_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vrsrad_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vrsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vrsrad_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vrsubhn_u32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vrsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vrsubhn_u32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vrsubhn_u64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vrsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vrsubhn_u64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vrsubhn_u16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint8x16_t vrsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vrsubhn_u16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vrsubhn_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vrsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vrsubhn_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vrsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vrsubhn_s64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vrsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vrsubhn_s64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vrsubhn_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int8x16_t vrsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vrsubhn_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#define __noswap_vset_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vsetq_lane_f64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vsetq_lane_f64(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vsetq_lane_f64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#define __noswap_vset_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vshld_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vshld_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vshld_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vshld_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_u8(__p0_218, __p1_218) __extension__ ({ \
+  uint8x16_t __s0_218 = __p0_218; \
+  uint16x8_t __ret_218; \
+  __ret_218 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_218), __p1_218)); \
+  __ret_218; \
+})
+#else
+#define vshll_high_n_u8(__p0_219, __p1_219) __extension__ ({ \
+  uint8x16_t __s0_219 = __p0_219; \
+  uint8x16_t __rev0_219;  __rev0_219 = __builtin_shufflevector(__s0_219, __s0_219, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret_219; \
+  __ret_219 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_219), __p1_219)); \
+  __ret_219 = __builtin_shufflevector(__ret_219, __ret_219, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_219; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_u32(__p0_220, __p1_220) __extension__ ({ \
+  uint32x4_t __s0_220 = __p0_220; \
+  uint64x2_t __ret_220; \
+  __ret_220 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_220), __p1_220)); \
+  __ret_220; \
+})
+#else
+#define vshll_high_n_u32(__p0_221, __p1_221) __extension__ ({ \
+  uint32x4_t __s0_221 = __p0_221; \
+  uint32x4_t __rev0_221;  __rev0_221 = __builtin_shufflevector(__s0_221, __s0_221, 3, 2, 1, 0); \
+  uint64x2_t __ret_221; \
+  __ret_221 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_221), __p1_221)); \
+  __ret_221 = __builtin_shufflevector(__ret_221, __ret_221, 1, 0); \
+  __ret_221; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_u16(__p0_222, __p1_222) __extension__ ({ \
+  uint16x8_t __s0_222 = __p0_222; \
+  uint32x4_t __ret_222; \
+  __ret_222 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_222), __p1_222)); \
+  __ret_222; \
+})
+#else
+#define vshll_high_n_u16(__p0_223, __p1_223) __extension__ ({ \
+  uint16x8_t __s0_223 = __p0_223; \
+  uint16x8_t __rev0_223;  __rev0_223 = __builtin_shufflevector(__s0_223, __s0_223, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret_223; \
+  __ret_223 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_223), __p1_223)); \
+  __ret_223 = __builtin_shufflevector(__ret_223, __ret_223, 3, 2, 1, 0); \
+  __ret_223; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_s8(__p0_224, __p1_224) __extension__ ({ \
+  int8x16_t __s0_224 = __p0_224; \
+  int16x8_t __ret_224; \
+  __ret_224 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_224), __p1_224)); \
+  __ret_224; \
+})
+#else
+#define vshll_high_n_s8(__p0_225, __p1_225) __extension__ ({ \
+  int8x16_t __s0_225 = __p0_225; \
+  int8x16_t __rev0_225;  __rev0_225 = __builtin_shufflevector(__s0_225, __s0_225, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret_225; \
+  __ret_225 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_225), __p1_225)); \
+  __ret_225 = __builtin_shufflevector(__ret_225, __ret_225, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_225; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_s32(__p0_226, __p1_226) __extension__ ({ \
+  int32x4_t __s0_226 = __p0_226; \
+  int64x2_t __ret_226; \
+  __ret_226 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_226), __p1_226)); \
+  __ret_226; \
+})
+#else
+#define vshll_high_n_s32(__p0_227, __p1_227) __extension__ ({ \
+  int32x4_t __s0_227 = __p0_227; \
+  int32x4_t __rev0_227;  __rev0_227 = __builtin_shufflevector(__s0_227, __s0_227, 3, 2, 1, 0); \
+  int64x2_t __ret_227; \
+  __ret_227 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_227), __p1_227)); \
+  __ret_227 = __builtin_shufflevector(__ret_227, __ret_227, 1, 0); \
+  __ret_227; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_s16(__p0_228, __p1_228) __extension__ ({ \
+  int16x8_t __s0_228 = __p0_228; \
+  int32x4_t __ret_228; \
+  __ret_228 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_228), __p1_228)); \
+  __ret_228; \
+})
+#else
+#define vshll_high_n_s16(__p0_229, __p1_229) __extension__ ({ \
+  int16x8_t __s0_229 = __p0_229; \
+  int16x8_t __rev0_229;  __rev0_229 = __builtin_shufflevector(__s0_229, __s0_229, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret_229; \
+  __ret_229 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_229), __p1_229)); \
+  __ret_229 = __builtin_shufflevector(__ret_229, __ret_229, 3, 2, 1, 0); \
+  __ret_229; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_u32(__p0_230, __p1_230, __p2_230) __extension__ ({ \
+  uint16x4_t __s0_230 = __p0_230; \
+  uint32x4_t __s1_230 = __p1_230; \
+  uint16x8_t __ret_230; \
+  __ret_230 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_230), (uint16x4_t)(vshrn_n_u32(__s1_230, __p2_230)))); \
+  __ret_230; \
+})
+#else
+#define vshrn_high_n_u32(__p0_231, __p1_231, __p2_231) __extension__ ({ \
+  uint16x4_t __s0_231 = __p0_231; \
+  uint32x4_t __s1_231 = __p1_231; \
+  uint16x4_t __rev0_231;  __rev0_231 = __builtin_shufflevector(__s0_231, __s0_231, 3, 2, 1, 0); \
+  uint32x4_t __rev1_231;  __rev1_231 = __builtin_shufflevector(__s1_231, __s1_231, 3, 2, 1, 0); \
+  uint16x8_t __ret_231; \
+  __ret_231 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_231), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_231, __p2_231)))); \
+  __ret_231 = __builtin_shufflevector(__ret_231, __ret_231, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_231; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_u64(__p0_232, __p1_232, __p2_232) __extension__ ({ \
+  uint32x2_t __s0_232 = __p0_232; \
+  uint64x2_t __s1_232 = __p1_232; \
+  uint32x4_t __ret_232; \
+  __ret_232 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_232), (uint32x2_t)(vshrn_n_u64(__s1_232, __p2_232)))); \
+  __ret_232; \
+})
+#else
+#define vshrn_high_n_u64(__p0_233, __p1_233, __p2_233) __extension__ ({ \
+  uint32x2_t __s0_233 = __p0_233; \
+  uint64x2_t __s1_233 = __p1_233; \
+  uint32x2_t __rev0_233;  __rev0_233 = __builtin_shufflevector(__s0_233, __s0_233, 1, 0); \
+  uint64x2_t __rev1_233;  __rev1_233 = __builtin_shufflevector(__s1_233, __s1_233, 1, 0); \
+  uint32x4_t __ret_233; \
+  __ret_233 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_233), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_233, __p2_233)))); \
+  __ret_233 = __builtin_shufflevector(__ret_233, __ret_233, 3, 2, 1, 0); \
+  __ret_233; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_u16(__p0_234, __p1_234, __p2_234) __extension__ ({ \
+  uint8x8_t __s0_234 = __p0_234; \
+  uint16x8_t __s1_234 = __p1_234; \
+  uint8x16_t __ret_234; \
+  __ret_234 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_234), (uint8x8_t)(vshrn_n_u16(__s1_234, __p2_234)))); \
+  __ret_234; \
+})
+#else
+#define vshrn_high_n_u16(__p0_235, __p1_235, __p2_235) __extension__ ({ \
+  uint8x8_t __s0_235 = __p0_235; \
+  uint16x8_t __s1_235 = __p1_235; \
+  uint8x8_t __rev0_235;  __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_235;  __rev1_235 = __builtin_shufflevector(__s1_235, __s1_235, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_235; \
+  __ret_235 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_235), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_235, __p2_235)))); \
+  __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_235; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_s32(__p0_236, __p1_236, __p2_236) __extension__ ({ \
+  int16x4_t __s0_236 = __p0_236; \
+  int32x4_t __s1_236 = __p1_236; \
+  int16x8_t __ret_236; \
+  __ret_236 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_236), (int16x4_t)(vshrn_n_s32(__s1_236, __p2_236)))); \
+  __ret_236; \
+})
+#else
+#define vshrn_high_n_s32(__p0_237, __p1_237, __p2_237) __extension__ ({ \
+  int16x4_t __s0_237 = __p0_237; \
+  int32x4_t __s1_237 = __p1_237; \
+  int16x4_t __rev0_237;  __rev0_237 = __builtin_shufflevector(__s0_237, __s0_237, 3, 2, 1, 0); \
+  int32x4_t __rev1_237;  __rev1_237 = __builtin_shufflevector(__s1_237, __s1_237, 3, 2, 1, 0); \
+  int16x8_t __ret_237; \
+  __ret_237 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_237), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_237, __p2_237)))); \
+  __ret_237 = __builtin_shufflevector(__ret_237, __ret_237, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_237; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_s64(__p0_238, __p1_238, __p2_238) __extension__ ({ \
+  int32x2_t __s0_238 = __p0_238; \
+  int64x2_t __s1_238 = __p1_238; \
+  int32x4_t __ret_238; \
+  __ret_238 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_238), (int32x2_t)(vshrn_n_s64(__s1_238, __p2_238)))); \
+  __ret_238; \
+})
+#else
+#define vshrn_high_n_s64(__p0_239, __p1_239, __p2_239) __extension__ ({ \
+  int32x2_t __s0_239 = __p0_239; \
+  int64x2_t __s1_239 = __p1_239; \
+  int32x2_t __rev0_239;  __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 1, 0); \
+  int64x2_t __rev1_239;  __rev1_239 = __builtin_shufflevector(__s1_239, __s1_239, 1, 0); \
+  int32x4_t __ret_239; \
+  __ret_239 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_239), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_239, __p2_239)))); \
+  __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 3, 2, 1, 0); \
+  __ret_239; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_s16(__p0_240, __p1_240, __p2_240) __extension__ ({ \
+  int8x8_t __s0_240 = __p0_240; \
+  int16x8_t __s1_240 = __p1_240; \
+  int8x16_t __ret_240; \
+  __ret_240 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_240), (int8x8_t)(vshrn_n_s16(__s1_240, __p2_240)))); \
+  __ret_240; \
+})
+#else
+#define vshrn_high_n_s16(__p0_241, __p1_241, __p2_241) __extension__ ({ \
+  int8x8_t __s0_241 = __p0_241; \
+  int16x8_t __s1_241 = __p1_241; \
+  int8x8_t __rev0_241;  __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_241;  __rev1_241 = __builtin_shufflevector(__s1_241, __s1_241, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_241; \
+  __ret_241 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_241), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_241, __p2_241)))); \
+  __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_241; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vslid_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vslid_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vslid_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vslid_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vslid_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vslid_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vslid_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vslid_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#else
+#define vsli_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 38); \
+  __ret; \
+})
+#else
+#define vsliq_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vsqrtq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vsqrtq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vsqrtq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vsqrtq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vsqrtq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vsqrtq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vsqrt_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vsqrt_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vsqrt_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vsqrt_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vsqrt_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vsrad_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vsrad_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vsrad_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vsrad_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsrid_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vsrid_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vsrid_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vsrid_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsrid_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vsrid_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vsrid_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vsrid_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#else
+#define vsri_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 38); \
+  __ret; \
+})
+#else
+#define vsriq_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 6); \
+})
+#else
+#define vst1_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 38); \
+})
+#else
+#define vst1q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 42); \
+})
+#else
+#define vst1q_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 10); \
+})
+#else
+#define vst1_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
+})
+#else
+#define vst1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 38); \
+})
+#else
+#define vst1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 42); \
+})
+#else
+#define vst1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
+})
+#else
+#define vst1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
+})
+#else
+#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p64_x2(__p0, __p1) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
+})
+#else
+#define vst1_p64_x2(__p0, __p1) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \
+})
+#else
+#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \
+})
+#else
+#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p64_x2(__p0, __p1) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 38); \
+})
+#else
+#define vst1q_p64_x2(__p0, __p1) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \
+})
+#else
+#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \
+})
+#else
+#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \
+})
+#else
+#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \
+})
+#else
+#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \
+})
+#else
+#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \
+})
+#else
+#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f64_x2(__p0, __p1) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 42); \
+})
+#else
+#define vst1q_f64_x2(__p0, __p1) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 41); \
+})
+#else
+#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 40); \
+})
+#else
+#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 34); \
+})
+#else
+#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 35); \
+})
+#else
+#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 33); \
+})
+#else
+#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \
+})
+#else
+#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \
+})
+#else
+#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#else
+#define vst1_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \
+})
+#else
+#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \
+})
+#else
+#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f64_x2(__p0, __p1) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 10); \
+})
+#else
+#define vst1_f64_x2(__p0, __p1) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 9); \
+})
+#else
+#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 8); \
+})
+#else
+#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 2); \
+})
+#else
+#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#else
+#define vst1_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 1); \
+})
+#else
+#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \
+})
+#else
+#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p64_x3(__p0, __p1) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
+})
+#else
+#define vst1_p64_x3(__p0, __p1) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \
+})
+#else
+#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \
+})
+#else
+#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p64_x3(__p0, __p1) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 38); \
+})
+#else
+#define vst1q_p64_x3(__p0, __p1) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \
+})
+#else
+#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \
+})
+#else
+#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \
+})
+#else
+#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \
+})
+#else
+#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \
+})
+#else
+#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \
+})
+#else
+#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f64_x3(__p0, __p1) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 42); \
+})
+#else
+#define vst1q_f64_x3(__p0, __p1) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 41); \
+})
+#else
+#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 40); \
+})
+#else
+#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 34); \
+})
+#else
+#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 35); \
+})
+#else
+#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 33); \
+})
+#else
+#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \
+})
+#else
+#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \
+})
+#else
+#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#else
+#define vst1_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \
+})
+#else
+#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \
+})
+#else
+#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f64_x3(__p0, __p1) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \
+})
+#else
+#define vst1_f64_x3(__p0, __p1) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 9); \
+})
+#else
+#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 8); \
+})
+#else
+#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 2); \
+})
+#else
+#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#else
+#define vst1_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 1); \
+})
+#else
+#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \
+})
+#else
+#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p64_x4(__p0, __p1) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
+})
+#else
+#define vst1_p64_x4(__p0, __p1) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \
+})
+#else
+#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \
+})
+#else
+#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p64_x4(__p0, __p1) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 38); \
+})
+#else
+#define vst1q_p64_x4(__p0, __p1) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \
+})
+#else
+#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \
+})
+#else
+#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \
+})
+#else
+#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \
+})
+#else
+#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \
+})
+#else
+#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \
+})
+#else
+#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f64_x4(__p0, __p1) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 42); \
+})
+#else
+#define vst1q_f64_x4(__p0, __p1) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 41); \
+})
+#else
+#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 40); \
+})
+#else
+#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 34); \
+})
+#else
+#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 35); \
+})
+#else
+#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 33); \
+})
+#else
+#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \
+})
+#else
+#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \
+})
+#else
+#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#else
+#define vst1_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \
+})
+#else
+#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \
+})
+#else
+#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f64_x4(__p0, __p1) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \
+})
+#else
+#define vst1_f64_x4(__p0, __p1) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 9); \
+})
+#else
+#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 8); \
+})
+#else
+#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 2); \
+})
+#else
+#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#else
+#define vst1_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 1); \
+})
+#else
+#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
+})
+#else
+#define vst2_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 38); \
+})
+#else
+#define vst2q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \
+})
+#else
+#define vst2q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 42); \
+})
+#else
+#define vst2q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 35); \
+})
+#else
+#define vst2q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_f64(__p0, __p1) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 10); \
+})
+#else
+#define vst2_f64(__p0, __p1) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
+})
+#else
+#define vst2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 36); \
+})
+#else
+#define vst2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 38); \
+})
+#else
+#define vst2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 48); \
+})
+#else
+#define vst2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 51); \
+})
+#else
+#define vst2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 32); \
+})
+#else
+#define vst2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 42); \
+})
+#else
+#define vst2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 35); \
+})
+#else
+#define vst2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
+})
+#else
+#define vst2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 10); \
+})
+#else
+#define vst2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 3); \
+})
+#else
+#define vst2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
+})
+#else
+#define vst3_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 38); \
+})
+#else
+#define vst3q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \
+})
+#else
+#define vst3q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 42); \
+})
+#else
+#define vst3q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 35); \
+})
+#else
+#define vst3q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_f64(__p0, __p1) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \
+})
+#else
+#define vst3_f64(__p0, __p1) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
+})
+#else
+#define vst3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 36); \
+})
+#else
+#define vst3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 38); \
+})
+#else
+#define vst3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 48); \
+})
+#else
+#define vst3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 51); \
+})
+#else
+#define vst3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 32); \
+})
+#else
+#define vst3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 42); \
+})
+#else
+#define vst3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 35); \
+})
+#else
+#define vst3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
+})
+#else
+#define vst3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 10); \
+})
+#else
+#define vst3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 3); \
+})
+#else
+#define vst3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
+})
+#else
+#define vst4_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 38); \
+})
+#else
+#define vst4q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \
+})
+#else
+#define vst4q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 42); \
+})
+#else
+#define vst4q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 35); \
+})
+#else
+#define vst4q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_f64(__p0, __p1) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \
+})
+#else
+#define vst4_f64(__p0, __p1) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
+})
+#else
+#define vst4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 36); \
+})
+#else
+#define vst4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 38); \
+})
+#else
+#define vst4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 48); \
+})
+#else
+#define vst4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 51); \
+})
+#else
+#define vst4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 32); \
+})
+#else
+#define vst4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 42); \
+})
+#else
+#define vst4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 35); \
+})
+#else
+#define vst4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
+})
+#else
+#define vst4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 10); \
+})
+#else
+#define vst4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 3); \
+})
+#else
+#define vst4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vstrq_p128(__p0, __p1) __extension__ ({ \
+  poly128_t __s1 = __p1; \
+  __builtin_neon_vstrq_p128(__p0, __s1); \
+})
+#else
+#define vstrq_p128(__p0, __p1) __extension__ ({ \
+  poly128_t __s1 = __p1; \
+  __builtin_neon_vstrq_p128(__p0, __s1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vsubd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vsubd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vsubd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vsubd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vsubd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vsubd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vsubd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vsubd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vsubq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float64x2_t vsubq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vsub_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float64x1_t vsub_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vsubhn_u32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vsubhn_u32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vsubhn_u64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vsubhn_u64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vsubhn_u16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint8x16_t vsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vsubhn_u16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vsubhn_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vsubhn_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vsubhn_s64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vsubhn_s64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vsubhn_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int8x16_t vsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vsubhn_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmovl_high_u8(__p0) - vmovl_high_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmovl_high_u8(__rev0) - __noswap_vmovl_high_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmovl_high_u32(__p0) - vmovl_high_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmovl_high_u32(__rev0) - __noswap_vmovl_high_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmovl_high_u16(__p0) - vmovl_high_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmovl_high_u16(__rev0) - __noswap_vmovl_high_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = vmovl_high_s8(__p0) - vmovl_high_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vsubl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmovl_high_s8(__rev0) - __noswap_vmovl_high_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vmovl_high_s32(__p0) - vmovl_high_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vsubl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmovl_high_s32(__rev0) - __noswap_vmovl_high_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vmovl_high_s16(__p0) - vmovl_high_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vsubl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmovl_high_s16(__rev0) - __noswap_vmovl_high_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 - vmovl_high_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 - vmovl_high_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 - vmovl_high_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubw_high_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 - vmovl_high_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vsubw_high_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubw_high_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 - vmovl_high_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vsubw_high_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubw_high_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 - vmovl_high_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vsubw_high_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtrn1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtrn1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vtrn1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai poly16x4_t vtrn1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vtrn1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  return __ret;
+}
+#else
+__ai poly8x16_t vtrn1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vtrn1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai poly64x2_t vtrn1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vtrn1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai poly16x8_t vtrn1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtrn1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtrn1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vtrn1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai uint32x4_t vtrn1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtrn1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtrn1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtrn1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtrn1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vtrn1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  return __ret;
+}
+#else
+__ai int8x16_t vtrn1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vtrn1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float64x2_t vtrn1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vtrn1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai float32x4_t vtrn1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vtrn1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai int32x4_t vtrn1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vtrn1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int64x2_t vtrn1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vtrn1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai int16x8_t vtrn1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtrn1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtrn1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vtrn1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint32x2_t vtrn1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtrn1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtrn1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtrn1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai int8x8_t vtrn1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vtrn1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float32x2_t vtrn1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vtrn1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vtrn1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vtrn1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai int16x4_t vtrn1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtrn2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtrn2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vtrn2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai poly16x4_t vtrn2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vtrn2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  return __ret;
+}
+#else
+__ai poly8x16_t vtrn2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vtrn2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai poly64x2_t vtrn2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vtrn2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai poly16x8_t vtrn2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtrn2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtrn2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vtrn2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai uint32x4_t vtrn2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtrn2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtrn2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtrn2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtrn2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vtrn2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  return __ret;
+}
+#else
+__ai int8x16_t vtrn2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vtrn2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float64x2_t vtrn2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vtrn2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai float32x4_t vtrn2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vtrn2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai int32x4_t vtrn2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vtrn2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int64x2_t vtrn2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vtrn2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai int16x8_t vtrn2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtrn2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtrn2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vtrn2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint32x2_t vtrn2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtrn2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtrn2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtrn2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai int8x8_t vtrn2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vtrn2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float32x2_t vtrn2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vtrn2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int32x2_t vtrn2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vtrn2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai int16x4_t vtrn2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vtst_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vtst_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtstq_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtstq_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtstq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtstq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtstq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtstq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vtst_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vtst_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vtst_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vtst_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vtstd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vtstd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vtstd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vtstd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vtstd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vtstd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vuzp1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai poly8x8_t vuzp1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vuzp1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai poly16x4_t vuzp1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vuzp1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  return __ret;
+}
+#else
+__ai poly8x16_t vuzp1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vuzp1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai poly64x2_t vuzp1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vuzp1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai poly16x8_t vuzp1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vuzp1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  return __ret;
+}
+#else
+__ai uint8x16_t vuzp1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vuzp1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai uint32x4_t vuzp1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vuzp1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vuzp1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vuzp1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai uint16x8_t vuzp1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vuzp1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  return __ret;
+}
+#else
+__ai int8x16_t vuzp1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vuzp1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float64x2_t vuzp1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vuzp1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai float32x4_t vuzp1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vuzp1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai int32x4_t vuzp1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vuzp1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int64x2_t vuzp1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vuzp1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai int16x8_t vuzp1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vuzp1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai uint8x8_t vuzp1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vuzp1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint32x2_t vuzp1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vuzp1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai uint16x4_t vuzp1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vuzp1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai int8x8_t vuzp1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vuzp1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float32x2_t vuzp1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vuzp1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vuzp1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vuzp1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai int16x4_t vuzp1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vuzp2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai poly8x8_t vuzp2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vuzp2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai poly16x4_t vuzp2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vuzp2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  return __ret;
+}
+#else
+__ai poly8x16_t vuzp2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vuzp2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai poly64x2_t vuzp2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vuzp2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai poly16x8_t vuzp2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vuzp2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  return __ret;
+}
+#else
+__ai uint8x16_t vuzp2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vuzp2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai uint32x4_t vuzp2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vuzp2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint64x2_t vuzp2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vuzp2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai uint16x8_t vuzp2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vuzp2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  return __ret;
+}
+#else
+__ai int8x16_t vuzp2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vuzp2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float64x2_t vuzp2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vuzp2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai float32x4_t vuzp2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vuzp2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai int32x4_t vuzp2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vuzp2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int64x2_t vuzp2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vuzp2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai int16x8_t vuzp2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vuzp2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai uint8x8_t vuzp2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vuzp2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint32x2_t vuzp2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vuzp2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai uint16x4_t vuzp2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vuzp2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai int8x8_t vuzp2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vuzp2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float32x2_t vuzp2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vuzp2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int32x2_t vuzp2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vuzp2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai int16x4_t vuzp2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vzip1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai poly8x8_t vzip1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vzip1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai poly16x4_t vzip1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vzip1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  return __ret;
+}
+#else
+__ai poly8x16_t vzip1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vzip1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai poly64x2_t vzip1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vzip1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai poly16x8_t vzip1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vzip1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  return __ret;
+}
+#else
+__ai uint8x16_t vzip1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vzip1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai uint32x4_t vzip1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vzip1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vzip1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vzip1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai uint16x8_t vzip1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vzip1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  return __ret;
+}
+#else
+__ai int8x16_t vzip1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vzip1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float64x2_t vzip1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vzip1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai float32x4_t vzip1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vzip1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai int32x4_t vzip1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vzip1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int64x2_t vzip1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vzip1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai int16x8_t vzip1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vzip1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai uint8x8_t vzip1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vzip1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint32x2_t vzip1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vzip1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai uint16x4_t vzip1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vzip1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai int8x8_t vzip1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vzip1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float32x2_t vzip1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vzip1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vzip1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vzip1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai int16x4_t vzip1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vzip2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai poly8x8_t vzip2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vzip2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai poly16x4_t vzip2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vzip2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  return __ret;
+}
+#else
+__ai poly8x16_t vzip2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vzip2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai poly64x2_t vzip2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vzip2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai poly16x8_t vzip2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vzip2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  return __ret;
+}
+#else
+__ai uint8x16_t vzip2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vzip2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai uint32x4_t vzip2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vzip2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint64x2_t vzip2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vzip2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai uint16x8_t vzip2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vzip2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  return __ret;
+}
+#else
+__ai int8x16_t vzip2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vzip2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float64x2_t vzip2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vzip2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai float32x4_t vzip2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vzip2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai int32x4_t vzip2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vzip2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int64x2_t vzip2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vzip2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai int16x8_t vzip2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vzip2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai uint8x8_t vzip2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vzip2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint32x2_t vzip2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vzip2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai uint16x4_t vzip2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vzip2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai int8x8_t vzip2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vzip2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float32x2_t vzip2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vzip2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int32x2_t vzip2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vzip2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai int16x4_t vzip2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vabaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = __p0 + vabdq_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint8x16_t vabaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 + __noswap_vabdq_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + vabdq_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vabaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vabdq_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + vabdq_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x8_t vabaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vabdq_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vabaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = __p0 + vabdq_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int8x16_t vabaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 + __noswap_vabdq_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + vabdq_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vabaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vabdq_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + vabdq_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x8_t vabaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vabdq_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vaba_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = __p0 + vabd_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint8x8_t vaba_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 + __noswap_vabd_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vaba_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 + vabd_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x2_t vaba_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 + __noswap_vabd_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vaba_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 + vabd_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x4_t vaba_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 + __noswap_vabd_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vaba_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = __p0 + vabd_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int8x8_t vaba_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 + __noswap_vabd_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vaba_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 + vabd_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x2_t vaba_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 + __noswap_vabd_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vaba_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 + vabd_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x4_t vaba_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 + __noswap_vabd_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabdl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(vmovl_u8((uint8x8_t)(vabd_u8(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai uint16x8_t vabdl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_u8(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vabdl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_u8(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vabdl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(vmovl_u32((uint32x2_t)(vabd_u32(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai uint64x2_t vabdl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_u32(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vabdl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_u32(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabdl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(vmovl_u16((uint16x4_t)(vabd_u16(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai uint32x4_t vabdl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_u16(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vabdl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_u16(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabdl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(vmovl_u8((uint8x8_t)(vabd_s8(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai int16x8_t vabdl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_s8(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vabdl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_s8(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabdl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(vmovl_u32((uint32x2_t)(vabd_s32(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai int64x2_t vabdl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_s32(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vabdl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_s32(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabdl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(vmovl_u16((uint16x4_t)(vabd_s16(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai int32x4_t vabdl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_s16(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vabdl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_s16(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmovl_u8(__p0) + vmovl_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmovl_u8(__rev0) + __noswap_vmovl_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmovl_u32(__p0) + vmovl_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmovl_u32(__rev0) + __noswap_vmovl_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmovl_u16(__p0) + vmovl_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmovl_u16(__rev0) + __noswap_vmovl_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = vmovl_s8(__p0) + vmovl_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vaddl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmovl_s8(__rev0) + __noswap_vmovl_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = vmovl_s32(__p0) + vmovl_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vaddl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmovl_s32(__rev0) + __noswap_vmovl_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = vmovl_s16(__p0) + vmovl_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vaddl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmovl_s16(__rev0) + __noswap_vmovl_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddw_u8(uint16x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 + vmovl_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddw_u8(uint16x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vmovl_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddw_u32(uint64x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 + vmovl_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddw_u32(uint64x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vmovl_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddw_u16(uint32x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 + vmovl_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddw_u16(uint32x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vmovl_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddw_s8(int16x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 + vmovl_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vaddw_s8(int16x8_t __p0, int8x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vmovl_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddw_s32(int64x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 + vmovl_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vaddw_s32(int64x2_t __p0, int32x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vmovl_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddw_s16(int32x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 + vmovl_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vaddw_s16(int32x4_t __p0, int16x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vmovl_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_f16(__p0_242, __p1_242) __extension__ ({ \
+  float16x4_t __s0_242 = __p0_242; \
+  float16_t __ret_242; \
+float16x4_t __reint_242 = __s0_242; \
+int16_t __reint1_242 = vget_lane_s16(*(int16x4_t *) &__reint_242, __p1_242); \
+  __ret_242 = *(float16_t *) &__reint1_242; \
+  __ret_242; \
+})
+#else
+#define vget_lane_f16(__p0_243, __p1_243) __extension__ ({ \
+  float16x4_t __s0_243 = __p0_243; \
+  float16x4_t __rev0_243;  __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 3, 2, 1, 0); \
+  float16_t __ret_243; \
+float16x4_t __reint_243 = __rev0_243; \
+int16_t __reint1_243 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_243, __p1_243); \
+  __ret_243 = *(float16_t *) &__reint1_243; \
+  __ret_243; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_f16(__p0_244, __p1_244) __extension__ ({ \
+  float16x8_t __s0_244 = __p0_244; \
+  float16_t __ret_244; \
+float16x8_t __reint_244 = __s0_244; \
+int16_t __reint1_244 = vgetq_lane_s16(*(int16x8_t *) &__reint_244, __p1_244); \
+  __ret_244 = *(float16_t *) &__reint1_244; \
+  __ret_244; \
+})
+#else
+#define vgetq_lane_f16(__p0_245, __p1_245) __extension__ ({ \
+  float16x8_t __s0_245 = __p0_245; \
+  float16x8_t __rev0_245;  __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret_245; \
+float16x8_t __reint_245 = __rev0_245; \
+int16_t __reint1_245 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_245, __p1_245); \
+  __ret_245 = *(float16_t *) &__reint1_245; \
+  __ret_245; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + vmull_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vmull_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vmlal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + __noswap_vmull_u8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + vmull_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vmull_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmlal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + __noswap_vmull_u32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + vmull_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vmull_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmlal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __noswap_vmull_u16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + vmull_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x8_t vmlal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vmull_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vmlal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + __noswap_vmull_s8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + vmull_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vmull_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + __noswap_vmull_s32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + vmull_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vmull_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __noswap_vmull_s16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 + vmull_u32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + vmull_u16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 + vmull_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + vmull_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlal_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlal_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vmull_u32(__rev1, (uint32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmlal_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + __noswap_vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlal_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlal_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vmull_u16(__rev1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmlal_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __noswap_vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + vmull_s32(__p1, (int32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai int64x2_t vmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vmull_s32(__rev1, (int32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + __noswap_vmull_s32(__p1, (int32x2_t) {__p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai int32x4_t vmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vmull_s16(__rev1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __noswap_vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlsl_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 - vmull_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlsl_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __noswap_vmull_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vmlsl_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 - __noswap_vmull_u8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlsl_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 - vmull_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlsl_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __noswap_vmull_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmlsl_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 - __noswap_vmull_u32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsl_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - vmull_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsl_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __noswap_vmull_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmlsl_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - __noswap_vmull_u16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlsl_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 - vmull_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x8_t vmlsl_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __noswap_vmull_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vmlsl_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 - __noswap_vmull_s8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 - vmull_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __noswap_vmull_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 - __noswap_vmull_s32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - vmull_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __noswap_vmull_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - __noswap_vmull_s16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 - vmull_u32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - vmull_u16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 - vmull_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - vmull_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlsl_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 - vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlsl_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __noswap_vmull_u32(__rev1, (uint32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmlsl_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 - __noswap_vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsl_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsl_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __noswap_vmull_u16(__rev1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmlsl_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - __noswap_vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 - vmull_s32(__p1, (int32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai int64x2_t vmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __noswap_vmull_s32(__rev1, (int32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 - __noswap_vmull_s32(__p1, (int32x2_t) {__p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __noswap_vmull_s16(__rev1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - __noswap_vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_f16(__p0_246, __p1_246, __p2_246) __extension__ ({ \
+  float16_t __s0_246 = __p0_246; \
+  float16x4_t __s1_246 = __p1_246; \
+  float16x4_t __ret_246; \
+float16_t __reint_246 = __s0_246; \
+float16x4_t __reint1_246 = __s1_246; \
+int16x4_t __reint2_246 = vset_lane_s16(*(int16_t *) &__reint_246, *(int16x4_t *) &__reint1_246, __p2_246); \
+  __ret_246 = *(float16x4_t *) &__reint2_246; \
+  __ret_246; \
+})
+#else
+#define vset_lane_f16(__p0_247, __p1_247, __p2_247) __extension__ ({ \
+  float16_t __s0_247 = __p0_247; \
+  float16x4_t __s1_247 = __p1_247; \
+  float16x4_t __rev1_247;  __rev1_247 = __builtin_shufflevector(__s1_247, __s1_247, 3, 2, 1, 0); \
+  float16x4_t __ret_247; \
+float16_t __reint_247 = __s0_247; \
+float16x4_t __reint1_247 = __rev1_247; \
+int16x4_t __reint2_247 = __noswap_vset_lane_s16(*(int16_t *) &__reint_247, *(int16x4_t *) &__reint1_247, __p2_247); \
+  __ret_247 = *(float16x4_t *) &__reint2_247; \
+  __ret_247 = __builtin_shufflevector(__ret_247, __ret_247, 3, 2, 1, 0); \
+  __ret_247; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_f16(__p0_248, __p1_248, __p2_248) __extension__ ({ \
+  float16_t __s0_248 = __p0_248; \
+  float16x8_t __s1_248 = __p1_248; \
+  float16x8_t __ret_248; \
+float16_t __reint_248 = __s0_248; \
+float16x8_t __reint1_248 = __s1_248; \
+int16x8_t __reint2_248 = vsetq_lane_s16(*(int16_t *) &__reint_248, *(int16x8_t *) &__reint1_248, __p2_248); \
+  __ret_248 = *(float16x8_t *) &__reint2_248; \
+  __ret_248; \
+})
+#else
+#define vsetq_lane_f16(__p0_249, __p1_249, __p2_249) __extension__ ({ \
+  float16_t __s0_249 = __p0_249; \
+  float16x8_t __s1_249 = __p1_249; \
+  float16x8_t __rev1_249;  __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret_249; \
+float16_t __reint_249 = __s0_249; \
+float16x8_t __reint1_249 = __rev1_249; \
+int16x8_t __reint2_249 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_249, *(int16x8_t *) &__reint1_249, __p2_249); \
+  __ret_249 = *(float16x8_t *) &__reint2_249; \
+  __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_249; \
+})
+#endif
+
+#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqrdmlahs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
+  int32_t __ret;
+  __ret = vqadds_s32(__p0, vqrdmulhs_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32_t vqrdmlahs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
+  int32_t __ret;
+  __ret = __noswap_vqadds_s32(__p0, __noswap_vqrdmulhs_s32(__p1, __p2));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqrdmlahh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
+  int16_t __ret;
+  __ret = vqaddh_s16(__p0, vqrdmulhh_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16_t vqrdmlahh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
+  int16_t __ret;
+  __ret = __noswap_vqaddh_s16(__p0, __noswap_vqrdmulhh_s16(__p1, __p2));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahs_lane_s32(__p0_250, __p1_250, __p2_250, __p3_250) __extension__ ({ \
+  int32_t __s0_250 = __p0_250; \
+  int32_t __s1_250 = __p1_250; \
+  int32x2_t __s2_250 = __p2_250; \
+  int32_t __ret_250; \
+  __ret_250 = vqadds_s32(__s0_250, vqrdmulhs_s32(__s1_250, vget_lane_s32(__s2_250, __p3_250))); \
+  __ret_250; \
+})
+#else
+#define vqrdmlahs_lane_s32(__p0_251, __p1_251, __p2_251, __p3_251) __extension__ ({ \
+  int32_t __s0_251 = __p0_251; \
+  int32_t __s1_251 = __p1_251; \
+  int32x2_t __s2_251 = __p2_251; \
+  int32x2_t __rev2_251;  __rev2_251 = __builtin_shufflevector(__s2_251, __s2_251, 1, 0); \
+  int32_t __ret_251; \
+  __ret_251 = __noswap_vqadds_s32(__s0_251, __noswap_vqrdmulhs_s32(__s1_251, __noswap_vget_lane_s32(__rev2_251, __p3_251))); \
+  __ret_251; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahh_lane_s16(__p0_252, __p1_252, __p2_252, __p3_252) __extension__ ({ \
+  int16_t __s0_252 = __p0_252; \
+  int16_t __s1_252 = __p1_252; \
+  int16x4_t __s2_252 = __p2_252; \
+  int16_t __ret_252; \
+  __ret_252 = vqaddh_s16(__s0_252, vqrdmulhh_s16(__s1_252, vget_lane_s16(__s2_252, __p3_252))); \
+  __ret_252; \
+})
+#else
+#define vqrdmlahh_lane_s16(__p0_253, __p1_253, __p2_253, __p3_253) __extension__ ({ \
+  int16_t __s0_253 = __p0_253; \
+  int16_t __s1_253 = __p1_253; \
+  int16x4_t __s2_253 = __p2_253; \
+  int16x4_t __rev2_253;  __rev2_253 = __builtin_shufflevector(__s2_253, __s2_253, 3, 2, 1, 0); \
+  int16_t __ret_253; \
+  __ret_253 = __noswap_vqaddh_s16(__s0_253, __noswap_vqrdmulhh_s16(__s1_253, __noswap_vget_lane_s16(__rev2_253, __p3_253))); \
+  __ret_253; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahs_laneq_s32(__p0_254, __p1_254, __p2_254, __p3_254) __extension__ ({ \
+  int32_t __s0_254 = __p0_254; \
+  int32_t __s1_254 = __p1_254; \
+  int32x4_t __s2_254 = __p2_254; \
+  int32_t __ret_254; \
+  __ret_254 = vqadds_s32(__s0_254, vqrdmulhs_s32(__s1_254, vgetq_lane_s32(__s2_254, __p3_254))); \
+  __ret_254; \
+})
+#else
+#define vqrdmlahs_laneq_s32(__p0_255, __p1_255, __p2_255, __p3_255) __extension__ ({ \
+  int32_t __s0_255 = __p0_255; \
+  int32_t __s1_255 = __p1_255; \
+  int32x4_t __s2_255 = __p2_255; \
+  int32x4_t __rev2_255;  __rev2_255 = __builtin_shufflevector(__s2_255, __s2_255, 3, 2, 1, 0); \
+  int32_t __ret_255; \
+  __ret_255 = __noswap_vqadds_s32(__s0_255, __noswap_vqrdmulhs_s32(__s1_255, __noswap_vgetq_lane_s32(__rev2_255, __p3_255))); \
+  __ret_255; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahh_laneq_s16(__p0_256, __p1_256, __p2_256, __p3_256) __extension__ ({ \
+  int16_t __s0_256 = __p0_256; \
+  int16_t __s1_256 = __p1_256; \
+  int16x8_t __s2_256 = __p2_256; \
+  int16_t __ret_256; \
+  __ret_256 = vqaddh_s16(__s0_256, vqrdmulhh_s16(__s1_256, vgetq_lane_s16(__s2_256, __p3_256))); \
+  __ret_256; \
+})
+#else
+#define vqrdmlahh_laneq_s16(__p0_257, __p1_257, __p2_257, __p3_257) __extension__ ({ \
+  int16_t __s0_257 = __p0_257; \
+  int16_t __s1_257 = __p1_257; \
+  int16x8_t __s2_257 = __p2_257; \
+  int16x8_t __rev2_257;  __rev2_257 = __builtin_shufflevector(__s2_257, __s2_257, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_257; \
+  __ret_257 = __noswap_vqaddh_s16(__s0_257, __noswap_vqrdmulhh_s16(__s1_257, __noswap_vgetq_lane_s16(__rev2_257, __p3_257))); \
+  __ret_257; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqrdmlshs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
+  int32_t __ret;
+  __ret = vqsubs_s32(__p0, vqrdmulhs_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32_t vqrdmlshs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
+  int32_t __ret;
+  __ret = __noswap_vqsubs_s32(__p0, __noswap_vqrdmulhs_s32(__p1, __p2));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqrdmlshh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
+  int16_t __ret;
+  __ret = vqsubh_s16(__p0, vqrdmulhh_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16_t vqrdmlshh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
+  int16_t __ret;
+  __ret = __noswap_vqsubh_s16(__p0, __noswap_vqrdmulhh_s16(__p1, __p2));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshs_lane_s32(__p0_258, __p1_258, __p2_258, __p3_258) __extension__ ({ \
+  int32_t __s0_258 = __p0_258; \
+  int32_t __s1_258 = __p1_258; \
+  int32x2_t __s2_258 = __p2_258; \
+  int32_t __ret_258; \
+  __ret_258 = vqsubs_s32(__s0_258, vqrdmulhs_s32(__s1_258, vget_lane_s32(__s2_258, __p3_258))); \
+  __ret_258; \
+})
+#else
+#define vqrdmlshs_lane_s32(__p0_259, __p1_259, __p2_259, __p3_259) __extension__ ({ \
+  int32_t __s0_259 = __p0_259; \
+  int32_t __s1_259 = __p1_259; \
+  int32x2_t __s2_259 = __p2_259; \
+  int32x2_t __rev2_259;  __rev2_259 = __builtin_shufflevector(__s2_259, __s2_259, 1, 0); \
+  int32_t __ret_259; \
+  __ret_259 = __noswap_vqsubs_s32(__s0_259, __noswap_vqrdmulhs_s32(__s1_259, __noswap_vget_lane_s32(__rev2_259, __p3_259))); \
+  __ret_259; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshh_lane_s16(__p0_260, __p1_260, __p2_260, __p3_260) __extension__ ({ \
+  int16_t __s0_260 = __p0_260; \
+  int16_t __s1_260 = __p1_260; \
+  int16x4_t __s2_260 = __p2_260; \
+  int16_t __ret_260; \
+  __ret_260 = vqsubh_s16(__s0_260, vqrdmulhh_s16(__s1_260, vget_lane_s16(__s2_260, __p3_260))); \
+  __ret_260; \
+})
+#else
+#define vqrdmlshh_lane_s16(__p0_261, __p1_261, __p2_261, __p3_261) __extension__ ({ \
+  int16_t __s0_261 = __p0_261; \
+  int16_t __s1_261 = __p1_261; \
+  int16x4_t __s2_261 = __p2_261; \
+  int16x4_t __rev2_261;  __rev2_261 = __builtin_shufflevector(__s2_261, __s2_261, 3, 2, 1, 0); \
+  int16_t __ret_261; \
+  __ret_261 = __noswap_vqsubh_s16(__s0_261, __noswap_vqrdmulhh_s16(__s1_261, __noswap_vget_lane_s16(__rev2_261, __p3_261))); \
+  __ret_261; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshs_laneq_s32(__p0_262, __p1_262, __p2_262, __p3_262) __extension__ ({ \
+  int32_t __s0_262 = __p0_262; \
+  int32_t __s1_262 = __p1_262; \
+  int32x4_t __s2_262 = __p2_262; \
+  int32_t __ret_262; \
+  __ret_262 = vqsubs_s32(__s0_262, vqrdmulhs_s32(__s1_262, vgetq_lane_s32(__s2_262, __p3_262))); \
+  __ret_262; \
+})
+#else
+#define vqrdmlshs_laneq_s32(__p0_263, __p1_263, __p2_263, __p3_263) __extension__ ({ \
+  int32_t __s0_263 = __p0_263; \
+  int32_t __s1_263 = __p1_263; \
+  int32x4_t __s2_263 = __p2_263; \
+  int32x4_t __rev2_263;  __rev2_263 = __builtin_shufflevector(__s2_263, __s2_263, 3, 2, 1, 0); \
+  int32_t __ret_263; \
+  __ret_263 = __noswap_vqsubs_s32(__s0_263, __noswap_vqrdmulhs_s32(__s1_263, __noswap_vgetq_lane_s32(__rev2_263, __p3_263))); \
+  __ret_263; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshh_laneq_s16(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \
+  int16_t __s0_264 = __p0_264; \
+  int16_t __s1_264 = __p1_264; \
+  int16x8_t __s2_264 = __p2_264; \
+  int16_t __ret_264; \
+  __ret_264 = vqsubh_s16(__s0_264, vqrdmulhh_s16(__s1_264, vgetq_lane_s16(__s2_264, __p3_264))); \
+  __ret_264; \
+})
+#else
+#define vqrdmlshh_laneq_s16(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \
+  int16_t __s0_265 = __p0_265; \
+  int16_t __s1_265 = __p1_265; \
+  int16x8_t __s2_265 = __p2_265; \
+  int16x8_t __rev2_265;  __rev2_265 = __builtin_shufflevector(__s2_265, __s2_265, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_265; \
+  __ret_265 = __noswap_vqsubh_s16(__s0_265, __noswap_vqrdmulhh_s16(__s1_265, __noswap_vgetq_lane_s16(__rev2_265, __p3_265))); \
+  __ret_265; \
+})
+#endif
+
+#endif
+#if defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabdl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = vabdl_u8(vget_high_u8(__p0), vget_high_u8(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vabdl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vabdl_u8(__noswap_vget_high_u8(__rev0), __noswap_vget_high_u8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vabdl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = vabdl_u32(vget_high_u32(__p0), vget_high_u32(__p1));
+  return __ret;
+}
+#else
+__ai uint64x2_t vabdl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vabdl_u32(__noswap_vget_high_u32(__rev0), __noswap_vget_high_u32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabdl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = vabdl_u16(vget_high_u16(__p0), vget_high_u16(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vabdl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vabdl_u16(__noswap_vget_high_u16(__rev0), __noswap_vget_high_u16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabdl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = vabdl_s8(vget_high_s8(__p0), vget_high_s8(__p1));
+  return __ret;
+}
+#else
+__ai int16x8_t vabdl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vabdl_s8(__noswap_vget_high_s8(__rev0), __noswap_vget_high_s8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabdl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vabdl_s32(vget_high_s32(__p0), vget_high_s32(__p1));
+  return __ret;
+}
+#else
+__ai int64x2_t vabdl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vabdl_s32(__noswap_vget_high_s32(__rev0), __noswap_vget_high_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabdl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vabdl_s16(vget_high_s16(__p0), vget_high_s16(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vabdl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vabdl_s16(__noswap_vget_high_s16(__rev0), __noswap_vget_high_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmovl_high_u8(__p0) + vmovl_high_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmovl_high_u8(__rev0) + __noswap_vmovl_high_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmovl_high_u32(__p0) + vmovl_high_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmovl_high_u32(__rev0) + __noswap_vmovl_high_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmovl_high_u16(__p0) + vmovl_high_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmovl_high_u16(__rev0) + __noswap_vmovl_high_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = vmovl_high_s8(__p0) + vmovl_high_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vaddl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmovl_high_s8(__rev0) + __noswap_vmovl_high_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vmovl_high_s32(__p0) + vmovl_high_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vaddl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmovl_high_s32(__rev0) + __noswap_vmovl_high_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vmovl_high_s16(__p0) + vmovl_high_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vaddl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmovl_high_s16(__rev0) + __noswap_vmovl_high_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 + vmovl_high_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 + vmovl_high_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 + vmovl_high_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddw_high_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 + vmovl_high_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vaddw_high_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddw_high_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 + vmovl_high_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vaddw_high_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddw_high_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 + vmovl_high_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vaddw_high_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_p64(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \
+  poly64x2_t __s0_266 = __p0_266; \
+  poly64x1_t __s2_266 = __p2_266; \
+  poly64x2_t __ret_266; \
+  __ret_266 = vsetq_lane_p64(vget_lane_p64(__s2_266, __p3_266), __s0_266, __p1_266); \
+  __ret_266; \
+})
+#else
+#define vcopyq_lane_p64(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \
+  poly64x2_t __s0_267 = __p0_267; \
+  poly64x1_t __s2_267 = __p2_267; \
+  poly64x2_t __rev0_267;  __rev0_267 = __builtin_shufflevector(__s0_267, __s0_267, 1, 0); \
+  poly64x2_t __ret_267; \
+  __ret_267 = __noswap_vsetq_lane_p64(__noswap_vget_lane_p64(__s2_267, __p3_267), __rev0_267, __p1_267); \
+  __ret_267 = __builtin_shufflevector(__ret_267, __ret_267, 1, 0); \
+  __ret_267; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_f64(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \
+  float64x2_t __s0_268 = __p0_268; \
+  float64x1_t __s2_268 = __p2_268; \
+  float64x2_t __ret_268; \
+  __ret_268 = vsetq_lane_f64(vget_lane_f64(__s2_268, __p3_268), __s0_268, __p1_268); \
+  __ret_268; \
+})
+#else
+#define vcopyq_lane_f64(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \
+  float64x2_t __s0_269 = __p0_269; \
+  float64x1_t __s2_269 = __p2_269; \
+  float64x2_t __rev0_269;  __rev0_269 = __builtin_shufflevector(__s0_269, __s0_269, 1, 0); \
+  float64x2_t __ret_269; \
+  __ret_269 = __noswap_vsetq_lane_f64(__noswap_vget_lane_f64(__s2_269, __p3_269), __rev0_269, __p1_269); \
+  __ret_269 = __builtin_shufflevector(__ret_269, __ret_269, 1, 0); \
+  __ret_269; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_p64(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \
+  poly64x1_t __s0_270 = __p0_270; \
+  poly64x1_t __s2_270 = __p2_270; \
+  poly64x1_t __ret_270; \
+  __ret_270 = vset_lane_p64(vget_lane_p64(__s2_270, __p3_270), __s0_270, __p1_270); \
+  __ret_270; \
+})
+#else
+#define vcopy_lane_p64(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \
+  poly64x1_t __s0_271 = __p0_271; \
+  poly64x1_t __s2_271 = __p2_271; \
+  poly64x1_t __ret_271; \
+  __ret_271 = __noswap_vset_lane_p64(__noswap_vget_lane_p64(__s2_271, __p3_271), __s0_271, __p1_271); \
+  __ret_271; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_f64(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \
+  float64x1_t __s0_272 = __p0_272; \
+  float64x1_t __s2_272 = __p2_272; \
+  float64x1_t __ret_272; \
+  __ret_272 = vset_lane_f64(vget_lane_f64(__s2_272, __p3_272), __s0_272, __p1_272); \
+  __ret_272; \
+})
+#else
+#define vcopy_lane_f64(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \
+  float64x1_t __s0_273 = __p0_273; \
+  float64x1_t __s2_273 = __p2_273; \
+  float64x1_t __ret_273; \
+  __ret_273 = __noswap_vset_lane_f64(__noswap_vget_lane_f64(__s2_273, __p3_273), __s0_273, __p1_273); \
+  __ret_273; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_p64(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \
+  poly64x2_t __s0_274 = __p0_274; \
+  poly64x2_t __s2_274 = __p2_274; \
+  poly64x2_t __ret_274; \
+  __ret_274 = vsetq_lane_p64(vgetq_lane_p64(__s2_274, __p3_274), __s0_274, __p1_274); \
+  __ret_274; \
+})
+#else
+#define vcopyq_laneq_p64(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \
+  poly64x2_t __s0_275 = __p0_275; \
+  poly64x2_t __s2_275 = __p2_275; \
+  poly64x2_t __rev0_275;  __rev0_275 = __builtin_shufflevector(__s0_275, __s0_275, 1, 0); \
+  poly64x2_t __rev2_275;  __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 1, 0); \
+  poly64x2_t __ret_275; \
+  __ret_275 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_275, __p3_275), __rev0_275, __p1_275); \
+  __ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 1, 0); \
+  __ret_275; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_f64(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \
+  float64x2_t __s0_276 = __p0_276; \
+  float64x2_t __s2_276 = __p2_276; \
+  float64x2_t __ret_276; \
+  __ret_276 = vsetq_lane_f64(vgetq_lane_f64(__s2_276, __p3_276), __s0_276, __p1_276); \
+  __ret_276; \
+})
+#else
+#define vcopyq_laneq_f64(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \
+  float64x2_t __s0_277 = __p0_277; \
+  float64x2_t __s2_277 = __p2_277; \
+  float64x2_t __rev0_277;  __rev0_277 = __builtin_shufflevector(__s0_277, __s0_277, 1, 0); \
+  float64x2_t __rev2_277;  __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 1, 0); \
+  float64x2_t __ret_277; \
+  __ret_277 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_277, __p3_277), __rev0_277, __p1_277); \
+  __ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 1, 0); \
+  __ret_277; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_p64(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \
+  poly64x1_t __s0_278 = __p0_278; \
+  poly64x2_t __s2_278 = __p2_278; \
+  poly64x1_t __ret_278; \
+  __ret_278 = vset_lane_p64(vgetq_lane_p64(__s2_278, __p3_278), __s0_278, __p1_278); \
+  __ret_278; \
+})
+#else
+#define vcopy_laneq_p64(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \
+  poly64x1_t __s0_279 = __p0_279; \
+  poly64x2_t __s2_279 = __p2_279; \
+  poly64x2_t __rev2_279;  __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 1, 0); \
+  poly64x1_t __ret_279; \
+  __ret_279 = __noswap_vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_279, __p3_279), __s0_279, __p1_279); \
+  __ret_279; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_f64(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \
+  float64x1_t __s0_280 = __p0_280; \
+  float64x2_t __s2_280 = __p2_280; \
+  float64x1_t __ret_280; \
+  __ret_280 = vset_lane_f64(vgetq_lane_f64(__s2_280, __p3_280), __s0_280, __p1_280); \
+  __ret_280; \
+})
+#else
+#define vcopy_laneq_f64(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \
+  float64x1_t __s0_281 = __p0_281; \
+  float64x2_t __s2_281 = __p2_281; \
+  float64x2_t __rev2_281;  __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 1, 0); \
+  float64x1_t __ret_281; \
+  __ret_281 = __noswap_vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_281, __p3_281), __s0_281, __p1_281); \
+  __ret_281; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __ret;
+  __ret = vmlal_u8(__p0, vget_high_u8(__p1), vget_high_u8(__p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmlal_u8(__rev0, __noswap_vget_high_u8(__rev1), __noswap_vget_high_u8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __ret;
+  __ret = vmlal_u32(__p0, vget_high_u32(__p1), vget_high_u32(__p2));
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmlal_u32(__rev0, __noswap_vget_high_u32(__rev1), __noswap_vget_high_u32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __ret;
+  __ret = vmlal_u16(__p0, vget_high_u16(__p1), vget_high_u16(__p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmlal_u16(__rev0, __noswap_vget_high_u16(__rev1), __noswap_vget_high_u16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __ret;
+  __ret = vmlal_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vmlal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmlal_s8(__rev0, __noswap_vget_high_s8(__rev1), __noswap_vget_high_s8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vmlal_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vmlal_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlal_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = vmlal_n_u32(__p0, vget_high_u32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlal_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmlal_n_u32(__rev0, __noswap_vget_high_u32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlal_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = vmlal_n_u16(__p0, vget_high_u16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlal_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmlal_n_u16(__rev0, __noswap_vget_high_u16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = vmlal_n_s32(__p0, vget_high_s32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmlal_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = vmlal_n_s16(__p0, vget_high_s16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmlal_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlsl_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __ret;
+  __ret = vmlsl_u8(__p0, vget_high_u8(__p1), vget_high_u8(__p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlsl_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmlsl_u8(__rev0, __noswap_vget_high_u8(__rev1), __noswap_vget_high_u8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlsl_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __ret;
+  __ret = vmlsl_u32(__p0, vget_high_u32(__p1), vget_high_u32(__p2));
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlsl_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmlsl_u32(__rev0, __noswap_vget_high_u32(__rev1), __noswap_vget_high_u32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsl_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __ret;
+  __ret = vmlsl_u16(__p0, vget_high_u16(__p1), vget_high_u16(__p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsl_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmlsl_u16(__rev0, __noswap_vget_high_u16(__rev1), __noswap_vget_high_u16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlsl_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __ret;
+  __ret = vmlsl_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vmlsl_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmlsl_s8(__rev0, __noswap_vget_high_s8(__rev1), __noswap_vget_high_s8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vmlsl_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vmlsl_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlsl_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = vmlsl_n_u32(__p0, vget_high_u32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlsl_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmlsl_n_u32(__rev0, __noswap_vget_high_u32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsl_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = vmlsl_n_u16(__p0, vget_high_u16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsl_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmlsl_n_u16(__rev0, __noswap_vget_high_u16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = vmlsl_n_s32(__p0, vget_high_s32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmlsl_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = vmlsl_n_s16(__p0, vget_high_s16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmlsl_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_lane_f64(__p0_282, __p1_282, __p2_282) __extension__ ({ \
+  float64x1_t __s0_282 = __p0_282; \
+  float64x1_t __s1_282 = __p1_282; \
+  float64x1_t __ret_282; \
+  float64_t __x_282 = vget_lane_f64(__s0_282, 0); \
+  float64_t __y_282 = vget_lane_f64(__s1_282, __p2_282); \
+  float64_t __z_282 = vmulxd_f64(__x_282, __y_282); \
+  __ret_282 = vset_lane_f64(__z_282, __s0_282, __p2_282); \
+  __ret_282; \
+})
+#else
+#define vmulx_lane_f64(__p0_283, __p1_283, __p2_283) __extension__ ({ \
+  float64x1_t __s0_283 = __p0_283; \
+  float64x1_t __s1_283 = __p1_283; \
+  float64x1_t __ret_283; \
+  float64_t __x_283 = __noswap_vget_lane_f64(__s0_283, 0); \
+  float64_t __y_283 = __noswap_vget_lane_f64(__s1_283, __p2_283); \
+  float64_t __z_283 = __noswap_vmulxd_f64(__x_283, __y_283); \
+  __ret_283 = __noswap_vset_lane_f64(__z_283, __s0_283, __p2_283); \
+  __ret_283; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_laneq_f64(__p0_284, __p1_284, __p2_284) __extension__ ({ \
+  float64x1_t __s0_284 = __p0_284; \
+  float64x2_t __s1_284 = __p1_284; \
+  float64x1_t __ret_284; \
+  float64_t __x_284 = vget_lane_f64(__s0_284, 0); \
+  float64_t __y_284 = vgetq_lane_f64(__s1_284, __p2_284); \
+  float64_t __z_284 = vmulxd_f64(__x_284, __y_284); \
+  __ret_284 = vset_lane_f64(__z_284, __s0_284, 0); \
+  __ret_284; \
+})
+#else
+#define vmulx_laneq_f64(__p0_285, __p1_285, __p2_285) __extension__ ({ \
+  float64x1_t __s0_285 = __p0_285; \
+  float64x2_t __s1_285 = __p1_285; \
+  float64x2_t __rev1_285;  __rev1_285 = __builtin_shufflevector(__s1_285, __s1_285, 1, 0); \
+  float64x1_t __ret_285; \
+  float64_t __x_285 = __noswap_vget_lane_f64(__s0_285, 0); \
+  float64_t __y_285 = __noswap_vgetq_lane_f64(__rev1_285, __p2_285); \
+  float64_t __z_285 = __noswap_vmulxd_f64(__x_285, __y_285); \
+  __ret_285 = __noswap_vset_lane_f64(__z_285, __s0_285, 0); \
+  __ret_285; \
+})
+#endif
+
+#endif
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + vabdl_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x8_t vabal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vabdl_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vabal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + __noswap_vabdl_u8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vabal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + vabdl_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vabal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vabdl_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vabal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + __noswap_vabdl_u32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + vabdl_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vabal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vabdl_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vabal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __noswap_vabdl_u16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + vabdl_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x8_t vabal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vabdl_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vabal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + __noswap_vabdl_s8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + vabdl_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vabal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vabdl_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vabal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + __noswap_vabdl_s32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + vabdl_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vabal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vabdl_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vabal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __noswap_vabdl_s16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#if defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __ret;
+  __ret = vabal_u8(__p0, vget_high_u8(__p1), vget_high_u8(__p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vabal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vabal_u8(__rev0, __noswap_vget_high_u8(__rev1), __noswap_vget_high_u8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vabal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __ret;
+  __ret = vabal_u32(__p0, vget_high_u32(__p1), vget_high_u32(__p2));
+  return __ret;
+}
+#else
+__ai uint64x2_t vabal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vabal_u32(__rev0, __noswap_vget_high_u32(__rev1), __noswap_vget_high_u32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __ret;
+  __ret = vabal_u16(__p0, vget_high_u16(__p1), vget_high_u16(__p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vabal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vabal_u16(__rev0, __noswap_vget_high_u16(__rev1), __noswap_vget_high_u16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __ret;
+  __ret = vabal_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vabal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vabal_s8(__rev0, __noswap_vget_high_s8(__rev1), __noswap_vget_high_s8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vabal_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vabal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vabal_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vabal_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vabal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vabal_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+
+#undef __ai
+
+#endif /* __ARM_NEON_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/armintr.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/armintr.h
new file mode 100644
index 000000000..933afcbb9
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/armintr.h
@@ -0,0 +1,45 @@
+/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <armintr.h>
+#else
+
+#ifndef __ARMINTR_H
+#define __ARMINTR_H
+
+typedef enum
+{
+  _ARM_BARRIER_SY    = 0xF,
+  _ARM_BARRIER_ST    = 0xE,
+  _ARM_BARRIER_ISH   = 0xB,
+  _ARM_BARRIER_ISHST = 0xA,
+  _ARM_BARRIER_NSH   = 0x7,
+  _ARM_BARRIER_NSHST = 0x6,
+  _ARM_BARRIER_OSH   = 0x3,
+  _ARM_BARRIER_OSHST = 0x2
+} _ARMINTR_BARRIER_TYPE;
+
+#endif /* __ARMINTR_H */
+#endif /* _MSC_VER */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx2intrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx2intrin.h
new file mode 100644
index 000000000..13bcbef4d
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx2intrin.h
@@ -0,0 +1,1299 @@
+/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX2INTRIN_H
+#define __AVX2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx2")))
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm256_mpsadbw_epu8(X, Y, M) \
+  (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
+                                     (__v32qi)(__m256i)(Y), (int)(M))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi8(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi16(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi32(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packs_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packus_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packus_epi32(__m256i __V1, __m256i __V2)
+{
+  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qu)__a + (__v32qu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hu)__a + (__v16hu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8su)__a + (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a + (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({        \
+  (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+                                     (__v32qi)(__m256i)(b), (n)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_and_si256(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a & (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_andnot_si256(__m256i __a, __m256i __b)
+{
+  return (__m256i)(~(__v4du)__a & (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_avg_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_avg_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
+                                              (__v32qi)__M);
+}
+
+#define _mm256_blend_epi16(V1, V2, M) __extension__ ({       \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1),   \
+                                   (__v16hi)(__m256i)(V2),   \
+                                   (((M) & 0x01) ? 16 : 0),  \
+                                   (((M) & 0x02) ? 17 : 1),  \
+                                   (((M) & 0x04) ? 18 : 2),  \
+                                   (((M) & 0x08) ? 19 : 3),  \
+                                   (((M) & 0x10) ? 20 : 4),  \
+                                   (((M) & 0x20) ? 21 : 5),  \
+                                   (((M) & 0x40) ? 22 : 6),  \
+                                   (((M) & 0x80) ? 23 : 7),  \
+                                   (((M) & 0x01) ? 24 : 8),  \
+                                   (((M) & 0x02) ? 25 : 9),  \
+                                   (((M) & 0x04) ? 26 : 10), \
+                                   (((M) & 0x08) ? 27 : 11), \
+                                   (((M) & 0x10) ? 28 : 12), \
+                                   (((M) & 0x20) ? 29 : 13), \
+                                   (((M) & 0x40) ? 30 : 14), \
+                                   (((M) & 0x80) ? 31 : 15)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a == (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a == (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a == (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4di)__a == (__v4di)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
+{
+  /* This function always performs a signed comparison, but __v32qi is a char
+     which may be signed or unsigned, so use __v32qs. */
+  return (__m256i)((__v32qs)__a > (__v32qs)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a > (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a > (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4di)__a > (__v4di)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadd_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadd_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadds_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsub_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsub_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maddubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm256_movemask_epi8(__m256i __a)
+{
+  return __builtin_ia32_pmovmskb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi16(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi32(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi64(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
+}
+
+static __inline__  __m256i __DEFAULT_FN_ATTRS
+_mm256_mul_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhi_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hu)__a * (__v16hu)__b);
+}
+
+static __inline__  __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi32 (__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8su)__a * (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mul_epu32(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_or_si256(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a | (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sad_epu8(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shuffle_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
+}
+
+#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
+                                   (__v8si)_mm256_undefined_si256(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
+                                   4 + (((imm) >> 0) & 0x3), \
+                                   4 + (((imm) >> 2) & 0x3), \
+                                   4 + (((imm) >> 4) & 0x3), \
+                                   4 + (((imm) >> 6) & 0x3)); })
+
+#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_undefined_si256(), \
+                                   0, 1, 2, 3, \
+                                   4  + (((imm) >> 0) & 0x3), \
+                                   4  + (((imm) >> 2) & 0x3), \
+                                   4  + (((imm) >> 4) & 0x3), \
+                                   4  + (((imm) >> 6) & 0x3), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) >> 0) & 0x3), \
+                                   12 + (((imm) >> 2) & 0x3), \
+                                   12 + (((imm) >> 4) & 0x3), \
+                                   12 + (((imm) >> 6) & 0x3)); })
+
+#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_undefined_si256(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) >> 0) & 0x3), \
+                                   8 + (((imm) >> 2) & 0x3), \
+                                   8 + (((imm) >> 4) & 0x3), \
+                                   8 + (((imm) >> 6) & 0x3), \
+                                   12, 13, 14, 15); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi8(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_slli_si256(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector(                                          \
+        (__v32qi)_mm256_setzero_si256(),                                     \
+        (__v32qi)(__m256i)(a),                                               \
+        ((char)(imm)&0xF0) ?  0 : ((char)(imm)>0x0 ? 16 : 32) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  1 : ((char)(imm)>0x1 ? 17 : 33) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  2 : ((char)(imm)>0x2 ? 18 : 34) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  3 : ((char)(imm)>0x3 ? 19 : 35) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  4 : ((char)(imm)>0x4 ? 20 : 36) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  5 : ((char)(imm)>0x5 ? 21 : 37) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  6 : ((char)(imm)>0x6 ? 22 : 38) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  7 : ((char)(imm)>0x7 ? 23 : 39) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  8 : ((char)(imm)>0x8 ? 24 : 40) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  9 : ((char)(imm)>0x9 ? 25 : 41) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 : 42) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 : 43) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 : 44) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 : 45) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 : 46) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 : 47) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 : 48) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 : 49) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 : 50) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 : 51) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 : 52) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 : 53) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 : 54) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 : 55) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 : 56) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 : 57) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 : 58) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 : 59) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 : 60) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 : 61) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 : 62) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 : 63) - (char)(imm)); })
+
+#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psllqi256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
+}
+
+#define _mm256_srli_si256(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector(                                           \
+        (__v32qi)(__m256i)(a),                                               \
+        (__v32qi)_mm256_setzero_si256(),                                     \
+        ((char)(imm)&0xF0) ? 32 : (char)(imm) + ((char)(imm)>0xF ? 16 : 0),  \
+        ((char)(imm)&0xF0) ? 33 : (char)(imm) + ((char)(imm)>0xE ? 17 : 1),  \
+        ((char)(imm)&0xF0) ? 34 : (char)(imm) + ((char)(imm)>0xD ? 18 : 2),  \
+        ((char)(imm)&0xF0) ? 35 : (char)(imm) + ((char)(imm)>0xC ? 19 : 3),  \
+        ((char)(imm)&0xF0) ? 36 : (char)(imm) + ((char)(imm)>0xB ? 20 : 4),  \
+        ((char)(imm)&0xF0) ? 37 : (char)(imm) + ((char)(imm)>0xA ? 21 : 5),  \
+        ((char)(imm)&0xF0) ? 38 : (char)(imm) + ((char)(imm)>0x9 ? 22 : 6),  \
+        ((char)(imm)&0xF0) ? 39 : (char)(imm) + ((char)(imm)>0x8 ? 23 : 7),  \
+        ((char)(imm)&0xF0) ? 40 : (char)(imm) + ((char)(imm)>0x7 ? 24 : 8),  \
+        ((char)(imm)&0xF0) ? 41 : (char)(imm) + ((char)(imm)>0x6 ? 25 : 9),  \
+        ((char)(imm)&0xF0) ? 42 : (char)(imm) + ((char)(imm)>0x5 ? 26 : 10), \
+        ((char)(imm)&0xF0) ? 43 : (char)(imm) + ((char)(imm)>0x4 ? 27 : 11), \
+        ((char)(imm)&0xF0) ? 44 : (char)(imm) + ((char)(imm)>0x3 ? 28 : 12), \
+        ((char)(imm)&0xF0) ? 45 : (char)(imm) + ((char)(imm)>0x2 ? 29 : 13), \
+        ((char)(imm)&0xF0) ? 46 : (char)(imm) + ((char)(imm)>0x1 ? 30 : 14), \
+        ((char)(imm)&0xF0) ? 47 : (char)(imm) + ((char)(imm)>0x0 ? 31 : 15), \
+        ((char)(imm)&0xF0) ? 48 : (char)(imm) + ((char)(imm)>0xF ? 32 : 16), \
+        ((char)(imm)&0xF0) ? 49 : (char)(imm) + ((char)(imm)>0xE ? 33 : 17), \
+        ((char)(imm)&0xF0) ? 50 : (char)(imm) + ((char)(imm)>0xD ? 34 : 18), \
+        ((char)(imm)&0xF0) ? 51 : (char)(imm) + ((char)(imm)>0xC ? 35 : 19), \
+        ((char)(imm)&0xF0) ? 52 : (char)(imm) + ((char)(imm)>0xB ? 36 : 20), \
+        ((char)(imm)&0xF0) ? 53 : (char)(imm) + ((char)(imm)>0xA ? 37 : 21), \
+        ((char)(imm)&0xF0) ? 54 : (char)(imm) + ((char)(imm)>0x9 ? 38 : 22), \
+        ((char)(imm)&0xF0) ? 55 : (char)(imm) + ((char)(imm)>0x8 ? 39 : 23), \
+        ((char)(imm)&0xF0) ? 56 : (char)(imm) + ((char)(imm)>0x7 ? 40 : 24), \
+        ((char)(imm)&0xF0) ? 57 : (char)(imm) + ((char)(imm)>0x6 ? 41 : 25), \
+        ((char)(imm)&0xF0) ? 58 : (char)(imm) + ((char)(imm)>0x5 ? 42 : 26), \
+        ((char)(imm)&0xF0) ? 59 : (char)(imm) + ((char)(imm)>0x4 ? 43 : 27), \
+        ((char)(imm)&0xF0) ? 60 : (char)(imm) + ((char)(imm)>0x3 ? 44 : 28), \
+        ((char)(imm)&0xF0) ? 61 : (char)(imm) + ((char)(imm)>0x2 ? 45 : 29), \
+        ((char)(imm)&0xF0) ? 62 : (char)(imm) + ((char)(imm)>0x1 ? 46 : 30), \
+        ((char)(imm)&0xF0) ? 63 : (char)(imm) + ((char)(imm)>0x0 ? 47 : 31)); })
+
+#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psrlqi256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qu)__a - (__v32qu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hu)__a - (__v16hu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8su)__a - (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a - (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_xor_si256(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a ^ (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_stream_load_si256(__m256i const *__V)
+{
+  return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_broadcastss_ps(__m128 __X)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_broadcastsd_pd(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcastss_ps(__m128 __X)
+{
+  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcastsd_pd(__m128d __X)
+{
+  return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastsi128_si256(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
+}
+
+#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1),  \
+                                   (__v4si)(__m128i)(V2),  \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1),   \
+                                   (__v8si)(__m256i)(V2),   \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastb_epi8(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastw_epi16(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastd_epi32(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastq_epi64(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastb_epi8(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastw_epi16(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastd_epi32(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastq_epi64(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   ((M) >> 0) & 0x3, \
+                                   ((M) >> 2) & 0x3, \
+                                   ((M) >> 4) & 0x3, \
+                                   ((M) >> 6) & 0x3); })
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
+{
+  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   ((M) >> 0) & 0x3, \
+                                   ((M) >> 2) & 0x3, \
+                                   ((M) >> 4) & 0x3, \
+                                   ((M) >> 6) & 0x3); })
+
+#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })
+
+#define _mm256_extracti128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   (((M) & 1) ? 2 : 0), \
+                                   (((M) & 1) ? 3 : 1) ); })
+
+#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
+                                   (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+                                   (((M) & 1) ? 0 : 4), \
+                                   (((M) & 1) ? 1 : 5), \
+                                   (((M) & 1) ? 4 : 2), \
+                                   (((M) & 1) ? 5 : 3) ); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskload_epi32(int const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskload_epi64(long long const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskload_epi32(int const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskload_epi64(long long const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
+}
+
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
+
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
+
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
+
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
+
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)(__m256)(mask), (s)); })
+
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
+                                       (int const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8si)(__m256i)(mask), (s)); })
+
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
+                                       (int const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
+
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
+
+#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
+
+#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
+
+#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
+
+#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
+
+#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
+
+#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
+                                                             _mm256_setzero_ps(), \
+                                                             _CMP_EQ_OQ), \
+                                       (s)); })
+
+#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
+
+#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                            _mm_setzero_ps()), \
+                                       (s)); })
+
+#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v4si)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
+                                       (int const *)(m), (__v8si)(__m256i)(i), \
+                                       (__v8si)_mm256_set1_epi32(-1), (s)); })
+
+#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v2di)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
+                                       (int const *)(m), (__v4di)(__m256i)(i), \
+                                       (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX2INTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512bwintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512bwintrin.h
new file mode 100644
index 000000000..629dc8611
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512bwintrin.h
@@ -0,0 +1,2360 @@
+/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BWINTRIN_H
+#define __AVX512BWINTRIN_H
+
+typedef unsigned int __mmask32;
+typedef unsigned long long __mmask64;
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
+
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_setzero_qi(void) {
+  return (__m512i)(__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_setzero_hi(void) {
+  return (__m512i)(__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/* Integer compare */
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v64qu) __A + (__v64qu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_add_epi8(__A, __B),
+                                             (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_add_epi8(__A, __B),
+                                             (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v64qu) __A - (__v64qu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_sub_epi8(__A, __B),
+                                             (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_sub_epi8(__A, __B),
+                                             (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hu) __A + (__v32hu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_add_epi16(__A, __B),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_add_epi16(__A, __B),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hu) __A - (__v32hu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_sub_epi16(__A, __B),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_sub_epi16(__A, __B),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hu) __A * (__v32hu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_mullo_epi16(__A, __B),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_mullo_epi16(__A, __B),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+              (__v64qi) __W,
+              (__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+              (__v32hi) __W,
+              (__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi8 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi16 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shuffle_epi8(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                         (__v64qi)_mm512_shuffle_epi8(__A, __B),
+                                         (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                         (__v64qi)_mm512_shuffle_epi8(__A, __B),
+                                         (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I,
+         __mmask32 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+              (__v32hi) __I /* idx */ ,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I
+              /* idx */ ,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhrs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) __W,
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X,
+         __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A,
+      __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)_mm256_setzero_si256(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)__O,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi) _mm256_setzero_si256(),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) __O,
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) __O,
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
+                                          8,  64+8,   9, 64+9,
+                                          10, 64+10, 11, 64+11,
+                                          12, 64+12, 13, 64+13,
+                                          14, 64+14, 15, 64+15,
+                                          24, 64+24, 25, 64+25,
+                                          26, 64+26, 27, 64+27,
+                                          28, 64+28, 29, 64+29,
+                                          30, 64+30, 31, 64+31,
+                                          40, 64+40, 41, 64+41,
+                                          42, 64+42, 43, 64+43,
+                                          44, 64+44, 45, 64+45,
+                                          46, 64+46, 47, 64+47,
+                                          56, 64+56, 57, 64+57,
+                                          58, 64+58, 59, 64+59,
+                                          60, 64+60, 61, 64+61,
+                                          62, 64+62, 63, 64+63);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpackhi_epi8(__A, __B),
+                                        (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpackhi_epi8(__A, __B),
+                                        (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
+                                          4,  32+4,   5, 32+5,
+                                          6,  32+6,   7, 32+7,
+                                          12, 32+12, 13, 32+13,
+                                          14, 32+14, 15, 32+15,
+                                          20, 32+20, 21, 32+21,
+                                          22, 32+22, 23, 32+23,
+                                          28, 32+28, 29, 32+29,
+                                          30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpackhi_epi16(__A, __B),
+                                       (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpackhi_epi16(__A, __B),
+                                       (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
+                                          0,  64+0,   1, 64+1,
+                                          2,  64+2,   3, 64+3,
+                                          4,  64+4,   5, 64+5,
+                                          6,  64+6,   7, 64+7,
+                                          16, 64+16, 17, 64+17,
+                                          18, 64+18, 19, 64+19,
+                                          20, 64+20, 21, 64+21,
+                                          22, 64+22, 23, 64+23,
+                                          32, 64+32, 33, 64+33,
+                                          34, 64+34, 35, 64+35,
+                                          36, 64+36, 37, 64+37,
+                                          38, 64+38, 39, 64+39,
+                                          48, 64+48, 49, 64+49,
+                                          50, 64+50, 51, 64+51,
+                                          52, 64+52, 53, 64+53,
+                                          54, 64+54, 55, 64+55);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpacklo_epi8(__A, __B),
+                                        (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpacklo_epi8(__A, __B),
+                                        (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
+                                          0,  32+0,   1, 32+1,
+                                          2,  32+2,   3, 32+3,
+                                          8,  32+8,   9, 32+9,
+                                          10, 32+10, 11, 32+11,
+                                          16, 32+16, 17, 32+17,
+                                          18, 32+18, 19, 32+19,
+                                          24, 32+24, 25, 32+25,
+                                          26, 32+26, 27, 32+27);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpacklo_epi16(__A, __B),
+                                       (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpacklo_epi16(__A, __B),
+                                       (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi8_epi16(__m256i __A)
+{
+  /* This function always performs a signed extension, but __v32qi is a char
+     which may be signed or unsigned, so use __v32qs. */
+  return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepi8_epi16(__A),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepi8_epi16(__A),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu8_epi16(__m256i __A)
+{
+  return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepu8_epi16(__A),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepu8_epi16(__A),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+
+#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), (int)(p), \
+                                         (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), (int)(p), \
+                                         (__mmask64)(m)); })
+
+#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), (int)(p), \
+                                          (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), (int)(p), \
+                                          (__mmask64)(m)); })
+
+#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), (int)(p), \
+                                         (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), (int)(p), \
+                                         (__mmask32)(m)); })
+
+#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), (int)(p), \
+                                          (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), (int)(p), \
+                                          (__mmask32)(m)); })
+
+#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
+                                   (__v32hi)_mm512_undefined_epi32(), \
+                                   0, 1, 2, 3, \
+                                   4  + (((imm) >> 0) & 0x3), \
+                                   4  + (((imm) >> 2) & 0x3), \
+                                   4  + (((imm) >> 4) & 0x3), \
+                                   4  + (((imm) >> 6) & 0x3), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) >> 0) & 0x3), \
+                                   12 + (((imm) >> 2) & 0x3), \
+                                   12 + (((imm) >> 4) & 0x3), \
+                                   12 + (((imm) >> 6) & 0x3), \
+                                   16, 17, 18, 19, \
+                                   20 + (((imm) >> 0) & 0x3), \
+                                   20 + (((imm) >> 2) & 0x3), \
+                                   20 + (((imm) >> 4) & 0x3), \
+                                   20 + (((imm) >> 6) & 0x3), \
+                                   24, 25, 26, 27, \
+                                   28 + (((imm) >> 0) & 0x3), \
+                                   28 + (((imm) >> 2) & 0x3), \
+                                   28 + (((imm) >> 4) & 0x3), \
+                                   28 + (((imm) >> 6) & 0x3)); })
+
+#define _mm512_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflehi_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)(__m512i)(W)); })
+
+#define _mm512_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflehi_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)_mm512_setzero_hi()); })
+
+#define _mm512_shufflelo_epi16(A, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
+                                   (__v32hi)_mm512_undefined_epi32(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) >> 0) & 0x3), \
+                                   8 + (((imm) >> 2) & 0x3), \
+                                   8 + (((imm) >> 4) & 0x3), \
+                                   8 + (((imm) >> 6) & 0x3), \
+                                   12, 13, 14, 15, \
+                                   16 + (((imm) >> 0) & 0x3), \
+                                   16 + (((imm) >> 2) & 0x3), \
+                                   16 + (((imm) >> 4) & 0x3), \
+                                   16 + (((imm) >> 6) & 0x3), \
+                                   20, 21, 22, 23, \
+                                   24 + (((imm) >> 0) & 0x3), \
+                                   24 + (((imm) >> 2) & 0x3), \
+                                   24 + (((imm) >> 4) & 0x3), \
+                                   24 + (((imm) >> 6) & 0x3), \
+                                   28, 29, 30, 31); })
+
+
+#define _mm512_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)(__m512i)(W)); })
+
+
+#define _mm512_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)_mm512_setzero_hi()); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi16(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_sllv_epi16(__A, __B),
+                                           (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_sllv_epi16(__A, __B),
+                                           (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi16(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sll_epi16(__A, __B),
+                                          (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sll_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi16(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_slli_epi16(__A, __B),
+                                         (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_slli_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_hi());
+}
+
+#define _mm512_bslli_epi128(a, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector(                                          \
+       (__v64qi)_mm512_setzero_si512(),                                      \
+       (__v64qi)(__m512i)(a),                                                \
+       ((char)(imm)&0xF0) ?  0 : ((char)(imm)>0x0 ? 16 :  64) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  1 : ((char)(imm)>0x1 ? 17 :  65) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  2 : ((char)(imm)>0x2 ? 18 :  66) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  3 : ((char)(imm)>0x3 ? 19 :  67) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  4 : ((char)(imm)>0x4 ? 20 :  68) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  5 : ((char)(imm)>0x5 ? 21 :  69) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  6 : ((char)(imm)>0x6 ? 22 :  70) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  7 : ((char)(imm)>0x7 ? 23 :  71) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  8 : ((char)(imm)>0x8 ? 24 :  72) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  9 : ((char)(imm)>0x9 ? 25 :  73) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 :  74) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 :  75) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 :  76) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 :  77) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 :  78) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 :  79) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 :  80) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 :  81) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 :  82) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 :  83) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 :  84) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 :  85) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 :  86) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 :  87) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 :  88) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 :  89) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 :  90) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 :  91) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 :  92) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 :  93) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 :  94) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 :  95) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 32 : ((char)(imm)>0x0 ? 48 :  96) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 33 : ((char)(imm)>0x1 ? 49 :  97) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 34 : ((char)(imm)>0x2 ? 50 :  98) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 35 : ((char)(imm)>0x3 ? 51 :  99) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 36 : ((char)(imm)>0x4 ? 52 : 100) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 37 : ((char)(imm)>0x5 ? 53 : 101) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 38 : ((char)(imm)>0x6 ? 54 : 102) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 39 : ((char)(imm)>0x7 ? 55 : 103) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 40 : ((char)(imm)>0x8 ? 56 : 104) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 41 : ((char)(imm)>0x9 ? 57 : 105) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 42 : ((char)(imm)>0xA ? 58 : 106) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 43 : ((char)(imm)>0xB ? 59 : 107) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 44 : ((char)(imm)>0xC ? 60 : 108) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 45 : ((char)(imm)>0xD ? 61 : 109) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 46 : ((char)(imm)>0xE ? 62 : 110) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 47 : ((char)(imm)>0xF ? 63 : 111) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 48 : ((char)(imm)>0x0 ? 64 : 112) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 49 : ((char)(imm)>0x1 ? 65 : 113) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 50 : ((char)(imm)>0x2 ? 66 : 114) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 51 : ((char)(imm)>0x3 ? 67 : 115) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 52 : ((char)(imm)>0x4 ? 68 : 116) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 53 : ((char)(imm)>0x5 ? 69 : 117) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 54 : ((char)(imm)>0x6 ? 70 : 118) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 55 : ((char)(imm)>0x7 ? 71 : 119) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 56 : ((char)(imm)>0x8 ? 72 : 120) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 57 : ((char)(imm)>0x9 ? 73 : 121) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 58 : ((char)(imm)>0xA ? 74 : 122) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 59 : ((char)(imm)>0xB ? 75 : 123) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 60 : ((char)(imm)>0xC ? 76 : 124) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 61 : ((char)(imm)>0xD ? 77 : 125) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 62 : ((char)(imm)>0xE ? 78 : 126) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 79 : 127) - (char)(imm)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi16(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srlv_epi16(__A, __B),
+                                           (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srlv_epi16(__A, __B),
+                                           (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi16(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srav_epi16(__A, __B),
+                                           (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srav_epi16(__A, __B),
+                                           (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi16(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sra_epi16(__A, __B),
+                                          (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sra_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi16(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srai_epi16(__A, __B),
+                                         (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srai_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi16(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_srl_epi16(__A, __B),
+                                          (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_srl_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi16(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srli_epi16(__A, __B),
+                                         (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srli_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_hi());
+}
+
+#define _mm512_bsrli_epi128(a, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector(                     \
+      (__v64qi)(__m512i)(a),                      \
+      (__v64qi)_mm512_setzero_si512(),            \
+      ((char)(imm)&0xF0) ?  64 : (char)(imm) + ((char)(imm)>0xF ?  48 : 0),  \
+      ((char)(imm)&0xF0) ?  65 : (char)(imm) + ((char)(imm)>0xE ?  49 : 1),  \
+      ((char)(imm)&0xF0) ?  66 : (char)(imm) + ((char)(imm)>0xD ?  50 : 2),  \
+      ((char)(imm)&0xF0) ?  67 : (char)(imm) + ((char)(imm)>0xC ?  51 : 3),  \
+      ((char)(imm)&0xF0) ?  68 : (char)(imm) + ((char)(imm)>0xB ?  52 : 4),  \
+      ((char)(imm)&0xF0) ?  69 : (char)(imm) + ((char)(imm)>0xA ?  53 : 5),  \
+      ((char)(imm)&0xF0) ?  70 : (char)(imm) + ((char)(imm)>0x9 ?  54 : 6),  \
+      ((char)(imm)&0xF0) ?  71 : (char)(imm) + ((char)(imm)>0x8 ?  55 : 7),  \
+      ((char)(imm)&0xF0) ?  72 : (char)(imm) + ((char)(imm)>0x7 ?  56 : 8),  \
+      ((char)(imm)&0xF0) ?  73 : (char)(imm) + ((char)(imm)>0x6 ?  57 : 9),  \
+      ((char)(imm)&0xF0) ?  74 : (char)(imm) + ((char)(imm)>0x5 ?  58 : 10), \
+      ((char)(imm)&0xF0) ?  75 : (char)(imm) + ((char)(imm)>0x4 ?  59 : 11), \
+      ((char)(imm)&0xF0) ?  76 : (char)(imm) + ((char)(imm)>0x3 ?  60 : 12), \
+      ((char)(imm)&0xF0) ?  77 : (char)(imm) + ((char)(imm)>0x2 ?  61 : 13), \
+      ((char)(imm)&0xF0) ?  78 : (char)(imm) + ((char)(imm)>0x1 ?  62 : 14), \
+      ((char)(imm)&0xF0) ?  79 : (char)(imm) + ((char)(imm)>0x0 ?  63 : 15), \
+      ((char)(imm)&0xF0) ?  80 : (char)(imm) + ((char)(imm)>0xF ?  64 : 16), \
+      ((char)(imm)&0xF0) ?  81 : (char)(imm) + ((char)(imm)>0xE ?  65 : 17), \
+      ((char)(imm)&0xF0) ?  82 : (char)(imm) + ((char)(imm)>0xD ?  66 : 18), \
+      ((char)(imm)&0xF0) ?  83 : (char)(imm) + ((char)(imm)>0xC ?  67 : 19), \
+      ((char)(imm)&0xF0) ?  84 : (char)(imm) + ((char)(imm)>0xB ?  68 : 20), \
+      ((char)(imm)&0xF0) ?  85 : (char)(imm) + ((char)(imm)>0xA ?  69 : 21), \
+      ((char)(imm)&0xF0) ?  86 : (char)(imm) + ((char)(imm)>0x9 ?  70 : 22), \
+      ((char)(imm)&0xF0) ?  87 : (char)(imm) + ((char)(imm)>0x8 ?  71 : 23), \
+      ((char)(imm)&0xF0) ?  88 : (char)(imm) + ((char)(imm)>0x7 ?  72 : 24), \
+      ((char)(imm)&0xF0) ?  89 : (char)(imm) + ((char)(imm)>0x6 ?  73 : 25), \
+      ((char)(imm)&0xF0) ?  90 : (char)(imm) + ((char)(imm)>0x5 ?  74 : 26), \
+      ((char)(imm)&0xF0) ?  91 : (char)(imm) + ((char)(imm)>0x4 ?  75 : 27), \
+      ((char)(imm)&0xF0) ?  92 : (char)(imm) + ((char)(imm)>0x3 ?  76 : 28), \
+      ((char)(imm)&0xF0) ?  93 : (char)(imm) + ((char)(imm)>0x2 ?  77 : 29), \
+      ((char)(imm)&0xF0) ?  94 : (char)(imm) + ((char)(imm)>0x1 ?  78 : 30), \
+      ((char)(imm)&0xF0) ?  95 : (char)(imm) + ((char)(imm)>0x0 ?  79 : 31), \
+      ((char)(imm)&0xF0) ?  96 : (char)(imm) + ((char)(imm)>0xF ?  80 : 32), \
+      ((char)(imm)&0xF0) ?  97 : (char)(imm) + ((char)(imm)>0xE ?  81 : 33), \
+      ((char)(imm)&0xF0) ?  98 : (char)(imm) + ((char)(imm)>0xD ?  82 : 34), \
+      ((char)(imm)&0xF0) ?  99 : (char)(imm) + ((char)(imm)>0xC ?  83 : 35), \
+      ((char)(imm)&0xF0) ? 100 : (char)(imm) + ((char)(imm)>0xB ?  84 : 36), \
+      ((char)(imm)&0xF0) ? 101 : (char)(imm) + ((char)(imm)>0xA ?  85 : 37), \
+      ((char)(imm)&0xF0) ? 102 : (char)(imm) + ((char)(imm)>0x9 ?  86 : 38), \
+      ((char)(imm)&0xF0) ? 103 : (char)(imm) + ((char)(imm)>0x8 ?  87 : 39), \
+      ((char)(imm)&0xF0) ? 104 : (char)(imm) + ((char)(imm)>0x7 ?  88 : 40), \
+      ((char)(imm)&0xF0) ? 105 : (char)(imm) + ((char)(imm)>0x6 ?  89 : 41), \
+      ((char)(imm)&0xF0) ? 106 : (char)(imm) + ((char)(imm)>0x5 ?  90 : 42), \
+      ((char)(imm)&0xF0) ? 107 : (char)(imm) + ((char)(imm)>0x4 ?  91 : 43), \
+      ((char)(imm)&0xF0) ? 108 : (char)(imm) + ((char)(imm)>0x3 ?  92 : 44), \
+      ((char)(imm)&0xF0) ? 109 : (char)(imm) + ((char)(imm)>0x2 ?  93 : 45), \
+      ((char)(imm)&0xF0) ? 110 : (char)(imm) + ((char)(imm)>0x1 ?  94 : 46), \
+      ((char)(imm)&0xF0) ? 111 : (char)(imm) + ((char)(imm)>0x0 ?  95 : 47), \
+      ((char)(imm)&0xF0) ? 112 : (char)(imm) + ((char)(imm)>0xF ?  96 : 48), \
+      ((char)(imm)&0xF0) ? 113 : (char)(imm) + ((char)(imm)>0xE ?  97 : 49), \
+      ((char)(imm)&0xF0) ? 114 : (char)(imm) + ((char)(imm)>0xD ?  98 : 50), \
+      ((char)(imm)&0xF0) ? 115 : (char)(imm) + ((char)(imm)>0xC ?  99 : 51), \
+      ((char)(imm)&0xF0) ? 116 : (char)(imm) + ((char)(imm)>0xB ? 100 : 52), \
+      ((char)(imm)&0xF0) ? 117 : (char)(imm) + ((char)(imm)>0xA ? 101 : 53), \
+      ((char)(imm)&0xF0) ? 118 : (char)(imm) + ((char)(imm)>0x9 ? 102 : 54), \
+      ((char)(imm)&0xF0) ? 119 : (char)(imm) + ((char)(imm)>0x8 ? 103 : 55), \
+      ((char)(imm)&0xF0) ? 120 : (char)(imm) + ((char)(imm)>0x7 ? 104 : 56), \
+      ((char)(imm)&0xF0) ? 121 : (char)(imm) + ((char)(imm)>0x6 ? 105 : 57), \
+      ((char)(imm)&0xF0) ? 122 : (char)(imm) + ((char)(imm)>0x5 ? 106 : 58), \
+      ((char)(imm)&0xF0) ? 123 : (char)(imm) + ((char)(imm)>0x4 ? 107 : 59), \
+      ((char)(imm)&0xF0) ? 124 : (char)(imm) + ((char)(imm)>0x3 ? 108 : 60), \
+      ((char)(imm)&0xF0) ? 125 : (char)(imm) + ((char)(imm)>0x2 ? 109 : 61), \
+      ((char)(imm)&0xF0) ? 126 : (char)(imm) + ((char)(imm)>0x1 ? 110 : 62), \
+      ((char)(imm)&0xF0) ? 127 : (char)(imm) + ((char)(imm)>0x0 ? 111 : 63)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+                (__v32hi) __A,
+                (__v32hi) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+                (__v32hi) __A,
+                (__v32hi) _mm512_setzero_hi ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+                (__v64qi) __A,
+                (__v64qi) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+                (__v64qi) __A,
+                (__v64qi) _mm512_setzero_hi ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+                 (__v64qi) __O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+                 (__v64qi)
+                 _mm512_setzero_qi(),
+                 __M);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
+                (__mmask64) __B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
+                (__mmask32) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P,
+                 (__v32hi)
+                 _mm512_setzero_hi (),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P,
+                 (__v64qi) __W,
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P,
+                 (__v64qi)
+                 _mm512_setzero_hi (),
+                 (__mmask64) __U);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
+{
+  __builtin_ia32_storedquhi512_mask ((__v32hi *) __P,
+             (__v32hi) __A,
+             (__mmask32) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
+{
+  __builtin_ia32_storedquqi512_mask ((__v64qi *) __P,
+             (__v64qi) __A,
+             (__mmask64) __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_test_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+            (__v64qi) __B,
+            (__mmask64) -1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+            (__v64qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_test_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+            (__v32hi) __B,
+            (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+            (__v32hi) __B, __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+             (__v64qi) __B,
+             (__mmask64) -1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+             (__v64qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+             (__v32hi) __B,
+             (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+             (__v32hi) __B, __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_movepi8_mask (__m512i __A)
+{
+  return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_movepi16_mask (__m512i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi8 (__mmask64 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi16 (__mmask32 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastb_epi8 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v16qi) __A,
+                                          (__v16qi)_mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectb_512(__M,
+                                             (__v64qi) _mm512_broadcastb_epi8(__A),
+                                             (__v64qi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectb_512(__M,
+                                             (__v64qi) _mm512_broadcastb_epi8(__A),
+                                             (__v64qi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                 (__v32hi) __O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                 (__v32hi) _mm512_setzero_hi(),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastw_epi16 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v8hi) __A,
+                                          (__v8hi)_mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512(__M,
+                                             (__v32hi) _mm512_broadcastw_epi16(__A),
+                                             (__v32hi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512(__M,
+                                             (__v32hi) _mm512_broadcastw_epi16(__A),
+                                             (__v32hi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) _mm512_undefined_epi32 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) __W,
+                 (__mmask32) __M);
+}
+
+#define _mm512_alignr_epi8(A, B, N) __extension__ ({\
+  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
+                                          (__v64qi)(__m512i)(B), (int)(N), \
+                                          (__v64qi)_mm512_undefined_pd(), \
+                                          (__mmask64)-1); })
+
+#define _mm512_mask_alignr_epi8(W, U, A, B, N) __extension__({\
+  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
+                                          (__v64qi)(__m512i)(B), (int)(N), \
+                                          (__v64qi)(__m512i)(W), \
+                                          (__mmask64)(U)); })
+
+#define _mm512_maskz_alignr_epi8(U, A, B, N) __extension__({\
+  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
+                                          (__v64qi)(__m512i)(B), (int)(N), \
+                                          (__v64qi)_mm512_setzero_si512(), \
+                                          (__mmask64)(U)); })
+
+#define _mm512_dbsad_epu8(A, B, imm) __extension__ ({\
+  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
+                                           (__v64qi)(__m512i)(B), (int)(imm), \
+                                           (__v32hi)_mm512_undefined_epi32(), \
+                                           (__mmask32)-1); })
+
+#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) ({\
+  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
+                                           (__v64qi)(__m512i)(B), (int)(imm), \
+                                           (__v32hi)(__m512i)(W), \
+                                           (__mmask32)(U)); })
+
+#define _mm512_maskz_dbsad_epu8(U, A, B, imm) ({\
+  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
+                                           (__v64qi)(__m512i)(B), (int)(imm), \
+                                           (__v32hi)_mm512_setzero_hi(), \
+                                           (__mmask32)(U)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sad_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
+               (__v64qi) __B);
+}
+
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512cdintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512cdintrin.h
new file mode 100644
index 000000000..23c423584
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512cdintrin.h
@@ -0,0 +1,144 @@
+/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512CDINTRIN_H
+#define __AVX512CDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+               (__v8di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmb512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmw512 (__A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512dqintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512dqintrin.h
new file mode 100644
index 000000000..ae44b98a9
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512dqintrin.h
@@ -0,0 +1,1332 @@
+/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512DQINTRIN_H
+#define __AVX512DQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v8du) __A * (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_mullo_epi64(__A, __B),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_mullo_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_xor_pd(__m512d __A, __m512d __B) {
+  return (__m512d)((__v8du)__A ^ (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_xor_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_xor_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_xor_ps (__m512 __A, __m512 __B) {
+  return (__m512)((__v16su)__A ^ (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_xor_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_xor_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_or_pd(__m512d __A, __m512d __B) {
+  return (__m512d)((__v8du)__A | (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_or_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_or_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_or_ps(__m512 __A, __m512 __B) {
+  return (__m512)((__v16su)__A | (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_or_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_or_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_and_pd(__m512d __A, __m512d __B) {
+  return (__m512d)((__v8du)__A & (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_and_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_and_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_and_ps(__m512 __A, __m512 __B) {
+  return (__m512)((__v16su)__A & (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_and_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_and_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_andnot_pd(__m512d __A, __m512d __B) {
+  return (__m512d)(~(__v8du)__A & (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_andnot_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_andnot_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_andnot_ps(__m512 __A, __m512 __B) {
+  return (__m512)(~(__v16su)__A & (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_andnot_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_andnot_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epi64(A, R) __extension__ ({              \
+  (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8di)(__m512i)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epu64(A, R) __extension__ ({               \
+  (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) __extension__ ({     \
+  (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epi64(A, R) __extension__ ({             \
+  (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+                                           (__v8di)(__m512i)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epi64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epu64(A, R) __extension__ ({              \
+  (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epu64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_pd(A, R) __extension__ ({          \
+  (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) __W,
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_ps(A, R) __extension__ ({        \
+  (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+                                          (__v8sf)(__m256)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi64(A, R) __extension__ ({             \
+  (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epu64(A, R) __extension__ ({              \
+  (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8di)(__m512i)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epi64(A, R) __extension__ ({            \
+  (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) __extension__ ({  \
+  (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epu64(A, R) __extension__ ({            \
+  (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+                                             (__v8di)(__m512i)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) __extension__ ({  \
+  (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_pd(A, R) __extension__ ({          \
+  (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+                                            (__v8df)(__m512d)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_ps(A, R) __extension__ ({         \
+  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
+                                           (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_range_pd(A, B, C) __extension__ ({                     \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_range_pd(W, U, A, B, C) __extension__ ({      \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_range_pd(U, A, B, C) __extension__ ({           \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_range_round_pd(A, B, C, R) __extension__ ({           \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_range_round_pd(W, U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm512_maskz_range_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+#define _mm512_range_ps(A, B, C) __extension__ ({                       \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)-1, \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_range_ps(W, U, A, B, C) __extension__ ({         \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_range_ps(U, A, B, C) __extension__ ({      \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_range_round_ps(A, B, C, R) __extension__ ({         \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_range_round_ps(W, U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_range_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(U), (int)(R)); })
+
+#define _mm_range_round_ss(A, B, C, R) __extension__ ({           \
+  (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8) -1, (int)(C),\
+                                               (int)(R)); })
+
+#define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_range_round_ss(W, U, A, B, C, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)(__m128)(W),\
+                                               (__mmask8)(U), (int)(C),\
+                                               (int)(R)); })
+
+#define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_range_round_ss(U, A, B, C, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)(U), (int)(C),\
+                                               (int)(R)); })
+
+#define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm_range_round_sd(A, B, C, R) __extension__ ({           \
+  (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8) -1, (int)(C),\
+                                                (int)(R)); })
+
+#define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_range_round_sd(W, U, A, B, C, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)(__m128d)(W),\
+                                                (__mmask8)(U), (int)(C),\
+                                                (int)(R)); })
+
+#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_range_round_sd(U, A, B, C, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U), (int)(C),\
+                                                (int)(R)); })
+
+#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_reduce_pd(A, B) __extension__ ({             \
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)-1, \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_reduce_pd(U, A, B) __extension__ ({  \
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_reduce_ps(A, B) __extension__ ({              \
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_reduce_ps(W, U, A, B) __extension__ ({   \
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_reduce_ps(U, A, B) __extension__ ({       \
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_reduce_round_pd(A, B, R) __extension__ ({\
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_reduce_round_pd(W, U, A, B, R) __extension__ ({\
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_reduce_round_pd(U, A, B, R) __extension__ ({\
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_reduce_round_ps(A, B, R) __extension__ ({\
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_reduce_round_ps(W, U, A, B, R) __extension__ ({\
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_reduce_round_ps(U, A, B, R) __extension__ ({\
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm_reduce_ss(A, B, C) __extension__ ({              \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_reduce_ss(W, U, A, B, C) __extension__ ({   \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_reduce_ss(U, A, B, C) __extension__ ({       \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)(U), (int)(C), \
+                                       _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_reduce_round_ss(A, B, C, R) __extension__ ({              \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
+                                       (int)(C), (int)(R)); })
+
+#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) __extension__ ({   \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                       (int)(C), (int)(R)); })
+
+#define _mm_maskz_reduce_round_ss(U, A, B, C, R) __extension__ ({       \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)(U), (int)(C), (int)(R)); })
+
+#define _mm_reduce_sd(A, B, C) __extension__ ({              \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)-1, (int)(C), \
+                                        _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_reduce_sd(W, U, A, B, C) __extension__ ({   \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                        (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_reduce_sd(U, A, B, C) __extension__ ({       \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)(U), (int)(C), \
+                                        _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_reduce_round_sd(A, B, C, R) __extension__ ({              \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)-1, (int)(C), (int)(R)); })
+
+#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) __extension__ ({   \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                        (int)(C), (int)(R)); })
+
+#define _mm_maskz_reduce_round_sd(U, A, B, C, R) __extension__ ({       \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)(U), (int)(C), (int)(R)); })
+                     
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_movepi32_mask (__m512i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_movepi64_mask (__m512i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
+}
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x2 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)_mm512_undefined_ps(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)
+                __O, __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)_mm512_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x8 (__m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                _mm512_undefined_ps(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)__O,
+                __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)_mm512_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcast_f64x2 (__m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)_mm512_undefined_pd(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)
+                 __O, __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)_mm512_setzero_ps (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x2 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)_mm512_setzero_si512(),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)
+                 __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)_mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x8 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)_mm512_setzero_si512(),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)__O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i64x2 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)_mm512_setzero_si512(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)
+                 __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)_mm512_setzero_si512 (),
+                 __M);
+}
+
+#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v16sf)(__m512)(A),           \
+                                  (__v16sf)_mm512_undefined_ps(), \
+                                  ((imm) & 1) ?  8 : 0,           \
+                                  ((imm) & 1) ?  9 : 1,           \
+                                  ((imm) & 1) ? 10 : 2,           \
+                                  ((imm) & 1) ? 11 : 3,           \
+                                  ((imm) & 1) ? 12 : 4,           \
+                                  ((imm) & 1) ? 13 : 5,           \
+                                  ((imm) & 1) ? 14 : 6,           \
+                                  ((imm) & 1) ? 15 : 7); })
+
+#define _mm512_mask_extractf32x8_ps(W, U, A, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                   (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
+                                   (__v8sf)(W)); })
+
+#define _mm512_maskz_extractf32x8_ps(U, A, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                   (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
+                                   (__v8sf)_mm256_setzero_ps()); })
+
+#define _mm512_extractf64x2_pd(A, imm) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v8df)(__m512d)(A),          \
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   0 + ((imm) & 0x3) * 2,         \
+                                   1 + ((imm) & 0x3) * 2); })
+
+#define _mm512_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                   (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
+                                   (__v2df)(W)); })
+
+#define _mm512_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                   (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
+                                   (__v2df)_mm_setzero_pd()); })
+
+#define _mm512_extracti32x8_epi32(A, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16si)(__m512i)(A),             \
+                                   (__v16si)_mm512_undefined_epi32(), \
+                                   ((imm) & 1) ?  8 : 0,              \
+                                   ((imm) & 1) ?  9 : 1,              \
+                                   ((imm) & 1) ? 10 : 2,              \
+                                   ((imm) & 1) ? 11 : 3,              \
+                                   ((imm) & 1) ? 12 : 4,              \
+                                   ((imm) & 1) ? 13 : 5,              \
+                                   ((imm) & 1) ? 14 : 6,              \
+                                   ((imm) & 1) ? 15 : 7); })
+
+#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
+                                (__v8si)(W)); })
+
+#define _mm512_maskz_extracti32x8_epi32(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
+                                (__v8si)_mm256_setzero_si256()); })
+
+#define _mm512_extracti64x2_epi64(A, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8di)(__m512i)(A),          \
+                                   (__v8di)_mm512_undefined_epi32(), \
+                                   0 + ((imm) & 0x3) * 2,           \
+                                   1 + ((imm) & 0x3) * 2); })
+
+#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
+                                (__v2di)(W)); })
+
+#define _mm512_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
+                                (__v2di)_mm_setzero_di()); })
+
+#define _mm512_insertf32x8(A, B, imm) __extension__ ({ \
+  (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+                                  (__v16sf)_mm512_castps256_ps512((__m256)(B)),\
+                                  ((imm) & 0x1) ?  0 : 16, \
+                                  ((imm) & 0x1) ?  1 : 17, \
+                                  ((imm) & 0x1) ?  2 : 18, \
+                                  ((imm) & 0x1) ?  3 : 19, \
+                                  ((imm) & 0x1) ?  4 : 20, \
+                                  ((imm) & 0x1) ?  5 : 21, \
+                                  ((imm) & 0x1) ?  6 : 22, \
+                                  ((imm) & 0x1) ?  7 : 23, \
+                                  ((imm) & 0x1) ? 16 :  8, \
+                                  ((imm) & 0x1) ? 17 :  9, \
+                                  ((imm) & 0x1) ? 18 : 10, \
+                                  ((imm) & 0x1) ? 19 : 11, \
+                                  ((imm) & 0x1) ? 20 : 12, \
+                                  ((imm) & 0x1) ? 21 : 13, \
+                                  ((imm) & 0x1) ? 22 : 14, \
+                                  ((imm) & 0x1) ? 23 : 15); })
+
+#define _mm512_mask_insertf32x8(W, U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+                                 (__v16sf)(W)); })
+
+#define _mm512_maskz_insertf32x8(U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+                                 (__v16sf)_mm512_setzero_ps()); })
+
+#define _mm512_insertf64x2(A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+                                  (__v8df)_mm512_castpd128_pd512((__m128d)(B)),\
+                                  (((imm) & 0x3) == 0) ? 8 : 0, \
+                                  (((imm) & 0x3) == 0) ? 9 : 1, \
+                                  (((imm) & 0x3) == 1) ? 8 : 2, \
+                                  (((imm) & 0x3) == 1) ? 9 : 3, \
+                                  (((imm) & 0x3) == 2) ? 8 : 4, \
+                                  (((imm) & 0x3) == 2) ? 9 : 5, \
+                                  (((imm) & 0x3) == 3) ? 8 : 6, \
+                                  (((imm) & 0x3) == 3) ? 9 : 7); })
+
+#define _mm512_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+                                  (__v8df)(W)); })
+
+#define _mm512_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+                                  (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_inserti32x8(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+                                 (__v16si)_mm512_castsi256_si512((__m256i)(B)),\
+                                 ((imm) & 0x1) ?  0 : 16, \
+                                 ((imm) & 0x1) ?  1 : 17, \
+                                 ((imm) & 0x1) ?  2 : 18, \
+                                 ((imm) & 0x1) ?  3 : 19, \
+                                 ((imm) & 0x1) ?  4 : 20, \
+                                 ((imm) & 0x1) ?  5 : 21, \
+                                 ((imm) & 0x1) ?  6 : 22, \
+                                 ((imm) & 0x1) ?  7 : 23, \
+                                 ((imm) & 0x1) ? 16 :  8, \
+                                 ((imm) & 0x1) ? 17 :  9, \
+                                 ((imm) & 0x1) ? 18 : 10, \
+                                 ((imm) & 0x1) ? 19 : 11, \
+                                 ((imm) & 0x1) ? 20 : 12, \
+                                 ((imm) & 0x1) ? 21 : 13, \
+                                 ((imm) & 0x1) ? 22 : 14, \
+                                 ((imm) & 0x1) ? 23 : 15); })
+
+#define _mm512_mask_inserti32x8(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+                                 (__v16si)(W)); })
+
+#define _mm512_maskz_inserti32x8(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+                                 (__v16si)_mm512_setzero_si512()); })
+
+#define _mm512_inserti64x2(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+                                  (__v8di)_mm512_castsi128_si512((__m128i)(B)),\
+                                  (((imm) & 0x3) == 0) ? 8 : 0, \
+                                  (((imm) & 0x3) == 0) ? 9 : 1, \
+                                  (((imm) & 0x3) == 1) ? 8 : 2, \
+                                  (((imm) & 0x3) == 1) ? 9 : 3, \
+                                  (((imm) & 0x3) == 2) ? 8 : 4, \
+                                  (((imm) & 0x3) == 2) ? 9 : 5, \
+                                  (((imm) & 0x3) == 3) ? 8 : 6, \
+                                  (((imm) & 0x3) == 3) ? 9 : 7); })
+
+#define _mm512_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+                                  (__v8di)(W)); })
+
+#define _mm512_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+                                  (__v8di)_mm512_setzero_si512()); })
+
+#define _mm512_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+  (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
+                                              (int)(imm), (__mmask16)(U)); })
+
+#define _mm512_fpclass_ps_mask(A, imm) __extension__ ({ \
+  (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
+                                              (int)(imm), (__mmask16)-1); })
+
+#define _mm512_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm512_fpclass_pd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm_fpclass_sd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_fpclass_sd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                          (__mmask8)(U)); })
+
+#define _mm_fpclass_ss_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_fpclass_ss_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                          (__mmask8)(U)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512erintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512erintrin.h
new file mode 100644
index 000000000..8ff212c42
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512erintrin.h
@@ -0,0 +1,285 @@
+/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512ERINTRIN_H
+#define __AVX512ERINTRIN_H
+
+// exp2a23
+#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                      (int)(R)); })
+
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm512_exp2a23_pd(A) \
+  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(S, M, A) \
+  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(M, A) \
+  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                     (int)(R)); })
+
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)(M), (int)(R)); })
+
+#define _mm512_exp2a23_ps(A) \
+  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(S, M, A) \
+  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(M, A) \
+  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+// rsqrt28
+#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(M), (int)(R)); })
+
+#define _mm512_rsqrt28_pd(A) \
+  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(S, M, A) \
+  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(M, A) \
+  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(M), (int)(R)); })
+
+#define _mm512_rsqrt28_ps(A) \
+  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(S, M, A) \
+  _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(M, A) \
+  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)(__m128)(S), \
+                                              (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)(M), (int)(R)); })
+
+#define _mm_rsqrt28_ss(A, B) \
+  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(S, M, A, B) \
+  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(M, A, B) \
+  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)(__m128d)(S), \
+                                               (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)(M), (int)(R)); })
+
+#define _mm_rsqrt28_sd(A, B) \
+  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(S, M, A, B) \
+  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(M, A, B) \
+  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+// rcp28
+#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)(M), (int)(R)); })
+
+#define _mm512_rcp28_pd(A) \
+  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(S, M, A) \
+  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(M, A) \
+  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                      (int)(R)); })
+
+#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)(M), (int)(R)); })
+
+#define _mm512_rcp28_ps(A) \
+  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(S, M, A) \
+  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(M, A) \
+  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)(__m128)(S), \
+                                            (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)(M), (int)(R)); })
+
+#define _mm_rcp28_ss(A, B) \
+  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(S, M, A, B) \
+  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(M, A, B) \
+  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)(__m128d)(S), \
+                                             (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)(M), (int)(R)); })
+
+#define _mm_rcp28_sd(A, B) \
+  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_sd(S, M, A, B) \
+  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(M, A, B) \
+  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#endif // __AVX512ERINTRIN_H
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512fintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512fintrin.h
new file mode 100644
index 000000000..e6a7217c8
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512fintrin.h
@@ -0,0 +1,10244 @@
+/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FINTRIN_H
+#define __AVX512FINTRIN_H
+
+typedef char __v64qi __attribute__((__vector_size__(64)));
+typedef short __v32hi __attribute__((__vector_size__(64)));
+typedef double __v8df __attribute__((__vector_size__(64)));
+typedef float __v16sf __attribute__((__vector_size__(64)));
+typedef long long __v8di __attribute__((__vector_size__(64)));
+typedef int __v16si __attribute__((__vector_size__(64)));
+
+/* Unsigned types */
+typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
+typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
+typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
+typedef unsigned int __v16su __attribute__((__vector_size__(64)));
+
+typedef float __m512 __attribute__((__vector_size__(64)));
+typedef double __m512d __attribute__((__vector_size__(64)));
+typedef long long __m512i __attribute__((__vector_size__(64)));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+/* Rounding mode macros.  */
+#define _MM_FROUND_TO_NEAREST_INT   0x00
+#define _MM_FROUND_TO_NEG_INF       0x01
+#define _MM_FROUND_TO_POS_INF       0x02
+#define _MM_FROUND_TO_ZERO          0x03
+#define _MM_FROUND_CUR_DIRECTION    0x04
+
+/* Constants for integer comparison predicates */
+typedef enum {
+    _MM_CMPINT_EQ,      /* Equal */
+    _MM_CMPINT_LT,      /* Less than */
+    _MM_CMPINT_LE,      /* Less than or Equal */
+    _MM_CMPINT_UNUSED,
+    _MM_CMPINT_NE,      /* Not Equal */
+    _MM_CMPINT_NLT,     /* Not Less than */
+#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
+    _MM_CMPINT_NLE      /* Not Less than or Equal */
+#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
+} _MM_CMPINT_ENUM;
+
+typedef enum
+{
+  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
+  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
+  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
+  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
+  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
+  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
+  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
+  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
+  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
+  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
+  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
+  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
+  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
+  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
+  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
+  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
+  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
+  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
+  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
+  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
+  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
+  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
+  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
+  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
+  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
+  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
+  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
+  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
+  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
+  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
+  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
+  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
+  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
+  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
+  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
+  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
+  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
+  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
+  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
+  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
+  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
+  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
+  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
+  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
+  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
+  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
+  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
+  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
+  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
+  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
+  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
+  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
+  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
+  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
+  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
+  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
+  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
+  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
+  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
+  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
+  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
+  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
+  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
+  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
+  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
+  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
+  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
+  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
+  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
+  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
+  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
+  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
+  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
+  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
+  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
+  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
+  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
+  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
+  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
+  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
+  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
+  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
+  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
+  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
+  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
+  _MM_PERM_DDDD = 0xFF
+} _MM_PERM_ENUM;
+
+typedef enum
+{
+  _MM_MANT_NORM_1_2,    /* interval [1, 2)      */
+  _MM_MANT_NORM_p5_2,   /* interval [0.5, 2)    */
+  _MM_MANT_NORM_p5_1,   /* interval [0.5, 1)    */
+  _MM_MANT_NORM_p75_1p5   /* interval [0.75, 1.5) */
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum
+{
+  _MM_MANT_SIGN_src,    /* sign = sign(SRC)     */
+  _MM_MANT_SIGN_zero,   /* sign = 0             */
+  _MM_MANT_SIGN_nan   /* DEST = NaN if sign(SRC) = 1 */
+} _MM_MANTISSA_SIGN_ENUM;
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+
+/* Create vectors with repeated elements */
+
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_setzero_si512(void)
+{
+  return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+#define _mm512_setzero_epi32 _mm512_setzero_si512
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_undefined_pd(void)
+{
+  return (__m512d)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined(void)
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined_ps(void)
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_undefined_epi32(void)
+{
+  return (__m512i)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastd_epi32 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v4si) __A,
+                                          (__v4si)_mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__M,
+                                             (__v16si) _mm512_broadcastd_epi32(__A),
+                                             (__v16si) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__M,
+                                             (__v16si) _mm512_broadcastd_epi32(__A),
+                                             (__v16si) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastq_epi64 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v2di) __A,
+                                          (__v2di) _mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                             (__v8di) _mm512_broadcastq_epi64(__A),
+                                             (__v8di) __O);
+
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                             (__v8di) _mm512_broadcastq_epi64(__A),
+                                             (__v8di) _mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
+{
+#ifdef __x86_64__
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#else
+  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#endif
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_setzero_ps(void)
+{
+  return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+#define _mm512_setzero _mm512_setzero_ps
+
+static  __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_setzero_pd(void)
+{
+  return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_set1_ps(float __w)
+{
+  return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
+                   __w, __w, __w, __w, __w, __w, __w, __w  };
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_set1_pd(double __w)
+{
+  return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi8(char __w)
+{
+  return (__m512i)(__v64qi){ __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w  };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi16(short __w)
+{
+  return (__m512i)(__v32hi){ __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi32(int __s)
+{
+  return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
+                             __s, __s, __s, __s, __s, __s, __s, __s };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi64(long long __d)
+{
+  return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcastss_ps(__m128 __A)
+{
+  return (__m512)__builtin_shufflevector((__v4sf) __A,
+                                         (__v4sf)_mm_undefined_ps(),
+                                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
+{
+  return  (__m512i)(__v16si)
+   { __D, __C, __B, __A, __D, __C, __B, __A,
+     __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set4_epi64 (long long __A, long long __B, long long __C,
+       long long __D)
+{
+  return  (__m512i) (__v8di)
+   { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_set4_pd (double __A, double __B, double __C, double __D)
+{
+  return  (__m512d)
+   { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_set4_ps (float __A, float __B, float __C, float __D)
+{
+  return  (__m512)
+   { __D, __C, __B, __A, __D, __C, __B, __A,
+     __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+#define _mm512_setr4_epi32(e0,e1,e2,e3)               \
+  _mm512_set4_epi32((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_epi64(e0,e1,e2,e3)               \
+  _mm512_set4_epi64((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_pd(e0,e1,e2,e3)                \
+  _mm512_set4_pd((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_ps(e0,e1,e2,e3)                \
+  _mm512_set4_ps((e3),(e2),(e1),(e0))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcastsd_pd(__m128d __A)
+{
+  return (__m512d)__builtin_shufflevector((__v2df) __A,
+                                          (__v2df) _mm_undefined_pd(),
+                                          0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/* Cast between vector types */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castpd256_pd512(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castps256_ps512(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
+                                          -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm512_castpd512_pd128(__m512d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm512_castpd512_pd256 (__m512d __A)
+{
+  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm512_castps512_ps128(__m512 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm512_castps512_ps256 (__m512 __A)
+{
+  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castpd_ps (__m512d __A)
+{
+  return (__m512) (__A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_castpd_si512 (__m512d __A)
+{
+  return (__m512i) (__A);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_castpd128_pd512 (__m128d __A)
+{
+  return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castps_pd (__m512 __A)
+{
+  return (__m512d) (__A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_castps_si512 (__m512 __A)
+{
+  return (__m512i) (__A);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_castps128_ps512 (__m128 __A)
+{
+    return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_castsi128_si512 (__m128i __A)
+{
+   return  __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_castsi256_si512 (__m256i __A)
+{
+   return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castsi512_ps (__m512i __A)
+{
+  return (__m512) (__A);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castsi512_pd (__m512i __A)
+{
+  return (__m512d) (__A);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm512_castsi512_si128 (__m512i __A)
+{
+  return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm512_castsi512_si256 (__m512i __A)
+{
+  return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_int2mask(int __a)
+{
+  return (__mmask16)__a;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask2int(__mmask16 __a)
+{
+  return (int)__a;
+}
+
+/* Bitwise operators */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_epi32(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v16su)__a & (__v16su)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+                (__v16si) _mm512_and_epi32(__a, __b),
+                (__v16si) __src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
+                                         __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_epi64(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a & (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+    return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
+                (__v8di) _mm512_and_epi64(__a, __b),
+                (__v8di) __src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
+                                         __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i)(~(__v16su)(__A) & (__v16su)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_andnot_epi32(__A, __B),
+                                         (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
+                                           __U, __A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_andnot_epi64(__A, __B),
+                                          (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
+                                           __U, __A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_epi32(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v16su)__a | (__v16su)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+                                             (__v16si)_mm512_or_epi32(__a, __b),
+                                             (__v16si)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_epi64(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a | (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
+                                             (__v8di)_mm512_or_epi64(__a, __b),
+                                             (__v8di)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_epi32(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v16su)__a ^ (__v16su)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+                                            (__v16si)_mm512_xor_epi32(__a, __b),
+                                            (__v16si)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_epi64(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a ^ (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
+                                             (__v8di)_mm512_xor_epi64(__a, __b),
+                                             (__v8di)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_si512(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a & (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_si512(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a | (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_si512(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a ^ (__v8du)__b);
+}
+
+/* Arithmetic */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_add_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)((__v8df)__a + (__v8df)__b);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_add_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)((__v16sf)__a + (__v16sf)__b);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mul_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)((__v8df)__a * (__v8df)__b);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mul_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)((__v16sf)__a * (__v16sf)__b);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_sub_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)((__v8df)__a - (__v8df)__b);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_sub_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)((__v16sf)__a - (__v16sf)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A + (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_add_epi64(__A, __B),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_add_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A - (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_sub_epi64(__A, __B),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_sub_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A + (__v16su) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_add_epi32(__A, __B),
+                                             (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_add_epi32(__A, __B),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A - (__v16su) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_sub_epi32(__A, __B),
+                                             (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_sub_epi32(__A, __B),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_max_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_max_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_undefined_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_max_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_max_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_max_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
+
+#define _mm512_max_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_undefined_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_max_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_max_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_max_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_max_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_max_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline __m512i
+__DEFAULT_FN_ATTRS
+_mm512_max_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+#define _mm512_mask_min_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_min_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_min_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_undefined_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_min_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_min_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_min_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
+
+#define _mm512_min_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_undefined_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_min_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_min_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_min_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_min_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_min_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline __m512i
+__DEFAULT_FN_ATTRS
+_mm512_min_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mul_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epi32(__X, __Y),
+                                             (__v8di)__W);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epi32(__X, __Y),
+                                             (__v8di)_mm512_setzero_si512 ());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mul_epu32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epu32(__X, __Y),
+                                             (__v8di)__W);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epu32(__X, __Y),
+                                             (__v8di)_mm512_setzero_si512 ());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A * (__v16su) __B);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                             (__v16si)_mm512_mullo_epi32(__A, __B),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                             (__v16si)_mm512_mullo_epi32(__A, __B),
+                                             (__v16si)__W);
+}
+
+#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_sqrt_round_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(U), (int)(R)); })
+
+#define _mm512_sqrt_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_undefined_pd(), \
+                                         (__mmask8)-1, (int)(R)); })
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_sqrt_pd(__m512d __a)
+{
+  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__a,
+                                                (__v8df) _mm512_setzero_pd (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                   (__v8df) __W,
+                   (__mmask8) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                   (__v8df)
+                   _mm512_setzero_pd (),
+                   (__mmask8) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_sqrt_round_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_sqrt_round_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(U), (int)(R)); })
+
+#define _mm512_sqrt_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_undefined_ps(), \
+                                        (__mmask16)-1, (int)(R)); })
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_sqrt_ps(__m512 __a)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__a,
+                                               (__v16sf) _mm512_setzero_ps (),
+                                               (__mmask16) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A,
+                                               (__v16sf) _mm512_setzero_ps (),
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_rsqrt14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                 (__v8df)
+                 _mm512_setzero_pd (),
+                 (__mmask8) -1);}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                  (__v8df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_rsqrt14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U);
+}
+
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+             (__v4sf) __B,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_ps (),
+          (__mmask8) __U);
+}
+
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rsqrt14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
+              (__v2df) __B,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_rcp14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+               (__v8df)
+               _mm512_setzero_pd (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_rcp14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+              (__v16sf)
+              _mm512_setzero_ps (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+                   (__v16sf)
+                   _mm512_setzero_ps (),
+                   (__mmask16) __U);
+}
+
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+                 (__v4sf) __B,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_ps (),
+          (__mmask8) __U);
+}
+
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rcp14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
+            (__v2df) __B,
+            (__v2df)
+            _mm_setzero_pd (),
+            (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_floor_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_FLOOR,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                   _MM_FROUND_FLOOR,
+                   (__v16sf) __W, __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_floor_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_FLOOR,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                _MM_FROUND_FLOOR,
+                (__v8df) __W, __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                   _MM_FROUND_CEIL,
+                   (__v16sf) __W, __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_ceil_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_CEIL,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_ceil_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_CEIL,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                _MM_FROUND_CEIL,
+                (__v8df) __W, __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi64(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi32(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_add_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_add_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_add_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_add_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_add_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_add_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_add_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_add_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_add_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_add_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_add_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_add_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_add_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_add_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_add_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_add_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_sub_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_sub_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_sub_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_sub_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_sub_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_sub_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_sub_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_sub_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_sub_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_sub_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_sub_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_sub_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_sub_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_sub_round_ps(W, U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); });
+
+#define _mm512_maskz_sub_round_ps(U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); });
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_mul_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_mul_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_mul_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mul_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_mul_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_mul_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_mul_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_mul_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_mul_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_mul_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_mul_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_mul_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_mul_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_mul_round_ps(W, U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); });
+
+#define _mm512_maskz_mul_round_ps(U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); });
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_div_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_div_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_div_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_div_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_div_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)((__v8df)__a/(__v8df)__b);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_div_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_div_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_div_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)((__v16sf)__a/(__v16sf)__b);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_div_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_div_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_div_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_div_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_div_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_div_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_div_round_ps(W, U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); });
+
+#define _mm512_maskz_div_round_ps(U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); });
+
+#define _mm512_roundscale_ps(A, B) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
+                                         (__v16sf)(__m512)(A), (__mmask16)-1, \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_ps(A, B, C, imm) __extension__ ({\
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
+                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_roundscale_ps(A, B, imm) __extension__ ({\
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(A), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
+                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(A), (int)(R)); })
+
+#define _mm512_roundscale_round_ps(A, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
+                                         (__v16sf)_mm512_undefined_ps(), \
+                                         (__mmask16)-1, (int)(R)); })
+
+#define _mm512_roundscale_pd(A, B) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
+                                          (__v8df)(__m512d)(A), (__mmask8)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_pd(A, B, C, imm) __extension__ ({\
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
+                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_roundscale_pd(A, B, imm) __extension__ ({\
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(A), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
+                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
+                                          (int)(R)); })
+
+#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(A), (int)(R)); })
+
+#define _mm512_roundscale_round_pd(A, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
+                                          (__v8df)_mm512_undefined_pd(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
+                                           (int)(R)); })
+
+
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            -(__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
+                                           (int)(R)); })
+
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R)); })
+
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            -(__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
+                                          (int)(R)); })
+
+
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           -(__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
+                                          (int)(R)); })
+
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R)); })
+
+
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           -(__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8df)(__m512d)(C), \
+                                              (__mmask8)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8df)(__m512d)(C), \
+                                              (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              -(__v8df)(__m512d)(C), \
+                                              (__mmask8)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              -(__v8df)(__m512d)(C), \
+                                              (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               -(__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        -(__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16sf)(__m512)(C), \
+                                             (__mmask16)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16sf)(__m512)(C), \
+                                             (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             -(__v16sf)(__m512)(C), \
+                                             (__mmask16)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             -(__v16sf)(__m512)(C), \
+                                             (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              -(__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       -(__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8df)(__m512d)(C), \
+                                             (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16sf)(__m512)(C), \
+                                            (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16sf) __C,
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+
+
+/* Vector permutations */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16si) __A,
+                                                       (__v16si) __B,
+                                                       (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U,
+                                __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+                                                        /* idx */ ,
+                                                        (__v16si) __A,
+                                                        (__v16si) __B,
+                                                        (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
+                                 __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I
+                                                        /* idx */ ,
+                                                        (__v16si) __A,
+                                                        (__v16si) __B,
+                                                        (__mmask16) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8di) __A,
+                                                       (__v8di) __B,
+                                                       (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
+                                __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8di) __A,
+                                                       (__v8di) __B,
+                                                       (__mmask8) __U);
+}
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I
+                                                        /* idx */ ,
+                                                        (__v8di) __A,
+                                                        (__v8di) __B,
+                                                        (__mmask8) __U);
+}
+
+#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \
+                                   (__v8di)(__m512i)(A), \
+                                   ((int)(I) & 0x7) + 0, \
+                                   ((int)(I) & 0x7) + 1, \
+                                   ((int)(I) & 0x7) + 2, \
+                                   ((int)(I) & 0x7) + 3, \
+                                   ((int)(I) & 0x7) + 4, \
+                                   ((int)(I) & 0x7) + 5, \
+                                   ((int)(I) & 0x7) + 6, \
+                                   ((int)(I) & 0x7) + 7); })
+
+#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+                                 (__v8di)(__m512i)(W)); })
+
+#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+                                 (__v8di)_mm512_setzero_si512()); })
+
+#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \
+                                   (__v16si)(__m512i)(A), \
+                                   ((int)(I) & 0xf) + 0, \
+                                   ((int)(I) & 0xf) + 1, \
+                                   ((int)(I) & 0xf) + 2, \
+                                   ((int)(I) & 0xf) + 3, \
+                                   ((int)(I) & 0xf) + 4, \
+                                   ((int)(I) & 0xf) + 5, \
+                                   ((int)(I) & 0xf) + 6, \
+                                   ((int)(I) & 0xf) + 7, \
+                                   ((int)(I) & 0xf) + 8, \
+                                   ((int)(I) & 0xf) + 9, \
+                                   ((int)(I) & 0xf) + 10, \
+                                   ((int)(I) & 0xf) + 11, \
+                                   ((int)(I) & 0xf) + 12, \
+                                   ((int)(I) & 0xf) + 13, \
+                                   ((int)(I) & 0xf) + 14, \
+                                   ((int)(I) & 0xf) + 15); })
+
+#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+                                (__v16si)(__m512i)(W)); })
+
+#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+                                (__v16si)_mm512_setzero_si512()); })
+/* Vector Extract */
+
+#define _mm512_extractf64x4_pd(A, I) __extension__ ({             \
+  (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A),          \
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   ((I) & 1) ? 4 : 0,             \
+                                   ((I) & 1) ? 5 : 1,             \
+                                   ((I) & 1) ? 6 : 2,             \
+                                   ((I) & 1) ? 7 : 3); })
+
+#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                   (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+                                   (__v4df)(W)); })
+
+#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                   (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+                                   (__v4df)_mm256_setzero_pd()); })
+
+#define _mm512_extractf32x4_ps(A, I) __extension__ ({             \
+  (__m128)__builtin_shufflevector((__v16sf)(__m512)(A),           \
+                                  (__v16sf)_mm512_undefined_ps(), \
+                                  0 + ((I) & 0x3) * 4,            \
+                                  1 + ((I) & 0x3) * 4,            \
+                                  2 + ((I) & 0x3) * 4,            \
+                                  3 + ((I) & 0x3) * 4); })
+
+#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                   (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+                                   (__v4sf)(W)); })
+
+#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                   (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+                                   (__v4sf)_mm_setzero_ps()); })
+
+/* Vector Blend */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
+{
+  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+                 (__v8df) __W,
+                 (__v8df) __A);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
+{
+  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+                (__v16sf) __W,
+                (__v16sf) __A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
+                (__v8di) __W,
+                (__v8di) __A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
+                (__v16si) __W,
+                (__v16si) __A);
+}
+
+/* Compare */
+
+#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), (int)(P), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), (int)(P), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_cmp_ps_mask(A, B, P) \
+  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmpeq_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
+
+#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), (int)(P), \
+                                         (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), (int)(P), \
+                                         (__mmask8)(U), (int)(R)); })
+
+#define _mm512_cmp_pd_mask(A, B, P) \
+  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmpeq_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
+
+/* Conversion */
+
+#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
+                                             (__v16si)_mm512_undefined_epi32(), \
+                                             (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
+                                             (__v16si)(__m512i)(W), \
+                                             (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
+                                             (__v16si)_mm512_setzero_si512(), \
+                                             (__mmask16)(U), (int)(R)); })
+
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epu32(__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                   (__v16si) __W,
+                   (__mmask16) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                   (__v16si) _mm512_setzero_si512 (),
+                   (__mmask16) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
+                                           (__v16sf)(__m512)(W), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                 (__v16sf) _mm512_undefined_ps (),
+                 (__mmask16) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                 (__v16sf) _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_pd(__m256i __A)
+{
+  return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepi32_pd(__A),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepi32_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi32lo_pd(__m512i __A)
+{
+  return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+  return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                (__v16sf) _mm512_undefined_ps (),
+                (__mmask16) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                (__v16sf) __W,
+                (__mmask16) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                (__v16sf) _mm512_setzero_ps (),
+                (__mmask16) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_pd(__m256i __A)
+{
+  return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepu32_pd(__A),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepu32_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu32lo_pd(__m512i __A)
+{
+  return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+  return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
+}
+
+#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
+                                          (__v8sf)(__m256)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_ps(U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtpd_ps (__m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                (__v8sf) _mm256_undefined_ps (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                (__v8sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                (__v8sf) _mm256_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtpd_pslo (__m512d __A)
+{
+  return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
+                (__v8sf) _mm256_setzero_ps (),
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
+{
+  return (__m512) __builtin_shufflevector (
+                (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
+                                               __U, __A),
+                (__v8sf) _mm256_setzero_ps (),
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)_mm256_undefined_si256(), \
+                                            (__mmask16)-1); })
+
+#define _mm512_mask_cvt_roundps_ph(U, W, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)(__m256i)(U), \
+                                            (__mmask16)(W)); })
+
+#define _mm512_maskz_cvt_roundps_ph(W, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)_mm256_setzero_si256(), \
+                                            (__mmask16)(W)); })
+
+#define _mm512_cvtps_ph(A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)_mm256_setzero_si256(), \
+                                            (__mmask16)-1); })
+
+#define _mm512_mask_cvtps_ph(U, W, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)(__m256i)(U), \
+                                            (__mmask16)(W)); })
+
+#define _mm512_maskz_cvtps_ph(W, A, I) __extension__ ({\
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)_mm256_setzero_si256(), \
+                                            (__mmask16)(W)); })
+
+#define _mm512_cvt_roundph_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
+                                           (__v16sf)_mm512_undefined_ps(), \
+                                           (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundph_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
+                                           (__v16sf)(__m512)(W), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundph_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+static  __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtph_ps(__m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                 (__v16sf) _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)(__m256i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi32(__m512d __a)
+{
+  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
+                                                   (__v8si)_mm256_setzero_si256(),
+                                                   (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                  (__v8si) _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)(__m512i)(W), \
+                                            (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)(U), (int)(R)); })
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi32(__m512 __a)
+{
+  return (__m512i)
+    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
+                                     (__v16si) _mm512_setzero_si512 (),
+                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                  (__v16si) _mm512_setzero_si512 (),
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+                                           (__v16si)(__m512i)(W), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epi32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epi32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                 (__v16si) _mm512_undefined_epi32 (),
+                 (__mmask16) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8si)_mm256_setzero_si256(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8si)(__m256i)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8si)_mm256_setzero_si256(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epi32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                 (__v8si)
+                 _mm256_undefined_si256 (),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)(__m512i)(W), \
+                                            (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epu32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epu32 ( __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\
+                  (__v16si)\
+                  _mm512_undefined_epi32 (),\
+                  (__mmask16) -1,\
+                  _MM_FROUND_CUR_DIRECTION);\
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                  (__v16si) 
+                  _mm512_setzero_si512 (),
+                  (__mmask16) __U ,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_undefined_si256 (),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Unpack and Interleave */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_unpackhi_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
+                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
+                                           (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
+                                           (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_unpacklo_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
+                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
+                                           (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
+                                           (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_unpackhi_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
+                                         2,    18,    3,    19,
+                                         2+4,  18+4,  3+4,  19+4,
+                                         2+8,  18+8,  3+8,  19+8,
+                                         2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
+                                          (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
+                                          (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_unpacklo_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
+                                         0,    16,    1,    17,
+                                         0+4,  16+4,  1+4,  17+4,
+                                         0+8,  16+8,  1+8,  17+8,
+                                         0+12, 16+12, 1+12, 17+12);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
+                                          (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
+                                          (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
+                                          2,    18,    3,    19,
+                                          2+4,  18+4,  3+4,  19+4,
+                                          2+8,  18+8,  3+8,  19+8,
+                                          2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
+                                       (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
+                                       (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
+                                          0,    16,    1,    17,
+                                          0+4,  16+4,  1+4,  17+4,
+                                          0+8,  16+8,  1+8,  17+8,
+                                          0+12, 16+12, 1+12, 17+12);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
+                                       (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
+                                       (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
+                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
+                                        (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
+                                        (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
+                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
+                                        (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
+                                        (__v8di)_mm512_setzero_si512());
+}
+
+/* Bit Test */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS
+_mm512_test_epi32_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+            (__v16si) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+                 (__v16si) __B, __U);
+}
+
+static __inline __mmask8 __DEFAULT_FN_ATTRS
+_mm512_test_epi64_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
+                 (__v8di) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U);
+}
+
+
+/* SIMD load ops */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_loadu_si512 (void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask16) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m512d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m512 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_load_ps(float const *__p)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_load_pd(double const *__p)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+                          (__v8df) __W,
+                          (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_load_si512 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_load_epi32 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_load_epi64 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+/* SIMD store ops */
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
+                                     (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_si512 (void *__P, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
+            (__mmask16) -1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
+                                     (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_pd(void *__P, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask((double *)__P, (__v8df)__A, (__mmask8)-1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_ps(void *__P, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask((float *)__P, (__v16sf)__A, (__mmask16)-1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_pd(void *__P, __m512d __A)
+{
+  *(__m512d*)__P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_ps(void *__P, __m512 __A)
+{
+  *(__m512*)__P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_si512 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_epi32 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_epi64 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+/* Mask ops */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS
+_mm512_knot(__mmask16 __M)
+{
+  return __builtin_ia32_knothi(__M);
+}
+
+/* Integer compare */
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                __u);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi8_epi32(__m128i __A)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepi8_epi32(__A),
+                                             (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepi8_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi8_epi64(__m128i __A)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi8_epi64(__A),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi8_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi64(__m256i __X)
+{
+  return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi32_epi64(__X),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi32_epi64(__X),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi32(__m256i __A)
+{
+  return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepi16_epi32(__A),
+                                            (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepi16_epi32(__A),
+                                            (__v16si)_mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi64(__m128i __A)
+{
+  return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi16_epi64(__A),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi16_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu8_epi32(__m128i __A)
+{
+  return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepu8_epi32(__A),
+                                             (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepu8_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu8_epi64(__m128i __A)
+{
+  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu8_epi64(__A),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu8_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_epi64(__m256i __X)
+{
+  return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu32_epi64(__X),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu32_epi64(__X),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu16_epi32(__m256i __A)
+{
+  return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepu16_epi32(__A),
+                                            (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepu16_epi32(__A),
+                                            (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu16_epi64(__m128i __A)
+{
+  return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu16_epi64(__A),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu16_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rorv_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rorv_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __U);
+}
+
+
+
+#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (int)(p), \
+                                         (__mmask16)-1); })
+
+#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (int)(p), \
+                                          (__mmask16)-1); })
+
+#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (int)(p), \
+                                         (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (int)(p), \
+                                          (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm512_rol_epi32(a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
+                                        (__v16si)_mm512_setzero_si512(), \
+                                        (__mmask16)-1); })
+
+#define _mm512_mask_rol_epi32(W, U, a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
+                                        (__v16si)(__m512i)(W), \
+                                        (__mmask16)(U)); })
+
+#define _mm512_maskz_rol_epi32(U, a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
+                                        (__v16si)_mm512_setzero_si512(), \
+                                        (__mmask16)(U)); })
+
+#define _mm512_rol_epi64(a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
+                                        (__v8di)_mm512_setzero_si512(), \
+                                        (__mmask8)-1); })
+
+#define _mm512_mask_rol_epi64(W, U, a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
+                                        (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+#define _mm512_maskz_rol_epi64(U, a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
+                                        (__v8di)_mm512_setzero_si512(), \
+                                        (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rolv_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rolv_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __U);
+}
+
+#define _mm512_ror_epi32(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
+                                        (__v16si)_mm512_setzero_si512(), \
+                                        (__mmask16)-1); })
+
+#define _mm512_mask_ror_epi32(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
+                                        (__v16si)(__m512i)(W), \
+                                        (__mmask16)(U)); })
+
+#define _mm512_maskz_ror_epi32(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
+                                        (__v16si)_mm512_setzero_si512(), \
+                                        (__mmask16)(U)); })
+
+#define _mm512_ror_epi64(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
+                                        (__v8di)_mm512_setzero_si512(), \
+                                        (__mmask8)-1); })
+
+#define _mm512_mask_ror_epi64(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
+                                        (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+#define _mm512_maskz_ror_epi64(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
+                                        (__v8di)_mm512_setzero_si512(), \
+                                        (__mmask8)(U)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi32(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_slli_epi32(__A, __B),
+                                         (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B) {
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_slli_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi64(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_slli_epi64(__A, __B),
+                                          (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_slli_epi64(__A, __B),
+                                          (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi32(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_srli_epi32(__A, __B),
+                                         (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) {
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_srli_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi64(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_srli_epi64(__A, __B),
+                                          (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_srli_epi64(__A, __B),
+                                          (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
+          (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
+                 (__v16si) __A,
+                 (__v16si) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
+                 (__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
+                 (__v8di) __A,
+                 (__v8di) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
+                 (__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_movedup_pd (__m512d __A)
+{
+  return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
+                                          0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_movedup_pd(__A),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_movedup_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+#define _mm512_fixupimm_round_pd(A, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm512_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)-1, \
+                                             _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)(U), \
+                                             _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8di)(__m512i)(C), \
+                                              (int)(imm), (__mmask8)(U), \
+                                              (int)(R)); })
+
+#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8di)(__m512i)(C), \
+                                              (int)(imm), (__mmask8)(U), \
+                                              _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_fixupimm_round_ps(A, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)(U), (int)(R)); })
+
+#define _mm512_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)-1, \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)(U), \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16si)(__m512i)(C), \
+                                             (int)(imm), (__mmask16)(U), \
+                                             (int)(R)); })
+
+#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16si)(__m512i)(C), \
+                                             (int)(imm), (__mmask16)(U), \
+                                             _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_fixupimm_round_sd(A, B, C, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2di)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2di)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)(U), (int)(R)); })
+
+#define _mm_fixupimm_sd(A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2di)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_fixupimm_sd(A, U, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2di)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2di)(__m128i)(C), (int)(imm), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2di)(__m128i)(C), (int)(imm), \
+                                           (__mmask8)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_fixupimm_round_ss(A, B, C, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4si)(__m128i)(C), (int)(imm), \
+                                         (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4si)(__m128i)(C), (int)(imm), \
+                                         (__mmask8)(U), (int)(R)); })
+
+#define _mm_fixupimm_ss(A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4si)(__m128i)(C), (int)(imm), \
+                                         (__mmask8)-1, \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_fixupimm_ss(A, U, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4si)(__m128i)(C), (int)(imm), \
+                                         (__mmask8)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4si)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4si)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_getexp_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
+                                                 (__v2df)(__m128d)(B), \
+                                                 (__v2df)_mm_setzero_pd(), \
+                                                 (__mmask8)-1, (int)(R)); })
+
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_getexp_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
+                 (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
+                                                 (__v2df)(__m128d)(B), \
+                                                 (__v2df)(__m128d)(W), \
+                                                 (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_sd(U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
+                                                 (__v2df)(__m128d)(B), \
+                                                 (__v2df)_mm_setzero_pd(), \
+                                                 (__mmask8)(U), (int)(R)); })
+
+#define _mm_getexp_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
+                                                (__v4sf)(__m128)(B), \
+                                                (__v4sf)_mm_setzero_ps(), \
+                                                (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_getexp_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+                (__v4sf) __B, (__v4sf)  _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
+                                                (__v4sf)(__m128)(B), \
+                                                (__v4sf)(__m128)(W), \
+                                                (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_pd (),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_ss(U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
+                                                (__v4sf)(__m128)(B), \
+                                                (__v4sf)_mm_setzero_ps(), \
+                                                (__mmask8)(U), (int)(R)); })
+
+#define _mm_getmant_round_sd(A, B, C, D, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)-1, (int)(R)); })
+
+#define _mm_getmant_sd(A, B, C, D)  __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)-1, \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_getmant_sd(W, U, A, B, C, D) __extension__ ({\
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)(__m128d)(W), \
+                                               (__mmask8)(U), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R)({\
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)(__m128d)(W), \
+                                               (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_getmant_sd(U, A, B, C, D) __extension__ ({\
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)(U), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) __extension__ ({\
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)(U), (int)(R)); })
+
+#define _mm_getmant_round_ss(A, B, C, D, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+#define _mm_getmant_ss(A, B, C, D) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)-1, \
+                                              _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_getmant_ss(W, U, A, B, C, D) __extension__ ({\
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)(__m128)(W), \
+                                              (__mmask8)(U), \
+                                              _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R)({\
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)(__m128)(W), \
+                                              (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_getmant_ss(U, A, B, C, D) __extension__ ({\
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)_mm_setzero_pd(), \
+                                              (__mmask8)(U), \
+                                              _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) __extension__ ({\
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)(U), (int)(R)); })
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kmov (__mmask16 __A)
+{
+  return  __A;
+}
+
+#define _mm_comi_round_sd(A, B, P, R) __extension__ ({\
+  (int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+                              (int)(P), (int)(R)); })
+
+#define _mm_comi_round_ss(A, B, P, R) __extension__ ({\
+  (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+                              (int)(P), (int)(R)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
+         __mmask16 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A,
+                   (__v16si) __I
+                   /* idx */ ,
+                   (__v16si) __B,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi32(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sll_epi32(__A, __B),
+                                          (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sll_epi32(__A, __B),
+                                          (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi64(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_sll_epi64(__A, __B),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_sll_epi64(__A, __B),
+                                           (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
+                                           (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
+                                           (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi64(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
+                                            (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
+                                            (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi32(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sra_epi32(__A, __B),
+                                          (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sra_epi32(__A, __B),
+                                          (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi64(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_sra_epi64(__A, __B),
+                                           (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_sra_epi64(__A, __B),
+                                           (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srav_epi32(__X, __Y),
+                                           (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srav_epi32(__X, __Y),
+                                           (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi64(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srav_epi64(__X, __Y),
+                                            (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srav_epi64(__X, __Y),
+                                            (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi32(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_srl_epi32(__A, __B),
+                                          (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_srl_epi32(__A, __B),
+                                          (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi64(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_srl_epi64(__A, __B),
+                                           (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_srl_epi64(__A, __B),
+                                           (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
+                                           (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
+                                           (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
+                                            (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
+                                            (__v8di)_mm512_setzero_si512());
+}
+
+#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
+                                            (__v16si)(__m512i)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)-1); })
+
+#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
+                                            (__v16si)(__m512i)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)(U)); })
+
+#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
+                                             (__v16si)(__m512i)(B), \
+                                             (__v16si)(__m512i)(C), \
+                                             (int)(imm), (__mmask16)(U)); })
+
+#define _mm512_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
+                                            (__v8di)(__m512i)(B), \
+                                            (__v8di)(__m512i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
+                                            (__v8di)(__m512i)(B), \
+                                            (__v8di)(__m512i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
+                                             (__v8di)(__m512i)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
+
+#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvt_roundsd_i32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvt_roundsd_u32(A, R) __extension__ ({ \
+  (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvtsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \
+  (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
+                                                  (int)(R)); })
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvtsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
+                 __A,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvt_roundss_si32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvt_roundss_i32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundss_si64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvt_roundss_i64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#endif
+
+#define _mm_cvt_roundss_u32(A, R) __extension__ ({ \
+  (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); })
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvtss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundss_u64(A, R) __extension__ ({ \
+  (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
+                                                  (int)(R)); })
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvtss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
+                 __A,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvtt_roundsd_si32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttsd_i32 (__m128d __A)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvtt_roundsd_i64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_i64 (__m128d __A)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \
+  (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvttsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \
+  (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
+                                                   (int)(R)); })
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
+                  __A,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvtt_roundss_si32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttss_i32 (__m128 __A)
+{
+  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvtt_roundss_si64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttss_i64 (__m128 __A)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \
+  (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); })
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvttss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \
+  (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
+                                                   (int)(R)); })
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvttss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
+                  __A,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
+            __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A,
+              (__v8di) __I
+              /* idx */ ,
+              (__v8df) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
+            __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A,
+                   (__v16si) __I
+                   /* idx */ ,
+                   (__v16sf) __B,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
+         __mmask8 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A,
+                   (__v8di) __I
+                   /* idx */ ,
+                   (__v8di) __B,
+                   (__mmask8) __U);
+}
+
+#define _mm512_permute_pd(X, C) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x1), \
+                                   0 + (((C) >> 1) & 0x1), \
+                                   2 + (((C) >> 2) & 0x1), \
+                                   2 + (((C) >> 3) & 0x1), \
+                                   4 + (((C) >> 4) & 0x1), \
+                                   4 + (((C) >> 5) & 0x1), \
+                                   6 + (((C) >> 6) & 0x1), \
+                                   6 + (((C) >> 7) & 0x1)); })
+
+#define _mm512_mask_permute_pd(W, U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permute_pd((X), (C)), \
+                                       (__v8df)(__m512d)(W)); })
+
+#define _mm512_maskz_permute_pd(U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permute_pd((X), (C)), \
+                                       (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_permute_ps(X, C) __extension__ ({ \
+  (__m512)__builtin_shufflevector((__v16sf)(__m512)(X), \
+                                  (__v16sf)_mm512_undefined_ps(), \
+                                   0  + (((C) >> 0) & 0x3), \
+                                   0  + (((C) >> 2) & 0x3), \
+                                   0  + (((C) >> 4) & 0x3), \
+                                   0  + (((C) >> 6) & 0x3), \
+                                   4  + (((C) >> 0) & 0x3), \
+                                   4  + (((C) >> 2) & 0x3), \
+                                   4  + (((C) >> 4) & 0x3), \
+                                   4  + (((C) >> 6) & 0x3), \
+                                   8  + (((C) >> 0) & 0x3), \
+                                   8  + (((C) >> 2) & 0x3), \
+                                   8  + (((C) >> 4) & 0x3), \
+                                   8  + (((C) >> 6) & 0x3), \
+                                   12 + (((C) >> 0) & 0x3), \
+                                   12 + (((C) >> 2) & 0x3), \
+                                   12 + (((C) >> 4) & 0x3), \
+                                   12 + (((C) >> 6) & 0x3)); })
+
+#define _mm512_mask_permute_ps(W, U, X, C) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_permute_ps((X), (C)), \
+                                      (__v16sf)(__m512)(W)); })
+
+#define _mm512_maskz_permute_ps(U, X, C) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_permute_ps((X), (C)), \
+                                      (__v16sf)_mm512_setzero_ps()); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_permutevar_pd(__m512d __A, __m512i __C)
+{
+  return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                         (__v8df)_mm512_permutevar_pd(__A, __C),
+                                         (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                         (__v8df)_mm512_permutevar_pd(__A, __C),
+                                         (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_permutevar_ps(__m512 __A, __m512i __C)
+{
+  return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
+                                        (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
+                                        (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                    /* idx */ ,
+                    (__v8df) __A,
+                    (__v8df) __B,
+                    (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                    /* idx */ ,
+                    (__v8df) __A,
+                    (__v8df) __B,
+                    (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
+            __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I
+                                                         /* idx */ ,
+                                                         (__v8df) __A,
+                                                         (__v8df) __B,
+                                                         (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                         /* idx */ ,
+                                                         (__v16sf) __A,
+                                                         (__v16sf) __B,
+                                                         (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                         /* idx */ ,
+                                                         (__v16sf) __A,
+                                                         (__v16sf) __B,
+                                                         (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
+            __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I
+                                                        /* idx */ ,
+                                                        (__v16sf) __A,
+                                                        (__v16sf) __B,
+                                                        (__mmask16) __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+             (__v16si) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+             (__v16si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+            (__v8di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+            (__v8di) __B, __U);
+}
+
+#define _mm512_cvtt_roundpd_epu32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8si)_mm256_undefined_si256(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8si)(__m256i)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8si)_mm256_setzero_si256(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_undefined_si256 (),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_roundscale_round_sd(A, B, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)-1, (int)(imm), \
+                                                (int)(R)); })
+
+#define _mm_roundscale_sd(A, B, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)-1, (int)(imm), \
+                                                _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_sd(W, U, A, B, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)(__m128d)(W), \
+                                                (__mmask8)(U), (int)(imm), \
+                                                _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)(__m128d)(W), \
+                                                (__mmask8)(U), (int)(I), \
+                                                (int)(R)); })
+
+#define _mm_maskz_roundscale_sd(U, A, B, I) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U), (int)(I), \
+                                                _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U), (int)(I), \
+                                                (int)(R)); })
+
+#define _mm_roundscale_round_ss(A, B, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)-1, (int)(imm), \
+                                               (int)(R)); })
+
+#define _mm_roundscale_ss(A, B, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)-1, (int)(imm), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_ss(W, U, A, B, I) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)(__m128)(W), \
+                                               (__mmask8)(U), (int)(I), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)(__m128)(W), \
+                                               (__mmask8)(U), (int)(I), \
+                                               (int)(R)); })
+
+#define _mm_maskz_roundscale_ss(U, A, B, I) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)(U), (int)(I), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)(U), (int)(I), \
+                                               (int)(R)); })
+
+#define _mm512_scalef_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_scalef_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_scalef_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)_mm512_undefined_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_scalef_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_undefined_ps (),
+               (__mmask16) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf) __W,
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_setzero_ps (),
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_scalef_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_scalef_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
+              (__v2df)( __B), (__v2df) _mm_setzero_pd(),
+              (__mmask8) -1,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2df)(__m128d)(W), \
+                                              (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)(U), (int)(R)); })
+
+#define _mm_scalef_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_scalef_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
+             (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4sf)(__m128)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+                 (__v4sf) __B,
+                (__v4sf) _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)(U), \
+                                             _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi32(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+                                         (__v16si)_mm512_srai_epi32(__A, __B), \
+                                         (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) {
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+                                         (__v16si)_mm512_srai_epi32(__A, __B), \
+                                         (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi64(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+                                          (__v8di)_mm512_srai_epi64(__A, __B), \
+                                          (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+                                          (__v8di)_mm512_srai_epi64(__A, __B), \
+                                          (__v8di)_mm512_setzero_si512());
+}
+
+#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_undefined_ps(), \
+                                         (__mmask16)-1); })
+
+#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)(__m512)(W), \
+                                         (__mmask16)(U)); })
+
+#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(U)); })
+
+#define _mm512_shuffle_f64x2(A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_undefined_pd(), \
+                                          (__mmask8)-1); })
+
+#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)(__m512d)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_shuffle_i32x4(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), (int)(imm), \
+                                          (__v16si)_mm512_setzero_si512(), \
+                                          (__mmask16)-1); })
+
+#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), (int)(imm), \
+                                          (__v16si)(__m512i)(W), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), (int)(imm), \
+                                          (__v16si)_mm512_setzero_si512(), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_shuffle_i64x2(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), (int)(imm), \
+                                          (__v8di)_mm512_setzero_si512(), \
+                                          (__mmask8)-1); })
+
+#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), (int)(imm), \
+                                          (__v8di)(__m512i)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), (int)(imm), \
+                                          (__v8di)_mm512_setzero_si512(), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_shuffle_pd(A, B, M) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+                                   (__v8df)(__m512d)(B), \
+                                   0  + (((M) >> 0) & 0x1), \
+                                   8  + (((M) >> 1) & 0x1), \
+                                   2  + (((M) >> 2) & 0x1), \
+                                   10 + (((M) >> 3) & 0x1), \
+                                   4  + (((M) >> 4) & 0x1), \
+                                   12 + (((M) >> 5) & 0x1), \
+                                   6  + (((M) >> 6) & 0x1), \
+                                   14 + (((M) >> 7) & 0x1)); })
+
+#define _mm512_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
+                                       (__v8df)(__m512d)(W)); })
+
+#define _mm512_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
+                                       (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_shuffle_ps(A, B, M) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v16sf)(__m512)(A), \
+                                   (__v16sf)(__m512)(B), \
+                                   0  + (((M) >> 0) & 0x3), \
+                                   0  + (((M) >> 2) & 0x3), \
+                                   16 + (((M) >> 4) & 0x3), \
+                                   16 + (((M) >> 6) & 0x3), \
+                                   4  + (((M) >> 0) & 0x3), \
+                                   4  + (((M) >> 2) & 0x3), \
+                                   20 + (((M) >> 4) & 0x3), \
+                                   20 + (((M) >> 6) & 0x3), \
+                                   8  + (((M) >> 0) & 0x3), \
+                                   8  + (((M) >> 2) & 0x3), \
+                                   24 + (((M) >> 4) & 0x3), \
+                                   24 + (((M) >> 6) & 0x3), \
+                                   12 + (((M) >> 0) & 0x3), \
+                                   12 + (((M) >> 2) & 0x3), \
+                                   28 + (((M) >> 4) & 0x3), \
+                                   28 + (((M) >> 6) & 0x3)); })
+
+#define _mm512_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
+                                      (__v16sf)(__m512)(W)); })
+
+#define _mm512_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
+                                      (__v16sf)_mm512_setzero_ps()); })
+
+#define _mm_sqrt_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
+                                            (__v2df)(__m128d)(B), \
+                                            (__v2df)_mm_setzero_pd(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_sqrt_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
+                                            (__v2df)(__m128d)(B), \
+                                            (__v2df)(__m128d)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_sqrt_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
+                                            (__v2df)(__m128d)(B), \
+                                            (__v2df)_mm_setzero_pd(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm_sqrt_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
+                                           (__v4sf)(__m128)(B), \
+                                           (__v4sf)_mm_setzero_ps(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
+                 (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_sqrt_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
+                                           (__v4sf)(__m128)(B), \
+                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                           (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
+                 (__v4sf) __B,
+                (__v4sf) _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_sqrt_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
+                                           (__v4sf)(__m128)(B), \
+                                           (__v4sf)_mm_setzero_ps(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x4 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                 (__v16sf)
+                 _mm512_undefined_ps (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                 (__v16sf) __O,
+                 __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcast_f64x4 (__m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                  (__v8df)
+                  _mm512_undefined_pd (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                  (__v8df) __O,
+                  __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x4 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                  (__v16si)
+                  _mm512_undefined_epi32 (),
+                  (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                  (__v16si) __O,
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i64x4 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                  (__v8di)
+                  _mm512_undefined_epi32 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                  (__v8di) __O,
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512(__M,
+                                              (__v8df) _mm512_broadcastsd_pd(__A),
+                                              (__v8df) __O);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512(__M,
+                                              (__v8df) _mm512_broadcastsd_pd(__A),
+                                              (__v8df) _mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512(__M,
+                                             (__v16sf) _mm512_broadcastss_ps(__A),
+                                             (__v16sf) __O);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512(__M,
+                                             (__v16sf) _mm512_broadcastss_ps(__A),
+                                             (__v16sf) _mm512_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+               (__v16qi) _mm_undefined_si128 (),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+               (__v16hi) _mm256_undefined_si256 (),
+               (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+               (__v16hi) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+               (__v16hi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+               (__v16qi) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+               (__v8si) _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+               (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+               (__v8si) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+               (__v8hi) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                (__v16qi) _mm_undefined_si128 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                (__v16hi) _mm256_undefined_si256 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                (__v16hi) __O,
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                (__v16hi) _mm256_setzero_si256 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                (__v16qi) _mm_undefined_si128 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                (__v8si) _mm256_undefined_si256 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                (__v8si) _mm256_setzero_si256 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                (__v8hi) _mm_undefined_si128 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+              (__v16qi) _mm_undefined_si128 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+              (__v16hi) _mm256_undefined_si256 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+              (__v16hi) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+              (__v16hi) _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+              (__v16qi) _mm_undefined_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+              (__v8si) _mm256_undefined_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+              (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+              (__v8si) _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+              (__v8hi) _mm_undefined_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({            \
+  (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A),             \
+                                   (__v16si)_mm512_undefined_epi32(), \
+                                   0 + ((imm) & 0x3) * 4,             \
+                                   1 + ((imm) & 0x3) * 4,             \
+                                   2 + ((imm) & 0x3) * 4,             \
+                                   3 + ((imm) & 0x3) * 4); })
+
+#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+                                (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+                                (__v4si)__W); })
+
+#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+                                (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+                                (__v4si)_mm_setzero_si128()); })
+
+#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({           \
+  (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A),             \
+                                   (__v8di)_mm512_undefined_epi32(), \
+                                   ((imm) & 1) ? 4 : 0,              \
+                                   ((imm) & 1) ? 5 : 1,              \
+                                   ((imm) & 1) ? 6 : 2,              \
+                                   ((imm) & 1) ? 7 : 3); })
+
+#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,      \
+                                (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+                                (__v4di)__W); })
+
+#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,      \
+                                (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+                                (__v4di)_mm256_setzero_si256()); })
+
+#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+                                 (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \
+                                 ((imm) & 0x1) ?  0 :  8, \
+                                 ((imm) & 0x1) ?  1 :  9, \
+                                 ((imm) & 0x1) ?  2 : 10, \
+                                 ((imm) & 0x1) ?  3 : 11, \
+                                 ((imm) & 0x1) ?  8 :  4, \
+                                 ((imm) & 0x1) ?  9 :  5, \
+                                 ((imm) & 0x1) ? 10 :  6, \
+                                 ((imm) & 0x1) ? 11 :  7); })
+
+#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+                                  (__v8df)(W)); })
+
+#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+                                  (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+                                 (__v8di)_mm512_castsi256_si512((__m256i)(B)), \
+                                 ((imm) & 0x1) ?  0 :  8, \
+                                 ((imm) & 0x1) ?  1 :  9, \
+                                 ((imm) & 0x1) ?  2 : 10, \
+                                 ((imm) & 0x1) ?  3 : 11, \
+                                 ((imm) & 0x1) ?  8 :  4, \
+                                 ((imm) & 0x1) ?  9 :  5, \
+                                 ((imm) & 0x1) ? 10 :  6, \
+                                 ((imm) & 0x1) ? 11 :  7); })
+
+#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+                                  (__v8di)(W)); })
+
+#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+                                  (__v8di)_mm512_setzero_si512()); })
+
+#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \
+  (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+                                  (__v16sf)_mm512_castps128_ps512((__m128)(B)),\
+                                  (((imm) & 0x3) == 0) ? 16 :  0, \
+                                  (((imm) & 0x3) == 0) ? 17 :  1, \
+                                  (((imm) & 0x3) == 0) ? 18 :  2, \
+                                  (((imm) & 0x3) == 0) ? 19 :  3, \
+                                  (((imm) & 0x3) == 1) ? 16 :  4, \
+                                  (((imm) & 0x3) == 1) ? 17 :  5, \
+                                  (((imm) & 0x3) == 1) ? 18 :  6, \
+                                  (((imm) & 0x3) == 1) ? 19 :  7, \
+                                  (((imm) & 0x3) == 2) ? 16 :  8, \
+                                  (((imm) & 0x3) == 2) ? 17 :  9, \
+                                  (((imm) & 0x3) == 2) ? 18 : 10, \
+                                  (((imm) & 0x3) == 2) ? 19 : 11, \
+                                  (((imm) & 0x3) == 3) ? 16 : 12, \
+                                  (((imm) & 0x3) == 3) ? 17 : 13, \
+                                  (((imm) & 0x3) == 3) ? 18 : 14, \
+                                  (((imm) & 0x3) == 3) ? 19 : 15); })
+
+#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+                                 (__v16sf)(W)); })
+
+#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+                                 (__v16sf)_mm512_setzero_ps()); })
+
+#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+                                 (__v16si)_mm512_castsi128_si512((__m128i)(B)),\
+                                 (((imm) & 0x3) == 0) ? 16 :  0, \
+                                 (((imm) & 0x3) == 0) ? 17 :  1, \
+                                 (((imm) & 0x3) == 0) ? 18 :  2, \
+                                 (((imm) & 0x3) == 0) ? 19 :  3, \
+                                 (((imm) & 0x3) == 1) ? 16 :  4, \
+                                 (((imm) & 0x3) == 1) ? 17 :  5, \
+                                 (((imm) & 0x3) == 1) ? 18 :  6, \
+                                 (((imm) & 0x3) == 1) ? 19 :  7, \
+                                 (((imm) & 0x3) == 2) ? 16 :  8, \
+                                 (((imm) & 0x3) == 2) ? 17 :  9, \
+                                 (((imm) & 0x3) == 2) ? 18 : 10, \
+                                 (((imm) & 0x3) == 2) ? 19 : 11, \
+                                 (((imm) & 0x3) == 3) ? 16 : 12, \
+                                 (((imm) & 0x3) == 3) ? 17 : 13, \
+                                 (((imm) & 0x3) == 3) ? 18 : 14, \
+                                 (((imm) & 0x3) == 3) ? 19 : 15); })
+
+#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+                                 (__v16si)(W)); })
+
+#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+                                 (__v16si)_mm512_setzero_si512()); })
+
+#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)_mm512_undefined_pd(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)(__m512d)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_getmant_pd(A, B, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)-1, \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)(__m512d)(W), \
+                                            (__mmask8)(U), \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)(U), \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_getmant_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v16sf)_mm512_undefined_ps(), \
+                                           (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v16sf)(__m512)(W), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_getmant_ps(A, B, C) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2)|(B)), \
+                                           (__v16sf)_mm512_undefined_ps(), \
+                                           (__mmask16)-1, \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2)|(B)), \
+                                           (__v16sf)(__m512)(W), \
+                                           (__mmask16)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2)|(B)), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_getexp_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_getexp_round_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_getexp_round_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_getexp_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_getexp_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)_mm512_undefined_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_getexp_round_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_getexp_round_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_getexp_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_undefined_ps (),
+               (__mmask16) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_setzero_ps (),
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_i64gather_ps(index, addr, scale) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
+                                       (float const *)(addr), \
+                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i64gather_ps( __v1_old, __mask, __index,\
+                                  __addr, __scale) __extension__({\
+__builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
+                              __addr,(__v8di) __index, __mask, __scale);\
+})
+
+#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \
+                                        (int const *)(addr), \
+                                        (__v8di)(__m512i)(index), \
+                                        (__mmask8)-1, (int)(scale)); })
+
+#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v8di)(__m512i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64gather_pd(index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
+                                       (double const *)(addr), \
+                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
+                                       (double const *)(addr), \
+                                       (__v8di)(__m512i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64gather_epi64(index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_pd(), \
+                                       (long long const *)(addr), \
+                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
+                                       (long long const *)(addr), \
+                                       (__v8di)(__m512i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_ps(index, addr, scale) __extension__ ({\
+  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
+                                       (float const *)(addr), \
+                                       (__v16sf)(__m512)(index), \
+                                       (__mmask16)-1, (int)(scale)); })
+
+#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v16sf)(__m512)(index), \
+                                       (__mmask16)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_epi32(index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
+                                        (int const *)(addr), \
+                                        (__v16si)(__m512i)(index), \
+                                        (__mmask16)-1, (int)(scale)); })
+
+#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v16si)(__m512i)(index), \
+                                        (__mmask16)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_pd(index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
+                                       (double const *)(addr), \
+                                       (__v8si)(__m256i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
+                                       (double const *)(addr), \
+                                       (__v8si)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_epi64(index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
+                                       (long long const *)(addr), \
+                                       (__v8si)(__m256i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
+                                       (long long const *)(addr), \
+                                       (__v8si)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64scatter_ps(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)-1, \
+                                (__v8di)(__m512i)(index), \
+                                (__v8sf)(__m256)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)(mask), \
+                                (__v8di)(__m512i)(index), \
+                                (__v8sf)(__m256)(v1), (int)(scale)); })
+
+#define _mm512_i64scatter_epi32(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)-1, \
+                                (__v8di)(__m512i)(index), \
+                                (__v8si)(__m256i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)(mask), \
+                                (__v8di)(__m512i)(index), \
+                                (__v8si)(__m256i)(v1), (int)(scale)); })
+
+#define _mm512_i64scatter_pd(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)-1, \
+                               (__v8di)(__m512i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)(mask), \
+                               (__v8di)(__m512i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_i64scatter_epi64(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)-1, \
+                               (__v8di)(__m512i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)(mask), \
+                               (__v8di)(__m512i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_ps(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)-1, \
+                                (__v16si)(__m512i)(index), \
+                                (__v16sf)(__m512)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)(mask), \
+                                (__v16si)(__m512i)(index), \
+                                (__v16sf)(__m512)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_epi32(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)-1, \
+                                (__v16si)(__m512i)(index), \
+                                (__v16si)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)(mask), \
+                                (__v16si)(__m512i)(index), \
+                                (__v16si)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_pd(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_epi64(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+          (__v4sf) __A,
+          (__v4sf) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        (__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+          (__v4sf) __A,
+          -(__v4sf) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        (__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
+          (__v4sf) __B,
+          -(__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         -(__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+          -(__v4sf) __A,
+          (__v4sf) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        -(__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) __extension__({\
+  (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+          -(__v4sf) __A,
+          -(__v4sf) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        -(__v4sf)(__m128)(A), \
+                                        -(__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
+          (__v4sf) __B,
+          -(__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         -(__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\
+  (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+          (__v2df) __A,
+          (__v2df) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         (__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2df)(__m128d)(C), (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+          (__v2df) __A,
+          -(__v2df) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         (__v2df)(__m128d)(A), \
+                                         -(__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
+          (__v2df) __B,
+          -(__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          -(__v2df)(__m128d)(C), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+          -(__v2df) __A,
+          (__v2df) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         -(__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2df)(__m128d)(C), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) __W,
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) __extension__({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+          -(__v2df) __A,
+          -(__v2df) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         -(__v2df)(__m128d)(A), \
+                                         -(__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
+          (__v2df) __B,
+          -(__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          -(__v2df)(__m128d)(C), \
+                                          (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W),
+          (__v2df) __X,
+          (__v2df) (__Y),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\
+  (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), \
+                                          (__mmask8)(U), (int)(R)); })
+
+#define _mm512_permutex_pd(X, C) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x3), \
+                                   0 + (((C) >> 2) & 0x3), \
+                                   0 + (((C) >> 4) & 0x3), \
+                                   0 + (((C) >> 6) & 0x3), \
+                                   4 + (((C) >> 0) & 0x3), \
+                                   4 + (((C) >> 2) & 0x3), \
+                                   4 + (((C) >> 4) & 0x3), \
+                                   4 + (((C) >> 6) & 0x3)); })
+
+#define _mm512_mask_permutex_pd(W, U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permutex_pd((X), (C)), \
+                                       (__v8df)(__m512d)(W)); })
+
+#define _mm512_maskz_permutex_pd(U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permutex_pd((X), (C)), \
+                                       (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_permutex_epi64(X, C) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(X), \
+                                   (__v8di)_mm512_undefined_epi32(), \
+                                   0 + (((C) >> 0) & 0x3), \
+                                   0 + (((C) >> 2) & 0x3), \
+                                   0 + (((C) >> 4) & 0x3), \
+                                   0 + (((C) >> 6) & 0x3), \
+                                   4 + (((C) >> 0) & 0x3), \
+                                   4 + (((C) >> 2) & 0x3), \
+                                   4 + (((C) >> 4) & 0x3), \
+                                   4 + (((C) >> 6) & 0x3)); })
+
+#define _mm512_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_permutex_epi64((X), (C)), \
+                                      (__v8di)(__m512i)(W)); })
+
+#define _mm512_maskz_permutex_epi64(U, X, C) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_permutex_epi64((X), (C)), \
+                                      (__v8di)_mm512_setzero_si512()); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) _mm512_undefined_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) _mm512_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) _mm512_undefined_epi32 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
+             __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) __W,
+                 __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) _mm512_undefined_ps (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) _mm512_setzero_ps (),
+                (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) _mm512_undefined_epi32 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
+             __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) __W,
+                 __M);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kand (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kandn (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestc (__mmask16 __A, __mmask16 __B)
+{
+  return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestz (__mmask16 __A, __mmask16 __B)
+{
+  return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxnor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_si512 (__m512i * __P, __m512i __A)
+{
+  __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_stream_load_si512 (void *__P)
+{
+  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_pd (double *__P, __m512d __A)
+{
+  __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_ps (float *__P, __m512 __A)
+{
+  __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                  (__v8df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) __U);
+}
+
+#define _mm_cmp_round_ss_mask(X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm_cmp_ss_mask(X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)-1, \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_cmp_ss_mask(M, X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)(M), \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_cmp_round_sd_mask(X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm_cmp_sd_mask(X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)-1, \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_cmp_sd_mask(M, X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)(M), \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_movehdup_ps (__m512 __A)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
+                         1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_movehdup_ps(__A),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_movehdup_ps(__A),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_moveldup_ps (__m512 __A)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
+                         0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_moveldup_ps(__A),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_moveldup_ps(__A),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  __m128 res = __A; 
+  res[0] = (__U & 1) ? __B[0] : __W[0];
+  return res; 
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  __m128 res = __A; 
+  res[0] = (__U & 1) ? __B[0] : 0; 
+  return res; 
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  __m128d res = __A; 
+  res[0] = (__U & 1) ? __B[0] : __W[0];
+  return res; 
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  __m128d res = __A; 
+  res[0] = (__U & 1) ? __B[0] : 0; 
+  return res; 
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storess128_mask ((__v16sf *)__W, 
+                (__v16sf) _mm512_castps128_ps512(__A),
+                (__mmask16) __U & (__mmask16)1);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storesd128_mask ((__v8df *)__W, 
+                (__v8df) _mm512_castpd128_pd512(__A),
+                (__mmask8) __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
+{
+  __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
+                                                (__v4sf) {0.0, 0.0, 0.0, 0.0},
+                                                0, 4, 4, 4);
+
+  return (__m128) __builtin_shufflevector(
+                           __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+                                      (__v16sf) _mm512_castps128_ps512(src),
+                                      (__mmask16) __U & 1),
+                           _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_load_ss (__mmask8 __U, const float* __A)
+{
+  return (__m128) __builtin_shufflevector(
+                           __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+                                      (__v16sf) _mm512_setzero_ps(),
+                                      (__mmask16) __U & 1),
+                           _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
+{
+  __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
+                                                 (__v2df) {0.0, 0.0}, 0, 2);
+
+  return (__m128d) __builtin_shufflevector(
+                            __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+                                      (__v8df) _mm512_castpd128_pd512(src),
+                                      (__mmask8) __U & 1),
+                            _mm512_undefined_pd(), 0, 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_load_sd (__mmask8 __U, const double* __A)
+{
+  return (__m128d) __builtin_shufflevector(
+                            __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+                                      (__v8df) _mm512_setzero_pd(),
+                                      (__mmask8) __U & 1),
+                            _mm512_undefined_pd(), 0, 1);
+}
+
+#define _mm512_shuffle_epi32(A, I) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+                                   (__v16si)_mm512_undefined_epi32(), \
+                                   0  + (((I) >> 0) & 0x3), \
+                                   0  + (((I) >> 2) & 0x3), \
+                                   0  + (((I) >> 4) & 0x3), \
+                                   0  + (((I) >> 6) & 0x3), \
+                                   4  + (((I) >> 0) & 0x3), \
+                                   4  + (((I) >> 2) & 0x3), \
+                                   4  + (((I) >> 4) & 0x3), \
+                                   4  + (((I) >> 6) & 0x3), \
+                                   8  + (((I) >> 0) & 0x3), \
+                                   8  + (((I) >> 2) & 0x3), \
+                                   8  + (((I) >> 4) & 0x3), \
+                                   8  + (((I) >> 6) & 0x3), \
+                                   12 + (((I) >> 0) & 0x3), \
+                                   12 + (((I) >> 2) & 0x3), \
+                                   12 + (((I) >> 4) & 0x3), \
+                                   12 + (((I) >> 6) & 0x3)); })
+
+#define _mm512_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_epi32((A), (I)), \
+                                      (__v16si)(__m512i)(W)); })
+
+#define _mm512_maskz_shuffle_epi32(U, A, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_epi32((A), (I)), \
+                                      (__v16si)_mm512_setzero_si512()); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+                (__v8df) _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+                (__v8di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+                (__v8di) _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+              (__v8df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+              (__v8df) _mm512_setzero_pd(),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+              (__v8di) _mm512_setzero_pd(),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+                   (__v16sf) _mm512_setzero_ps(),
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+              (__v16si) _mm512_setzero_ps(),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_setzero_ps(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+                (__v16si) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+                (__v16si) _mm512_setzero_ps(),
+                (__mmask16) __U);
+}
+
+#define _mm512_cvt_roundps_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtps_pd (__m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df)
+                _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtpslo_pd (__m512 __A)
+{
+  return (__m512) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
+{
+  return (__m512) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+              (__v8df) __A,
+              (__v8df) __W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+              (__v8df) __A,
+              (__v8df) _mm512_setzero_pd ());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+             (__v16sf) __A,
+             (__v16sf) __W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+             (__v16sf) __A,
+             (__v16sf) _mm512_setzero_ps ());
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
+            (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
+            (__mmask16) __U);
+}
+
+#define _mm_cvt_roundsd_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v4sf)_mm_undefined_ps(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v4sf)(__m128)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
+{
+  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
+                                             (__v2df)(__B),
+                                             (__v4sf)(__W), 
+                                             (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
+{
+  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
+                                             (__v2df)(__B),
+                                             (__v4sf)_mm_setzero_ps(), 
+                                             (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtss_i32 _mm_cvtss_si32
+#define _mm_cvtsd_i32 _mm_cvtsd_si32
+#define _mm_cvti32_sd _mm_cvtsi32_sd
+#define _mm_cvti32_ss _mm_cvtsi32_ss
+#ifdef __x86_64__
+#define _mm_cvtss_i64 _mm_cvtss_si64
+#define _mm_cvtsd_i64 _mm_cvtsd_si64
+#define _mm_cvti64_sd _mm_cvtsi64_sd
+#define _mm_cvti64_ss _mm_cvtsi64_ss
+#endif
+
+#ifdef __x86_64__
+#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
+                                     (int)(R)); })
+
+#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
+                                     (int)(R)); })
+#endif
+
+#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+
+#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
+                                    (int)(R)); })
+
+#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
+                                    (int)(R)); })
+#endif
+
+#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v2df)_mm_undefined_pd(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v2df)(__m128d)(W), \
+                                              (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_cvt_roundss_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
+{
+  return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
+                                              (__v4sf)(__B),
+                                              (__v2df)(__W),
+                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
+{
+  return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
+                                              (__v4sf)(__B),
+                                              (__v2df)_mm_setzero_pd(), 
+                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtu32_sd (__m128d __A, unsigned __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
+                                      (unsigned long long)(B), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
+                                     (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtu32_ss (__m128 __A, unsigned __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
+                                     (unsigned long long)(B), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
+                 __M);
+}
+
+#ifdef __x86_64__
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
+                 __M);
+}
+#endif
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi32 (int __A, int __B, int __C, int __D,
+     int __E, int __F, int __G, int __H,
+     int __I, int __J, int __K, int __L,
+     int __M, int __N, int __O, int __P)
+{
+  return __extension__ (__m512i)(__v16si)
+  { __P, __O, __N, __M, __L, __K, __J, __I,
+    __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,           \
+       e8,e9,e10,e11,e12,e13,e14,e15)          \
+  _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
+                   (e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi64 (long long __A, long long __B, long long __C,
+     long long __D, long long __E, long long __F,
+     long long __G, long long __H)
+{
+  return __extension__ (__m512i) (__v8di)
+  { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)           \
+  _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_set_pd (double __A, double __B, double __C, double __D,
+        double __E, double __F, double __G, double __H)
+{
+  return __extension__ (__m512d)
+  { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)              \
+  _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_set_ps (float __A, float __B, float __C, float __D,
+        float __E, float __F, float __G, float __H,
+        float __I, float __J, float __K, float __L,
+        float __M, float __N, float __O, float __P)
+{
+  return __extension__ (__m512)
+  { __P, __O, __N, __M, __L, __K, __J, __I,
+    __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
+  _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
+                (e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_abs_ps(__m512 __A)
+{
+  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
+{
+  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_abs_pd(__m512d __A)
+{
+  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
+{
+  return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
+}
+
+// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
+// outputs. This class of vector operation forms the basis of many scientific
+// computations. In vector-reduction arithmetic, the evaluation off is
+// independent of the order of the input elements of V.
+
+// Used bisection method. At each step, we partition the vector with previous
+// step in half, and the operation is performed on its two halves.
+// This takes log2(n) steps where n is the number of elements in the vector.
+
+// Vec512 - Vector with size 512.
+// Operator - Can be one of following: +,*,&,|
+// T2  - Can get 'i' for int and 'f' for float.
+// T1 - Can get 'i' for int and 'd' for double.
+
+#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1)         \
+  __extension__({                                                      \
+    __m256##T1 Vec256 = __builtin_shufflevector(                       \
+                            (__v8d##T2)Vec512,                         \
+                            (__v8d##T2)Vec512,                         \
+                            0, 1, 2, 3)                                \
+                        Operator                                       \
+                        __builtin_shufflevector(                       \
+                            (__v8d##T2)Vec512,                         \
+                            (__v8d##T2)Vec512,                         \
+                            4, 5, 6, 7);                               \
+    __m128##T1 Vec128 = __builtin_shufflevector(                       \
+                            (__v4d##T2)Vec256,                         \
+                            (__v4d##T2)Vec256,                         \
+                            0, 1)                                      \
+                        Operator                                       \
+                        __builtin_shufflevector(                       \
+                            (__v4d##T2)Vec256,                         \
+                            (__v4d##T2)Vec256,                         \
+                            2, 3);                                     \
+    Vec128 = __builtin_shufflevector((__v2d##T2)Vec128,                \
+                                     (__v2d##T2)Vec128, 0, -1)         \
+             Operator                                                  \
+             __builtin_shufflevector((__v2d##T2)Vec128,                \
+                                     (__v2d##T2)Vec128, 1, -1);        \
+    return Vec128[0];                                                  \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) {
+  _mm512_reduce_operator_64bit(__W, +, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) {
+  _mm512_reduce_operator_64bit(__W, *, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) {
+  _mm512_reduce_operator_64bit(__W, &, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) {
+  _mm512_reduce_operator_64bit(__W, |, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) {
+  _mm512_reduce_operator_64bit(__W, +, f, d);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
+  _mm512_reduce_operator_64bit(__W, *, f, d);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - All vector elements set to the identity element. 
+// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
+// Operator - Can be one of following: +,*,&,|
+// Mask - Intrinsic Mask
+// T2  - Can get 'i' for int and 'f' for float.
+// T1 - Can get 'i' for int and 'd' for packed double-precision.
+// T3 - Can be Pd for packed double or q for q-word.
+
+#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator,     \
+                                          Mask, T2, T1, T3)                    \
+  __extension__({                                                              \
+    Vec512 = __builtin_ia32_select##T3##_512(                                  \
+                 (__mmask8)Mask,                                               \
+                 (__v8d##T2)Vec512,                                            \
+                 (__v8d##T2)Vec512Neutral);                                    \
+    _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1);                    \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), 
+                                    &, __M,  i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M, 
+                                    i, i, q);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M, 
+                                    f, d, pd);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M,
+                                    f, d, pd);
+}
+
+// Vec512 - Vector with size 512.
+// Operator - Can be one of following: +,*,&,|
+// T2 - Can get 'i' for int and ' ' for packed single.
+// T1 - Can get 'i' for int and 'f' for float.
+
+#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \
+    __m256##T1 Vec256 =                                                        \
+            (__m256##T1)(__builtin_shufflevector(                              \
+                                    (__v16s##T2)Vec512,                        \
+                                    (__v16s##T2)Vec512,                        \
+                                    0, 1, 2, 3, 4, 5, 6, 7)                    \
+                                Operator                                       \
+                         __builtin_shufflevector(                              \
+                                    (__v16s##T2)Vec512,                        \
+                                    (__v16s##T2)Vec512,                        \
+                                    8, 9, 10, 11, 12, 13, 14, 15));            \
+    __m128##T1 Vec128 =                                                        \
+             (__m128##T1)(__builtin_shufflevector(                             \
+                                    (__v8s##T2)Vec256,                         \
+                                    (__v8s##T2)Vec256,                         \
+                                    0, 1, 2, 3)                                \
+                                Operator                                       \
+                          __builtin_shufflevector(                             \
+                                    (__v8s##T2)Vec256,                         \
+                                    (__v8s##T2)Vec256,                         \
+                                    4, 5, 6, 7));                              \
+    Vec128 = (__m128##T1)(__builtin_shufflevector(                             \
+                                    (__v4s##T2)Vec128,                         \
+                                    (__v4s##T2)Vec128,                         \
+                                    0, 1, -1, -1)                              \
+                                Operator                                       \
+                          __builtin_shufflevector(                             \
+                                    (__v4s##T2)Vec128,                         \
+                                    (__v4s##T2)Vec128,                         \
+                                    2, 3, -1, -1));                            \
+    Vec128 = (__m128##T1)(__builtin_shufflevector(                             \
+                                    (__v4s##T2)Vec128,                         \
+                                    (__v4s##T2)Vec128,                         \
+                                    0, -1, -1, -1)                             \
+                                Operator                                       \
+                          __builtin_shufflevector(                             \
+                                    (__v4s##T2)Vec128,                         \
+                                    (__v4s##T2)Vec128,                         \
+                                    1, -1, -1, -1));                           \
+    return Vec128[0];                                                          \
+  })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_add_epi32(__m512i __W) {
+  _mm512_reduce_operator_32bit(__W, +, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS 
+_mm512_reduce_mul_epi32(__m512i __W) {
+  _mm512_reduce_operator_32bit(__W, *, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS 
+_mm512_reduce_and_epi32(__m512i __W) {
+  _mm512_reduce_operator_32bit(__W, &, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS 
+_mm512_reduce_or_epi32(__m512i __W) {
+  _mm512_reduce_operator_32bit(__W, |, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_add_ps(__m512 __W) {
+  _mm512_reduce_operator_32bit(__W, +, f, );
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_mul_ps(__m512 __W) {
+  _mm512_reduce_operator_32bit(__W, *, f, );
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - All vector elements set to the identity element. 
+// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
+// Operator - Can be one of following: +,*,&,|
+// Mask - Intrinsic Mask
+// T2  - Can get 'i' for int and 'f' for float.
+// T1 - Can get 'i' for int and 'd' for double.
+// T3 - Can be Ps for packed single or d for d-word.
+
+#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator,     \
+                                          Mask, T2, T1, T3)                    \
+  __extension__({                                                              \
+    Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512(                      \
+                             (__mmask16)Mask,                                  \
+                             (__v16s##T2)Vec512,                               \
+                             (__v16s##T2)Vec512Neutral);                       \
+    _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1);                    \
+  })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M, 
+                                    i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
+}
+
+// Used bisection method. At each step, we partition the vector with previous
+// step in half, and the operation is performed on its two halves.
+// This takes log2(n) steps where n is the number of elements in the vector.
+// This macro uses only intrinsics from the AVX512F feature.
+
+// Vec512 - Vector with size of 512.
+// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
+//              __mm512_max_epi64
+// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
+// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
+
+#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
+        Vec512 = _mm512_##IntrinName(                                          \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 0, 1, 2, 3, -1, -1, -1, -1),  \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 4, 5, 6, 7, -1, -1, -1, -1)); \
+        Vec512 = _mm512_##IntrinName(                                          \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 0, 1, -1, -1, -1, -1, -1, -1),\
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 2, 3, -1, -1, -1, -1, -1,     \
+                                                 -1));                         \
+        Vec512 = _mm512_##IntrinName(                                          \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                0, -1, -1, -1, -1, -1, -1, -1),\
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                1, -1, -1, -1, -1, -1, -1, -1))\
+                                                ;                              \
+    return Vec512[0];                                                          \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS 
+_mm512_reduce_max_epi64(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu64(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS 
+_mm512_reduce_max_pd(__m512d __V) {
+  _mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64
+(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu64(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS 
+_mm512_reduce_min_pd(__m512d __V) {
+  _mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512 length vector with elements set to the identity element
+// Identity element: {max_epi,0x8000000000000000}
+//                   {max_epu,0x0000000000000000}
+//                   {max_pd, 0xFFF0000000000000}
+//                   {min_epi,0x7FFFFFFFFFFFFFFF}
+//                   {min_epu,0xFFFFFFFFFFFFFFFF}
+//                   {min_pd, 0x7FF0000000000000}
+//
+// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
+//              __mm512_max_epi64
+// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
+// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
+// T3 - Can get 'q' q word and 'pd' for packed double.
+//      [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
+                                        T2, T3, Mask)                          \
+  __extension__({                                                              \
+    Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512(                      \
+                             (__mmask8)Mask,                                   \
+                             (__v8d##T2)Vec512,                                \
+                             (__v8d##T2)Vec512Neutral);                        \
+    _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2);                    \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
+                                  max_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
+                                  max_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()),
+                                  max_pd, d, f, pd, __M);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+                                  min_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+                                  min_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
+                                  min_pd, d, f, pd, __M);
+}
+
+// Vec512 - Vector with size 512.
+// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
+//              __mm512_max_epi32
+// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
+// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
+
+#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0, 1, 2, 3, 4, 5, 6, 7,                      \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  8, 9, 10, 11, 12, 13, 14, 15,                \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0, 1, 2, 3, -1, -1, -1, -1,                  \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  4, 5, 6, 7, -1, -1, -1, -1,                  \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0, 1, -1, -1, -1, -1, -1, -1,                \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  2, 3, -1, -1, -1, -1, -1, -1,                \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0,  -1, -1, -1, -1, -1, -1, -1,              \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  1, -1, -1, -1, -1, -1, -1, -1,               \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    return Vec512[0];                                                          \
+  })
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, max_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, max_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) {
+  _mm512_reduce_maxMin_32bit(a, max_ps, , f);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, min_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, min_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) {
+  _mm512_reduce_maxMin_32bit(a, min_ps, , f);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512 length vector with elements set to the identity element
+// Identity element: {max_epi,0x80000000}
+//                   {max_epu,0x00000000}
+//                   {max_ps, 0xFF800000}
+//                   {min_epi,0x7FFFFFFF}
+//                   {min_epu,0xFFFFFFFF}
+//                   {min_ps, 0x7F800000}
+//
+// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
+//              __mm512_max_epi32
+// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
+// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
+// T3 - Can get 'q' q word and 'pd' for packed double.
+//      [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
+                                        T2, T3, Mask)                          \
+  __extension__({                                                              \
+    Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512(                      \
+                                        (__mmask16)Mask,                       \
+                                        (__v16s##T2)Vec512,                    \
+                                        (__v16s##T2)Vec512Neutral);            \
+   _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2);                     \
+   })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
+                                  i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32,
+                                  i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V,-_mm512_set1_ps(__builtin_inff()), max_ps, , f,
+                                  ps, __M);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32,
+                                  i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32,
+                                  i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f,
+                                  ps, __M);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __AVX512FINTRIN_H
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512ifmaintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512ifmaintrin.h
new file mode 100644
index 000000000..5defbaea8
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512ifmaintrin.h
@@ -0,0 +1,92 @@
+/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAINTRIN_H
+#define __IFMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512ifmavlintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512ifmavlintrin.h
new file mode 100644
index 000000000..131ee5cb4
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512ifmavlintrin.h
@@ -0,0 +1,149 @@
+/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAVLINTRIN_H
+#define __IFMAVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
+
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512pfintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512pfintrin.h
new file mode 100644
index 000000000..c7fa3cf31
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512pfintrin.h
@@ -0,0 +1,111 @@
+/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512PFINTRIN_H
+#define __AVX512PFINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
+
+#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfdps((__mmask16)(mask), \
+                             (__v16si)(__m512i)(index), (int const *)(addr), \
+                             (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfdps((__mmask16) -1, \
+                             (__v16si)(__m512i)(index), (int const *)(addr), \
+                             (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)(mask), \
+                              (__v16si)(__m512i)(index), (int *)(addr), \
+                              (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vbmiintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vbmiintrin.h
new file mode 100644
index 000000000..837238eda
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vbmiintrin.h
@@ -0,0 +1,137 @@
+/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIINTRIN_H
+#define __VBMIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
+         __mmask64 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
+              (__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
+               /* idx */ ,
+               (__v64qi) __A,
+               (__v64qi) __B,
+               (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_undefined_epi32 (),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_setzero_si512(),
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) __W,
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) __W,
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_setzero_si512 (),
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_undefined_epi32 (),
+                (__mmask64) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vbmivlintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vbmivlintrin.h
new file mode 100644
index 000000000..105c6d142
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vbmivlintrin.h
@@ -0,0 +1,247 @@
+/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIVLINTRIN_H
+#define __VBMIVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
+              (__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
+         __mmask32 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
+              (__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __A,
+              (__v16qi) __B,
+              (__mmask16) -
+              1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __A,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
+               /* idx */ ,
+               (__v16qi) __A,
+               (__v16qi) __B,
+               (__mmask16)
+               __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __A,
+              (__v32qi) __B,
+              (__mmask32) -
+              1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __A,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
+               /* idx */ ,
+               (__v32qi) __A,
+               (__v32qi) __B,
+               (__mmask32)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_undefined_si128 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_setzero_si128 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) __W,
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_undefined_si256 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_setzero_si256 (),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) __W,
+                 (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi) __W,
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_setzero_si128 (),
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_undefined_si128 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi) __W,
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_setzero_si256 (),
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_undefined_si256 (),
+                (__mmask32) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vlbwintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vlbwintrin.h
new file mode 100644
index 000000000..3b58d0433
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vlbwintrin.h
@@ -0,0 +1,3172 @@
+/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBWINTRIN_H
+#define __AVX512VLBWINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw")))
+
+static  __inline __m128i __DEFAULT_FN_ATTRS
+_mm_setzero_hi(void){
+    return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/* Integer compare */
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_add_epi8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_add_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_add_epi16(__A, __B),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_add_epi16(__A, __B),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_sub_epi8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_sub_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_sub_epi16(__A, __B),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_sub_epi16(__A, __B),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_add_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_add_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_add_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_add_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_sub_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_sub_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sub_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sub_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_mullo_epi16(__A, __B),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_mullo_epi16(__A, __B),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mullo_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mullo_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+              (__v16qi) __W,
+              (__v16qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+               (__v32qi) __W,
+               (__v32qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+               (__v8hi) __W,
+               (__v8hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+               (__v16hi) __W,
+               (__v16hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_abs_epi8(__A),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_abs_epi8(__A),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_abs_epi8(__A),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_abs_epi8(__A),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_abs_epi16(__A),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_abs_epi16(__A),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_abs_epi16(__A),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_abs_epi16(__A),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packs_epi32(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packs_epi32(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                          (__v16hi)_mm256_packs_epi32(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                          (__v16hi)_mm256_packs_epi32(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_packs_epi16(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_packs_epi16(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                          (__v32qi)_mm256_packs_epi16(__A, __B),
+                                          (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                          (__v32qi)_mm256_packs_epi16(__A, __B),
+                                          (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packus_epi32(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packus_epi32(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                         (__v16hi)_mm256_packus_epi32(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                         (__v16hi)_mm256_packus_epi32(__A, __B),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                            (__v16qi)_mm_packus_epi16(__A, __B),
+                                            (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                            (__v16qi)_mm_packus_epi16(__A, __B),
+                                            (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                         (__v32qi)_mm256_packus_epi16(__A, __B),
+                                         (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                         (__v32qi)_mm256_packus_epi16(__A, __B),
+                                         (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epi8(__A, __B),
+                                            (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epi8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epu8(__A, __B),
+                                            (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epu8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epu16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epu16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_avg_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_avg_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_avg_epu8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_avg_epu8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_avg_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_avg_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                            (__v16hi)_mm256_avg_epu16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                            (__v16hi)_mm256_avg_epu16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epi8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epi16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epi16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epu8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epu8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epu16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epu16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epi8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epi16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epi16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epu8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epu8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epu16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epu16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                            (__v16qi)_mm_shuffle_epi8(__A, __B),
+                                            (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                            (__v16qi)_mm_shuffle_epi8(__A, __B),
+                                            (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                         (__v32qi)_mm256_shuffle_epi8(__A, __B),
+                                         (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                         (__v32qi)_mm256_shuffle_epi8(__A, __B),
+                                         (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epi8(__A, __B),
+                                            (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epi8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epu8(__A, __B),
+                                            (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epu8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epu16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epu16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+               (__v8hi) __I /* idx */ ,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I,
+         __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+               (__v16hi) __I /* idx */ ,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                            (__v8hi)_mm_maddubs_epi16(__X, __Y),
+                                            (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                            (__v8hi)_mm_maddubs_epi16(__X, __Y),
+                                            (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
+                          __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                        (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+                                        (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                        (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+                                        (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_madd_epi16(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_madd_epi16(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_madd_epi16(__A, __B),
+                                            (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_madd_epi16(__A, __B),
+                                            (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi8 (__m128i __A) {
+
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epu16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epu16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epi16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpackhi_epi8(__A, __B),
+                                           (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpackhi_epi8(__A, __B),
+                                           (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpackhi_epi8(__A, __B),
+                                        (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpackhi_epi8(__A, __B),
+                                        (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpackhi_epi16(__A, __B),
+                                           (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpackhi_epi16(__A, __B),
+                                           (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpackhi_epi16(__A, __B),
+                                       (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpackhi_epi16(__A, __B),
+                                       (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpacklo_epi8(__A, __B),
+                                           (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpacklo_epi8(__A, __B),
+                                           (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpacklo_epi8(__A, __B),
+                                        (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpacklo_epi8(__A, __B),
+                                        (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpacklo_epi16(__A, __B),
+                                           (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpacklo_epi16(__A, __B),
+                                           (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpacklo_epi16(__A, __B),
+                                       (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpacklo_epi16(__A, __B),
+                                       (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepi8_epi16(__A),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepi8_epi16(__A),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepu8_epi16(__A),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepu8_epi16(__A),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+
+#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), (int)(p), \
+                                         (__mmask16)-1); })
+
+#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), (int)(p), \
+                                         (__mmask16)(m)); })
+
+#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), (int)(p), \
+                                          (__mmask16)-1); })
+
+#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), (int)(p), \
+                                          (__mmask16)(m)); })
+
+#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), (int)(p), \
+                                         (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), (int)(p), \
+                                         (__mmask32)(m)); })
+
+#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), (int)(p), \
+                                          (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), (int)(p), \
+                                          (__mmask32)(m)); })
+
+#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), (int)(p), \
+                                         (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), (int)(p), \
+                                         (__mmask16)(m)); })
+
+#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), (int)(p), \
+                                          (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), (int)(p), \
+                                          (__mmask16)(m)); })
+
+#define _mm_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
+                                      (__v8hi)(__m128i)(W)); })
+
+#define _mm_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
+                                      (__v8hi)_mm_setzero_hi()); })
+
+#define _mm256_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
+                                      (__v16hi)(__m256i)(W)); })
+
+#define _mm256_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
+                                      (__v16hi)_mm256_setzero_si256()); })
+
+#define _mm_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
+                                      (__v8hi)(__m128i)(W)); })
+
+#define _mm_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
+                                      (__v8hi)_mm_setzero_hi()); })
+
+#define _mm256_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v16hi)(__m256i)(W)); })
+
+#define _mm256_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v16hi)_mm256_setzero_si256()); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi16(__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_sllv_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_sllv_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sllv_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sllv_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sll_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sll_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sll_epi16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sll_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_slli_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_slli_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_slli_epi16(__A, __B),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_slli_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi16(__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srlv_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srlv_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srlv_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srlv_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi16(__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srav_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srav_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srav_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srav_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sra_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sra_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sra_epi16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sra_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srai_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srai_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srai_epi16(__A, __B),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srai_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srl_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srl_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_srl_epi16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_srl_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srli_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srli_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srli_epi16(__A, __B),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srli_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+                (__v8hi) __A,
+                (__v8hi) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+                (__v8hi) __A,
+                (__v8hi) _mm_setzero_hi ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+                (__v16hi) __A,
+                (__v16hi) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+                (__v16hi) __A,
+                (__v16hi) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+                (__v16qi) __A,
+                (__v16qi) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+                (__v16qi) __A,
+                (__v16qi) _mm_setzero_hi ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+                (__v32qi) __A,
+                (__v32qi) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+                (__v32qi) __A,
+                (__v32qi) _mm256_setzero_si256 ());
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+                 (__v16qi) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+                 (__v16qi)
+                 _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+                 (__v32qi) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+                 (__v32qi)
+                 _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
+                 (__v8hi) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
+                 (__v8hi)
+                 _mm_setzero_hi (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
+                 (__v16hi) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
+                 (__v16hi)
+                 _mm256_setzero_si256 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
+                 (__v16qi) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
+                 (__v16qi)
+                 _mm_setzero_si128 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
+                 (__v32qi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
+                 (__v32qi)
+                 _mm256_setzero_si256 (),
+                 (__mmask32) __U);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedquhi128_mask ((__v8hi *) __P,
+             (__v8hi) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
+{
+  __builtin_ia32_storedquhi256_mask ((__v16hi *) __P,
+             (__v16hi) __A,
+             (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
+{
+  __builtin_ia32_storedquqi128_mask ((__v16qi *) __P,
+             (__v16qi) __A,
+             (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
+{
+  __builtin_ia32_storedquqi256_mask ((__v32qi *) __P,
+             (__v32qi) __A,
+             (__mmask32) __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_test_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+            (__v16qi) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+            (__v16qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_test_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+            (__v32qi) __B,
+            (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+            (__v32qi) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                 (__v8hi) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                 (__v8hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_test_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+            (__v16hi) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+            (__v16hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_testn_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+             (__v16qi) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+             (__v16qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+             (__v32qi) __B,
+             (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+             (__v32qi) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+            (__v8hi) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+            (__v8hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+             (__v16hi) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+             (__v16hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_movepi8_mask (__m128i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_movepi8_mask (__m256i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi16_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_movepi16_mask (__m256i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi8 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi8 (__mmask32 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi16 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi16 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128(__M,
+                                             (__v16qi) _mm_broadcastb_epi8(__A),
+                                             (__v16qi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128(__M,
+                                             (__v16qi) _mm_broadcastb_epi8(__A),
+                                             (__v16qi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256(__M,
+                                             (__v32qi) _mm256_broadcastb_epi8(__A),
+                                             (__v32qi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256(__M,
+                                             (__v32qi) _mm256_broadcastb_epi8(__A),
+                                             (__v32qi) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128(__M,
+                                             (__v8hi) _mm_broadcastw_epi16(__A),
+                                             (__v8hi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128(__M,
+                                             (__v8hi) _mm_broadcastw_epi16(__A),
+                                             (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256(__M,
+                                             (__v16hi) _mm256_broadcastw_epi16(__A),
+                                             (__v16hi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256(__M,
+                                             (__v16hi) _mm256_broadcastw_epi16(__A),
+                                             (__v16hi) _mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                 (__v16hi) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                 (__v16hi) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                 (__v8hi) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                 (__v8hi) _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) _mm_undefined_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) _mm_setzero_si128 (),
+                 (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) __W,
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) _mm256_undefined_si256 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) _mm256_setzero_si256 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) __W,
+                 (__mmask16) __M);
+}
+
+#define _mm_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+                                 (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+                                 (__v16qi)(__m128i)(W)); })
+
+#define _mm_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+                                 (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+                                 (__v16qi)_mm_setzero_si128()); })
+
+#define _mm256_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+                              (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+                              (__v32qi)(__m256i)(W)); })
+
+#define _mm256_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+                              (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+                              (__v32qi)_mm256_setzero_si256()); })
+
+#define _mm_dbsad_epu8(A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
+                                           (__v16qi)(__m128i)(B), (int)(imm), \
+                                           (__v8hi)_mm_setzero_hi(), \
+                                           (__mmask8)-1); })
+
+#define _mm_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
+                                           (__v16qi)(__m128i)(B), (int)(imm), \
+                                           (__v8hi)(__m128i)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
+                                           (__v16qi)(__m128i)(B), (int)(imm), \
+                                           (__v8hi)_mm_setzero_si128(), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_dbsad_epu8(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
+                                           (__v32qi)(__m256i)(B), (int)(imm), \
+                                           (__v16hi)_mm256_setzero_si256(), \
+                                           (__mmask16)-1); })
+
+#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
+                                           (__v32qi)(__m256i)(B), (int)(imm), \
+                                           (__v16hi)(__m256i)(W), \
+                                           (__mmask16)(U)); })
+
+#define _mm256_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
+                                           (__v32qi)(__m256i)(B), (int)(imm), \
+                                           (__v16hi)_mm256_setzero_si256(), \
+                                           (__mmask16)(U)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLBWINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vlcdintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vlcdintrin.h
new file mode 100644
index 000000000..7b02e2e1f
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vlcdintrin.h
@@ -0,0 +1,263 @@
+/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLCDINTRIN_H
+#define __AVX512VLCDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di)
+               _mm_setzero_di (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di)  _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLCDINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vldqintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vldqintrin.h
new file mode 100644
index 000000000..cd9da4370
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vldqintrin.h
@@ -0,0 +1,1200 @@
+/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLDQINTRIN_H
+#define __AVX512VLDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq")))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) ((__v4du) __A * (__v4du) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_mullo_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_mullo_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) ((__v2du) __A * (__v2du) __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_mullo_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_mullo_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_andnot_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_andnot_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_andnot_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_andnot_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_andnot_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_andnot_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_andnot_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_andnot_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_and_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_and_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_and_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_and_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_and_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_and_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_and_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_and_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_xor_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_xor_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_xor_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_xor_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_xor_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_xor_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_xor_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_xor_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_or_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_or_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_or_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_or_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_or_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_or_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_or_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_or_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+#define _mm_range_pd(A, B, C) __extension__ ({                         \
+  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(C), \
+                                          (__v2df)_mm_setzero_pd(), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_range_pd(W, U, A, B, C) __extension__ ({          \
+  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(C), \
+                                          (__v2df)(__m128d)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm_maskz_range_pd(U, A, B, C) __extension__ ({              \
+  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(C), \
+                                          (__v2df)_mm_setzero_pd(), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_range_pd(A, B, C) __extension__ ({                      \
+  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+                                          (__v4df)(__m256d)(B), (int)(C), \
+                                          (__v4df)_mm256_setzero_pd(), \
+                                          (__mmask8)-1); })
+
+#define _mm256_mask_range_pd(W, U, A, B, C) __extension__ ({       \
+  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+                                          (__v4df)(__m256d)(B), (int)(C), \
+                                          (__v4df)(__m256d)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_range_pd(U, A, B, C) __extension__ ({           \
+  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+                                          (__v4df)(__m256d)(B), (int)(C), \
+                                          (__v4df)_mm256_setzero_pd(), \
+                                          (__mmask8)(U)); })
+
+#define _mm_range_ps(A, B, C) __extension__ ({                         \
+  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), (int)(C), \
+                                         (__v4sf)_mm_setzero_ps(), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_range_ps(W, U, A, B, C) __extension__ ({          \
+  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), (int)(C), \
+                                         (__v4sf)(__m128)(W), (__mmask8)(U)); })
+
+#define _mm_maskz_range_ps(U, A, B, C) __extension__ ({              \
+  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), (int)(C), \
+                                         (__v4sf)_mm_setzero_ps(), \
+                                         (__mmask8)(U)); })
+
+#define _mm256_range_ps(A, B, C) __extension__ ({                      \
+  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+                                         (__v8sf)(__m256)(B), (int)(C), \
+                                         (__v8sf)_mm256_setzero_ps(), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_range_ps(W, U, A, B, C) __extension__ ({       \
+  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+                                         (__v8sf)(__m256)(B), (int)(C), \
+                                         (__v8sf)(__m256)(W), (__mmask8)(U)); })
+
+#define _mm256_maskz_range_ps(U, A, B, C) __extension__ ({           \
+  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+                                         (__v8sf)(__m256)(B), (int)(C), \
+                                         (__v8sf)_mm256_setzero_ps(), \
+                                         (__mmask8)(U)); })
+
+#define _mm_reduce_pd(A, B) __extension__ ({                \
+  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1); })
+
+#define _mm_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm_maskz_reduce_pd(U, A, B) __extension__ ({     \
+  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_reduce_pd(A, B) __extension__ ({                \
+  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+                                           (__v4df)_mm256_setzero_pd(), \
+                                           (__mmask8)-1); })
+
+#define _mm256_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+                                           (__v4df)(__m256d)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_maskz_reduce_pd(U, A, B) __extension__ ({     \
+  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+                                           (__v4df)_mm256_setzero_pd(), \
+                                           (__mmask8)(U)); })
+
+#define _mm_reduce_ps(A, B) __extension__ ({                   \
+  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_reduce_ps(W, U, A, B) __extension__ ({    \
+  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+                                          (__v4sf)(__m128)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm_maskz_reduce_ps(U, A, B) __extension__ ({        \
+  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_reduce_ps(A, B) __extension__ ({                \
+  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1); })
+
+#define _mm256_mask_reduce_ps(W, U, A, B) __extension__ ({ \
+  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+                                          (__v8sf)(__m256)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_reduce_ps(U, A, B) __extension__ ({     \
+  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)(U)); })
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi32_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_movepi32_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi32 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi32 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi64_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_movepi64_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_f32x2 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                (__v8sf)_mm256_undefined_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                (__v8sf) __O,
+                __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                (__v8sf) _mm256_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_f64x2 (__m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
+                 (__v4df)_mm256_undefined_pd(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
+                 (__v4df) __O,
+                 __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
+                 (__v4df) _mm256_setzero_ps (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcast_i32x2 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
+                 (__v4si)_mm_undefined_si128(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
+                 (__v4si) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
+                 (__v4si) _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i32x2 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
+                 (__v8si)_mm256_undefined_si256(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
+                 (__v8si) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i64x2 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
+                 (__v4di)_mm256_undefined_si256(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
+                 (__v4di) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
+                 (__v4di) _mm256_setzero_si256 (),
+                 __M);
+}
+
+#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A),           \
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   ((imm) & 1) ? 2 : 0,           \
+                                   ((imm) & 1) ? 3 : 1); })
+
+#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                   (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
+                                   (__v2df)(W)); })
+
+#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                   (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
+                                   (__v2df)_mm_setzero_pd()); })
+
+#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A),             \
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   ((imm) & 1) ? 2 : 0,              \
+                                   ((imm) & 1) ? 3 : 1); })
+
+#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
+                                (__v2di)(W)); })
+
+#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
+                                (__v2di)_mm_setzero_di()); })
+
+#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(A), \
+                                 (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
+                                 ((imm) & 0x1) ? 0 : 4, \
+                                 ((imm) & 0x1) ? 1 : 5, \
+                                 ((imm) & 0x1) ? 4 : 2, \
+                                 ((imm) & 0x1) ? 5 : 3); })
+
+#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                  (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+                                  (__v4df)(W)); })
+
+#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                  (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+                                  (__v4df)_mm256_setzero_pd()); })
+
+#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(A), \
+                                 (__v4di)_mm256_castsi128_si256((__m128i)(B)), \
+                                 ((imm) & 0x1) ? 0 : 4, \
+                                 ((imm) & 0x1) ? 1 : 5, \
+                                 ((imm) & 0x1) ? 4 : 2, \
+                                 ((imm) & 0x1) ? 5 : 3); })
+
+#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                  (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+                                  (__v4di)(W)); })
+
+#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                  (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+                                  (__v4di)_mm256_setzero_si256()); })
+
+#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm_fpclass_pd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_fpclass_pd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm_fpclass_ps_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_fpclass_ps_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vlintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vlintrin.h
new file mode 100644
index 000000000..f3744da6a
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avx512vlintrin.h
@@ -0,0 +1,8932 @@
+/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLINTRIN_H
+#define __AVX512VLINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl")))
+
+/* Doesn't require avx512vl, used in avx512dqintrin.h */
+static  __inline __m128i __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+_mm_setzero_di(void) {
+  return (__m128i)(__v2di){ 0LL, 0LL};
+}
+
+/* Integer compare */
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                __u);
+}
+
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                __u);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_add_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_add_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_add_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_add_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sub_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sub_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sub_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sub_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_add_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_add_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_add_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_add_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sub_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sub_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sub_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sub_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epi32(__X, __Y),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epi32(__X, __Y),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epi32(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epi32(__X, __Y),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epu32(__X, __Y),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epu32(__X, __Y),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epu32(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epu32(__X, __Y),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_mullo_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_mullo_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_mullo_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_mullo_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_and_si256(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_and_si128(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                          (__v8si)_mm256_andnot_si256(__A, __B),
+                                          (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(),
+                                           __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_andnot_si128(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_or_si256(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_or_si128(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_xor_si256(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_xor_si128(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_and_si256(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_and_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                          (__v4di)_mm256_andnot_si256(__A, __B),
+                                          (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(),
+                                           __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_andnot_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_or_si256(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_or_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_xor_si256(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_xor_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+#define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_ps_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+                                         (__v8sf)(__m256)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+                                         (__v8sf)(__m256)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_pd_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm_cmp_ps_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+                                         (__v4sf)(__m128)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+                                         (__v4sf)(__m128)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm_cmp_pd_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    -(__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    -(__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   -(__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       -(__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        -(__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       -(__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        -(__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      -(__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       -(__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
+                         __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      -(__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       -(__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      (__v4df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4sf) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     (__v8sf) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_add_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_add_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_add_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_add_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_add_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_add_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_add_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_add_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+                (__v4si) __W,
+                (__v4si) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+                (__v8si) __W,
+                (__v8si) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
+  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
+                 (__v2df) __W,
+                 (__v2df) __A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
+  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
+                 (__v4df) __W,
+                 (__v4df) __A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
+  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
+                (__v4sf) __W,
+                (__v4sf) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
+  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
+                (__v8sf) __W,
+                (__v8sf) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+                (__v2di) __W,
+                (__v2di) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+                (__v4di) __W,
+                (__v4di) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df)
+                  _mm_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df)
+                  _mm256_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf)
+                 _mm256_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) {
+  __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
+            (__v2df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) {
+  __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
+            (__v4df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
+            (__v2di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
+            (__v4di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) {
+  __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
+            (__v4sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) {
+  __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
+            (__v8sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
+            (__v4si) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
+            (__v8si) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepi32_pd(__A),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepi32_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepi32_pd(__A),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepi32_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu32_pd (__m128i __A) {
+  return (__m128d) __builtin_convertvector(
+      __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepu32_pd(__A),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepu32_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_pd (__m128i __A) {
+  return (__m256d)__builtin_convertvector((__v4su)__A, __v4df);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepu32_pd(__A),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepu32_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu32_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_ps (__m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_div_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_div_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_div_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_div_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_div_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_div_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_div_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_div_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+              (__v2df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+              (__v4df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+              (__v2di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+               (__v2di)
+               _mm_setzero_si128 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+              (__v4di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+               (__v4di)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+                   (__v4sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+                   (__v8sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+              (__v4si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+               (__v4si)
+               _mm_setzero_si128 (),
+               (__mmask8)     __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+              (__v8si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_getexp_pd (__m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_getexp_pd (__m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_getexp_ps (__m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_getexp_ps (__m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_max_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_max_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_max_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_max_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_max_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_max_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_max_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_max_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_min_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_min_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_min_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_min_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_min_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_min_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_min_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_min_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_mul_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_mul_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_mul_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_mul_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_mul_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_mul_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_mul_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_mul_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_abs_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_abs_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask16)__U,
+                                             (__v8si)_mm256_abs_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask16)__U,
+                                             (__v8si)_mm256_abs_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi64 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi64 (__m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epu32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epu32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epu32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epu32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epu32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epu32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epu32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epu32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+#define _mm_roundscale_pd(A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+                                              (int)(imm), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)-1); })
+
+
+#define _mm_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+                                              (int)(imm), \
+                                              (__v2df)(__m128d)(W), \
+                                              (__mmask8)(U)); })
+
+
+#define _mm_maskz_roundscale_pd(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+                                              (int)(imm), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)(U)); })
+
+
+#define _mm256_roundscale_pd(A, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)-1); })
+
+
+#define _mm256_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+                                              (int)(imm), \
+                                              (__v4df)(__m256d)(W), \
+                                              (__mmask8)(U)); })
+
+
+#define _mm256_maskz_roundscale_pd(U, A, imm)  __extension__ ({ \
+  (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)(U)); })
+
+#define _mm_roundscale_ps(A, imm)  __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)-1); })
+
+
+#define _mm_mask_roundscale_ps(W, U, A, imm)  __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__v4sf)(__m128)(W), \
+                                             (__mmask8)(U)); })
+
+
+#define _mm_maskz_roundscale_ps(U, A, imm)  __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_roundscale_ps(A, imm)  __extension__ ({ \
+  (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_roundscale_ps(W, U, A, imm)  __extension__ ({ \
+  (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__v8sf)(__m256)(W), \
+                                             (__mmask8)(U)); })
+
+
+#define _mm256_maskz_roundscale_ps(U, A, imm)  __extension__ ({ \
+  (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)(U)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_scalef_pd (__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_scalef_pd (__m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_scalef_ps (__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_scalef_ps (__m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+#define _mm_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
+
+#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
+
+#define _mm_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
+
+#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
+
+#define _mm256_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
+
+#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
+
+#define _mm_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm256_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({  \
+  __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_i32scatter_pd(addr, index, v1, scale) __extension__ ({      \
+  __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
+
+#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({        \
+  __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
+
+#define _mm_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_i32scatter_pd(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
+
+#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
+
+#define _mm256_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
+
+#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
+
+#define _mm_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+                               (int)(scale)); })
+
+#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+                               (int)(scale)); })
+
+#define _mm256_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8si)(__m256i)(v1), (int)(scale)); })
+
+#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8si)(__m256i)(v1), (int)(scale)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sqrt_pd(__A),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sqrt_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sqrt_pd(__A),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sqrt_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sqrt_ps(__A),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sqrt_ps(__A),
+                                             (__v4sf)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sqrt_ps(__A),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sqrt_ps(__A),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sub_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sub_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sub_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sub_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sub_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sub_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sub_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sub_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A,
+              (__v2di) __I
+              /* idx */ ,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A,
+              (__v4di) __I
+              /* idx */ ,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A,
+                   (__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A,
+                   (__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8) -
+              1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I
+               /* idx */ ,
+               (__v2df) __A,
+               (__v2df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8) -
+              1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I
+               /* idx */ ,
+               (__v4df) __A,
+               (__v4df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I,
+        __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4sf) __A,
+              (__v4sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8sf) __A,
+              (__v8sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I
+              /* idx */ ,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8)
+              __U);
+}
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I
+              /* idx */ ,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi8_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi8_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi8_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi8_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi8_epi64(__A),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi8_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi8_epi64(__A),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi8_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi32_epi64(__X),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi32_epi64(__X),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi32_epi64(__X),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi32_epi64(__X),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi16_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi16_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi16_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi16_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi16_epi64(__A),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi16_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi16_epi64(__A),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi16_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu8_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu8_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu8_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu8_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu8_epi64(__A),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu8_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu8_epi64(__A),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu8_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu32_epi64(__X),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu32_epi64(__X),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu32_epi64(__X),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu32_epi64(__X),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu16_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu16_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu16_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu16_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu16_epi64(__A),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu16_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu16_epi64(__A),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu16_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+
+#define _mm_rol_epi32(a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_rol_epi32(w, u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
+                                        (__v4si)(__m128i)(w), (__mmask8)(u)); })
+
+#define _mm_maskz_rol_epi32(u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)(u)); })
+
+#define _mm256_rol_epi32(a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_rol_epi32(w, u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
+                                        (__v8si)(__m256i)(w), (__mmask8)(u)); })
+
+#define _mm256_maskz_rol_epi32(u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)(u)); })
+
+#define _mm_rol_epi64(a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_rol_epi64(w, u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
+                                        (__v2di)(__m128i)(w), (__mmask8)(u)); })
+
+#define _mm_maskz_rol_epi64(u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)(u)); })
+
+#define _mm256_rol_epi64(a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_rol_epi64(w, u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
+                                        (__v4di)(__m256i)(w), (__mmask8)(u)); })
+
+#define _mm256_maskz_rol_epi64(u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)(u)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rolv_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rolv_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rolv_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rolv_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+#define _mm_ror_epi32(A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_ror_epi32(W, U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                        (__v4si)(__m128i)(W), (__mmask8)(U)); })
+
+#define _mm_maskz_ror_epi32(U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)(U)); })
+
+#define _mm256_ror_epi32(A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_ror_epi32(W, U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                        (__v8si)(__m256i)(W), (__mmask8)(U)); })
+
+#define _mm256_maskz_ror_epi32(U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)(U)); })
+
+#define _mm_ror_epi64(A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_ror_epi64(W, U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                        (__v2di)(__m128i)(W), (__mmask8)(U)); })
+
+#define _mm_maskz_ror_epi64(U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)(U)); })
+
+#define _mm256_ror_epi64(A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_ror_epi64(W, U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                        (__v4di)(__m256i)(W), (__mmask8)(U)); })
+
+#define _mm256_maskz_ror_epi64(U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)(U)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sll_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sll_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sll_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sll_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_slli_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_slli_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_slli_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_slli_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sll_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sll_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sll_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sll_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_slli_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_slli_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_slli_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_slli_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rorv_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rorv_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rorv_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rorv_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sllv_epi64(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sllv_epi64(__X, __Y),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_sllv_epi64(__X, __Y),
+                                            (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_sllv_epi64(__X, __Y),
+                                            (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sllv_epi32(__X, __Y),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sllv_epi32(__X, __Y),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_sllv_epi32(__X, __Y),
+                                            (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_sllv_epi32(__X, __Y),
+                                            (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srlv_epi64(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srlv_epi64(__X, __Y),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_srlv_epi64(__X, __Y),
+                                            (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_srlv_epi64(__X, __Y),
+                                            (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srlv_epi32(__X, __Y),
+                                            (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srlv_epi32(__X, __Y),
+                                            (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srlv_epi32(__X, __Y),
+                                            (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srlv_epi32(__X, __Y),
+                                            (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srl_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srl_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srl_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srl_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srli_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srli_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srli_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srli_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srl_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srl_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srl_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srl_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srli_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srli_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srli_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srli_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srav_epi32(__X, __Y),
+                                            (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srav_epi32(__X, __Y),
+                                            (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srav_epi32(__X, __Y),
+                                            (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srav_epi32(__X, __Y),
+                                            (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srav_epi64(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srav_epi64(__X, __Y),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di) __Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srav_epi64(__X, __Y),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srav_epi64(__X, __Y),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+                 (__v4si) __A,
+                 (__v4si) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+                 (__v4si) __A,
+                 (__v4si) _mm_setzero_si128 ());
+}
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+                 (__v8si) __A,
+                 (__v8si) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+                 (__v8si) __A,
+                 (__v8si) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+              (__v4si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+              (__v8si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_movdqa32store128_mask ((__v4si *) __P,
+          (__v4si) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_movdqa32store256_mask ((__v8si *) __P,
+          (__v8si) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+                 (__v2di) __A,
+                 (__v2di) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+                 (__v2di) __A,
+                 (__v2di) _mm_setzero_di ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+                 (__v4di) __A,
+                 (__v4di) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+                 (__v4di) __A,
+                 (__v4di) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+              (__v2di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+              (__v4di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_movdqa64store128_mask ((__v2di *) __P,
+          (__v2di) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_movdqa64store256_mask ((__v4di *) __P,
+          (__v4di) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_movedup_pd(__A),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_movedup_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_movedup_pd(__A),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_movedup_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+
+#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \
+  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
+                                                  (__v4si)(__m128i)(O), \
+                                                  (__mmask8)(M)); })
+
+#define _mm_maskz_set1_epi32(M, A) __extension__ ({ \
+  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
+                                                  (__v4si)_mm_setzero_si128(), \
+                                                  (__mmask8)(M)); })
+
+#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \
+  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
+                                                  (__v8si)(__m256i)(O), \
+                                                  (__mmask8)(M)); })
+
+#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \
+  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
+                                                  (__v8si)_mm256_setzero_si256(), \
+                                                  (__mmask8)(M)); })
+
+#ifdef __x86_64__
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 __M);
+}
+#endif
+
+#define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2di)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2di)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2di)(__m128i)(C), \
+                                              (int)(imm), (__mmask8)(U)); })
+
+#define _mm256_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
+                                             (__v4df)(__m256d)(B), \
+                                             (__v4di)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
+                                             (__v4df)(__m256d)(B), \
+                                             (__v4di)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \
+                                              (__v4df)(__m256d)(B), \
+                                              (__v4di)(__m256i)(C), \
+                                              (int)(imm), (__mmask8)(U)); })
+
+#define _mm_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4si)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
+                                            (__v8sf)(__m256)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
+                                            (__v8sf)(__m256)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \
+                                             (__v8sf)(__m256)(B), \
+                                             (__v8si)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+               (__v2df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+               (__v4df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+              (__v4sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+              (__v8sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P,
+                 (__v2di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
+                 (__v4di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P,
+               (__v2df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
+               (__v4df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P,
+              (__v4sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
+              (__v8sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storeapd128_mask ((__v2df *) __P,
+           (__v2df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+  __builtin_ia32_storeapd256_mask ((__v4df *) __P,
+           (__v4df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
+           (__v4sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+  __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
+           (__v8sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedqudi128_mask ((__v2di *) __P,
+             (__v2di) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_storedqudi256_mask ((__v4di *) __P,
+             (__v4di) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedqusi128_mask ((__v4si *) __P,
+             (__v4si) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_storedqusi256_mask ((__v8si *) __P,
+             (__v8si) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storeupd128_mask ((__v2df *) __P,
+           (__v2df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+  __builtin_ia32_storeupd256_mask ((__v4df *) __P,
+           (__v4df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storeups128_mask ((__v4sf *) __P,
+           (__v4sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+  __builtin_ia32_storeups256_mask ((__v8sf *) __P,
+           (__v8sf) __A,
+           (__mmask8) __U);
+}
+
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpackhi_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpackhi_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpackhi_pd(__A, __B),
+                                           (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpackhi_pd(__A, __B),
+                                           (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpackhi_ps(__A, __B),
+                                           (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpackhi_ps(__A, __B),
+                                           (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpacklo_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpacklo_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpacklo_pd(__A, __B),
+                                           (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpacklo_pd(__A, __B),
+                                           (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpacklo_ps(__A, __B),
+                                           (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpacklo_ps(__A, __B),
+                                           (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rcp14_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_rcp14_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp14_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_rcp14_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+#define _mm_mask_permute_pd(W, U, X, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_permute_pd((X), (C)), \
+                                       (__v2df)(__m128d)(W)); })
+
+#define _mm_maskz_permute_pd(U, X, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_permute_pd((X), (C)), \
+                                       (__v2df)_mm_setzero_pd()); })
+
+#define _mm256_mask_permute_pd(W, U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permute_pd((X), (C)), \
+                                       (__v4df)(__m256d)(W)); })
+
+#define _mm256_maskz_permute_pd(U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permute_pd((X), (C)), \
+                                       (__v4df)_mm256_setzero_pd()); })
+
+#define _mm_mask_permute_ps(W, U, X, C) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_permute_ps((X), (C)), \
+                                      (__v4sf)(__m128)(W)); })
+
+#define _mm_maskz_permute_ps(U, X, C) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_permute_ps((X), (C)), \
+                                      (__v4sf)_mm_setzero_ps()); })
+
+#define _mm256_mask_permute_ps(W, U, X, C) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_permute_ps((X), (C)), \
+                                      (__v8sf)(__m256)(W)); })
+
+#define _mm256_maskz_permute_ps(U, X, C) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_permute_ps((X), (C)), \
+                                      (__v8sf)_mm256_setzero_ps()); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                            (__v2df)_mm_permutevar_pd(__A, __C),
+                                            (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                            (__v2df)_mm_permutevar_pd(__A, __C),
+                                            (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                         (__v4df)_mm256_permutevar_pd(__A, __C),
+                                         (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                         (__v4df)_mm256_permutevar_pd(__A, __C),
+                                         (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                            (__v4sf)_mm_permutevar_ps(__A, __C),
+                                            (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                            (__v4sf)_mm_permutevar_ps(__A, __C),
+                                            (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                          (__v8sf)_mm256_permutevar_ps(__A, __C),
+                                          (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                          (__v8sf)_mm256_permutevar_ps(__A, __C),
+                                          (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+                 (__v4si) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+                 (__v4si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_test_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+                 (__v8si) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+                 (__v8si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+                 (__v2di) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+                 (__v2di) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_test_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+                 (__v4di) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+                 (__v4di) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+            (__v4si) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+            (__v4si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_testn_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+            (__v8si) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+            (__v8si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+            (__v2di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+            (__v2di) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_testn_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+            (__v4di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+            (__v4di) __B, __U);
+}
+
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                           (__v4si)_mm_unpackhi_epi32(__A, __B),
+                                           (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                           (__v4si)_mm_unpackhi_epi32(__A, __B),
+                                           (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                        (__v8si)_mm256_unpackhi_epi32(__A, __B),
+                                        (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                        (__v8si)_mm256_unpackhi_epi32(__A, __B),
+                                        (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_unpackhi_epi64(__A, __B),
+                                           (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_unpackhi_epi64(__A, __B),
+                                           (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_unpackhi_epi64(__A, __B),
+                                        (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_unpackhi_epi64(__A, __B),
+                                        (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                           (__v4si)_mm_unpacklo_epi32(__A, __B),
+                                           (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                           (__v4si)_mm_unpacklo_epi32(__A, __B),
+                                           (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                        (__v8si)_mm256_unpacklo_epi32(__A, __B),
+                                        (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                        (__v8si)_mm256_unpacklo_epi32(__A, __B),
+                                        (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_unpacklo_epi64(__A, __B),
+                                           (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_unpacklo_epi64(__A, __B),
+                                           (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_unpacklo_epi64(__A, __B),
+                                        (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_unpacklo_epi64(__A, __B),
+                                        (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sra_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sra_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sra_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sra_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srai_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srai_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srai_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srai_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
+                                             (__v2di)_mm_sra_epi64(__A, __B), \
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
+                                             (__v2di)_mm_sra_epi64(__A, __B), \
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi64(__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+                                           (__v4di)_mm256_sra_epi64(__A, __B), \
+                                           (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+                                           (__v4di)_mm256_sra_epi64(__A, __B), \
+                                           (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi64(__m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
+                                           (__v2di)_mm_srai_epi64(__A, __imm), \
+                                           (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
+                                           (__v2di)_mm_srai_epi64(__A, __imm), \
+                                           (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi64(__m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+                                        (__v4di)_mm256_srai_epi64(__A, __imm), \
+                                        (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+                                        (__v4di)_mm256_srai_epi64(__A, __imm), \
+                                        (__v4di)_mm256_setzero_si256());
+}
+
+#define _mm_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+                                            (__v4si)(__m128i)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+                                            (__v4si)(__m128i)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \
+                                             (__v4si)(__m128i)(B), \
+                                             (__v4si)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+                                            (__v8si)(__m256i)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+                                            (__v8si)(__m256i)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \
+                                             (__v8si)(__m256i)(B), \
+                                             (__v8si)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+                                            (__v2di)(__m128i)(B), \
+                                            (__v2di)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+                                            (__v2di)(__m128i)(B), \
+                                            (__v2di)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \
+                                             (__v2di)(__m128i)(B), \
+                                             (__v2di)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+                                            (__v4di)(__m256i)(B), \
+                                            (__v4di)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+                                            (__v4di)(__m256i)(B), \
+                                            (__v4di)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \
+                                             (__v4di)(__m256i)(B), \
+                                             (__v4di)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+
+
+#define _mm256_shuffle_f32x4(A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
+                                             (__v8sf)(__m256)(B), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
+                                             (__v8sf)(__m256)(B), (int)(imm), \
+                                             (__v8sf)(__m256)(W), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
+                                             (__v8sf)(__m256)(B), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_shuffle_f64x2(A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
+                                              (__v4df)(__m256d)(B), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)-1); })
+
+#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
+                                              (__v4df)(__m256d)(B), \
+                                              (int)(imm), \
+                                              (__v4df)(__m256d)(W), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
+                                              (__v4df)(__m256d)(B), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_shuffle_i32x4(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
+                                              (__v8si)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v8si)_mm256_setzero_si256(), \
+                                              (__mmask8)-1); })
+
+#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
+                                              (__v8si)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v8si)(__m256i)(W), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
+                                              (__v8si)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v8si)_mm256_setzero_si256(), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_shuffle_i64x2(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
+                                              (__v4di)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v4di)_mm256_setzero_si256(), \
+                                              (__mmask8)-1); })
+
+#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
+                                              (__v4di)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v4di)(__m256i)(W), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
+                                              (__v4di)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v4di)_mm256_setzero_si256(), \
+                                              (__mmask8)(U)); })
+
+#define _mm_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_shuffle_pd((A), (B), (M)), \
+                                       (__v2df)(__m128d)(W)); })
+
+#define _mm_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_shuffle_pd((A), (B), (M)), \
+                                       (__v2df)_mm_setzero_pd()); })
+
+#define _mm256_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
+                                       (__v4df)(__m256d)(W)); })
+
+#define _mm256_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
+                                       (__v4df)_mm256_setzero_pd()); })
+
+#define _mm_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
+                                      (__v4sf)(__m128)(W)); })
+
+#define _mm_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
+                                      (__v4sf)_mm_setzero_ps()); })
+
+#define _mm256_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
+                                      (__v8sf)(__m256)(W)); })
+
+#define _mm256_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
+                                      (__v8sf)_mm256_setzero_ps()); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rsqrt14_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_rsqrt14_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt14_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_rsqrt14_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_f32x4 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                (__v8sf)_mm256_undefined_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                (__v8sf) __O,
+                __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                (__v8sf) _mm256_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i32x4 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
+                 (__v8si)_mm256_undefined_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
+                 (__v8si)
+                 __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
+                 __A,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m256d)__builtin_ia32_selectpd_256(__M,
+                                              (__v4df) _mm256_broadcastsd_pd(__A),
+                                              (__v4df) __O);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+  return (__m256d)__builtin_ia32_selectpd_256(__M,
+                                              (__v4df) _mm256_broadcastsd_pd(__A),
+                                              (__v4df) _mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128(__M,
+                                             (__v4sf) _mm_broadcastss_ps(__A),
+                                             (__v4sf) __O);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128(__M,
+                                             (__v4sf) _mm_broadcastss_ps(__A),
+                                             (__v4sf) _mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256(__M,
+                                             (__v8sf) _mm256_broadcastss_ps(__A),
+                                             (__v8sf) __O);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256(__M,
+                                             (__v8sf) _mm256_broadcastss_ps(__A),
+                                             (__v8sf) _mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__M,
+                                             (__v4si) _mm_broadcastd_epi32(__A),
+                                             (__v4si) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__M,
+                                             (__v4si) _mm_broadcastd_epi32(__A),
+                                             (__v4si) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__M,
+                                             (__v8si) _mm256_broadcastd_epi32(__A),
+                                             (__v8si) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__M,
+                                             (__v8si) _mm256_broadcastd_epi32(__A),
+                                             (__v8si) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                             (__v2di) _mm_broadcastq_epi64(__A),
+                                             (__v2di) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                             (__v2di) _mm_broadcastq_epi64(__A),
+                                             (__v2di) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                             (__v4di) _mm256_broadcastq_epi64(__A),
+                                             (__v4di) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                             (__v4di) _mm256_broadcastq_epi64(__A),
+                                             (__v4di) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+               (__v8hi)_mm_setzero_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+               (__v8hi)__O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+               (__v8hi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+               (__v4si)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+               (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+               (__v4si) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+               (__v4si)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+               (__v4si)__O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+               (__v4si) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+               (__v8hi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+               (__v8hi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                (__v8hi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                (__v8hi) _mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                (__v4si)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                (__v4si) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                (__v4si)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                (__v4si) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                (__v8hi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                (__v8hi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  return __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+              (__v16qi)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+              (__v16qi)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+              (__v16qi)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+              (__v8hi)_mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_storeu_epi16 (void *  __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+              (__v16qi) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+              (__v16qi) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+              (__v4si)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+              (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+              (__v4si) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+              (__v4si) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+              (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+              (__v4si) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+              (__v8hi) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+              (__v8hi)__O,
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+              (__v8hi)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
+#define _mm256_extractf32x4_ps(A, imm) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v8sf)(__m256)(A),           \
+                                  (__v8sf)_mm256_undefined_ps(), \
+                                  ((imm) & 1) ? 4 : 0,           \
+                                  ((imm) & 1) ? 5 : 1,           \
+                                  ((imm) & 1) ? 6 : 2,           \
+                                  ((imm) & 1) ? 7 : 3); })
+
+#define _mm256_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                   (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
+                                   (__v4sf)(W)); })
+
+#define _mm256_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                   (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
+                                   (__v4sf)_mm_setzero_ps()); })
+
+#define _mm256_extracti32x4_epi32(A, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8si)(__m256)(A),              \
+                                   (__v8si)_mm256_undefined_si256(), \
+                                   ((imm) & 1) ? 4 : 0,              \
+                                   ((imm) & 1) ? 5 : 1,              \
+                                   ((imm) & 1) ? 6 : 2,              \
+                                   ((imm) & 1) ? 7 : 3); })
+
+#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
+                                (__v4si)(W)); })
+
+#define _mm256_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
+                                (__v4si)_mm_setzero_si128()); })
+
+#define _mm256_insertf32x4(A, B, imm) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(A), \
+                                  (__v8sf)_mm256_castps128_ps256((__m128)(B)), \
+                                  ((imm) & 0x1) ?  0 :  8, \
+                                  ((imm) & 0x1) ?  1 :  9, \
+                                  ((imm) & 0x1) ?  2 : 10, \
+                                  ((imm) & 0x1) ?  3 : 11, \
+                                  ((imm) & 0x1) ?  8 :  4, \
+                                  ((imm) & 0x1) ?  9 :  5, \
+                                  ((imm) & 0x1) ? 10 :  6, \
+                                  ((imm) & 0x1) ? 11 :  7); })
+
+#define _mm256_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                  (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+                                  (__v8sf)(W)); })
+
+#define _mm256_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                  (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+                                  (__v8sf)_mm256_setzero_ps()); })
+
+#define _mm256_inserti32x4(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(A), \
+                                 (__v8si)_mm256_castsi128_si256((__m128i)(B)), \
+                                 ((imm) & 0x1) ?  0 :  8, \
+                                 ((imm) & 0x1) ?  1 :  9, \
+                                 ((imm) & 0x1) ?  2 : 10, \
+                                 ((imm) & 0x1) ?  3 : 11, \
+                                 ((imm) & 0x1) ?  8 :  4, \
+                                 ((imm) & 0x1) ?  9 :  5, \
+                                 ((imm) & 0x1) ? 10 :  6, \
+                                 ((imm) & 0x1) ? 11 :  7); })
+
+#define _mm256_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                  (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+                                  (__v8si)(W)); })
+
+#define _mm256_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                  (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+                                  (__v8si)_mm256_setzero_si256()); })
+
+#define _mm_getmant_pd(A, B, C) __extension__({\
+  (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v2df)_mm_setzero_pd(), \
+                                            (__mmask8)-1); })
+
+#define _mm_mask_getmant_pd(W, U, A, B, C) __extension__({\
+  (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v2df)(__m128d)(W), \
+                                            (__mmask8)(U)); })
+
+#define _mm_maskz_getmant_pd(U, A, B, C) __extension__({\
+  (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v2df)_mm_setzero_pd(), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_getmant_pd(A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v4df)_mm256_setzero_pd(), \
+                                            (__mmask8)-1); })
+
+#define _mm256_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v4df)(__m256d)(W), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v4df)_mm256_setzero_pd(), \
+                                            (__mmask8)(U)); })
+
+#define _mm_getmant_ps(A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v4sf)_mm_setzero_ps(), \
+                                           (__mmask8)-1); })
+
+#define _mm_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v4sf)(__m128)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v4sf)_mm_setzero_ps(), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_getmant_ps(A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)-1); })
+
+#define _mm256_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v8sf)(__m256)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)(U)); })
+
+#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \
+                                        (double const *)(addr), \
+                                        (__v2di)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \
+                                        (long long const *)(addr), \
+                                        (__v2di)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \
+                                        (double const *)(addr), \
+                                        (__v4di)(__m256i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \
+                                        (long long const *)(addr), \
+                                        (__v4di)(__m256i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v2di)(__m128i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v2di)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v4di)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v4di)(__m256i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \
+                                        (double const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \
+                                        (long long const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \
+                                        (double const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \
+                                        (long long const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v4si)(__m128i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v8si)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v8si)(__m256i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_permutex_pd(X, C) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(X), \
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+
+#define _mm256_mask_permutex_pd(W, U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permutex_pd((X), (C)), \
+                                       (__v4df)(__m256d)(W)); })
+
+#define _mm256_maskz_permutex_pd(U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permutex_pd((X), (C)), \
+                                       (__v4df)_mm256_setzero_pd()); })
+
+#define _mm256_permutex_epi64(X, C) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(X), \
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+
+#define _mm256_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                      (__v4di)_mm256_permutex_epi64((X), (C)), \
+                                      (__v4di)(__m256i)(W)); })
+
+#define _mm256_maskz_permutex_epi64(U, X, C) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                      (__v4di)_mm256_permutex_epi64((X), (C)), \
+                                      (__v4di)_mm256_setzero_si256()); })
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                 (__v4di) __X,
+                 (__v4df) _mm256_undefined_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
+          __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                 (__v4di) __X,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                 (__v4di) __X,
+                 (__v4df) _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                 (__v4di) __X,
+                 (__v4di) _mm256_setzero_si256 (),
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                 (__v4di) __X,
+                 (__v4di) _mm256_undefined_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
+             __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                 (__v4di) __X,
+                 (__v4di) __W,
+                 __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
+          __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                (__v8si) __X,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                (__v8si) __X,
+                (__v8sf) _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutexvar_ps (__m256i __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                (__v8si) __X,
+                (__v8sf) _mm256_undefined_si256 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                 (__v8si) __X,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+             __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                 (__v8si) __X,
+                 (__v8si) __W,
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                 (__v8si) __X,
+                 (__v8si) _mm256_undefined_si256(),
+                 (__mmask8) -1);
+}
+
+#define _mm_alignr_epi32(A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(B), \
+                                   (__v4si)(__m128i)(A), \
+                                   ((int)(imm) & 0x3) + 0, \
+                                   ((int)(imm) & 0x3) + 1, \
+                                   ((int)(imm) & 0x3) + 2, \
+                                   ((int)(imm) & 0x3) + 3); })
+
+#define _mm_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                    (__v4si)(__m128i)(W)); })
+
+#define _mm_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                    (__v4si)_mm_setzero_si128()); })
+
+#define _mm256_alignr_epi32(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(B), \
+                                   (__v8si)(__m256i)(A), \
+                                   ((int)(imm) & 0x7) + 0, \
+                                   ((int)(imm) & 0x7) + 1, \
+                                   ((int)(imm) & 0x7) + 2, \
+                                   ((int)(imm) & 0x7) + 3, \
+                                   ((int)(imm) & 0x7) + 4, \
+                                   ((int)(imm) & 0x7) + 5, \
+                                   ((int)(imm) & 0x7) + 6, \
+                                   ((int)(imm) & 0x7) + 7); })
+
+#define _mm256_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+                                 (__v8si)(__m256i)(W)); })
+
+#define _mm256_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+                                 (__v8si)_mm256_setzero_si256()); })
+
+#define _mm_alignr_epi64(A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v2di)(__m128i)(B), \
+                                   (__v2di)(__m128i)(A), \
+                                   ((int)(imm) & 0x1) + 0, \
+                                   ((int)(imm) & 0x1) + 1); })
+
+#define _mm_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+                                    (__v2di)(__m128i)(W)); })
+
+#define _mm_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+                                    (__v2di)_mm_setzero_di()); })
+
+#define _mm256_alignr_epi64(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(B), \
+                                   (__v4di)(__m256i)(A), \
+                                   ((int)(imm) & 0x3) + 0, \
+                                   ((int)(imm) & 0x3) + 1, \
+                                   ((int)(imm) & 0x3) + 2, \
+                                   ((int)(imm) & 0x3) + 3); })
+
+#define _mm256_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+                                 (__v4di)(__m256i)(W)); })
+
+#define _mm256_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+                                 (__v4di)_mm256_setzero_si256()); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_movehdup_ps(__A),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_movehdup_ps(__A),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_movehdup_ps(__A),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_movehdup_ps(__A),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_moveldup_ps(__A),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_moveldup_ps(__A),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_moveldup_ps(__A),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_moveldup_ps(__A),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+#define _mm256_mask_shuffle_epi32(W, U, A, I) __extension__({\
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                      (__v8si)_mm256_shuffle_epi32((A), (I)), \
+                                      (__v8si)(__m256i)(W)); })
+
+#define _mm256_maskz_shuffle_epi32(U, A, I) __extension__({\
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                      (__v8si)_mm256_shuffle_epi32((A), (I)), \
+                                      (__v8si)_mm256_setzero_si256()); })
+
+#define _mm_mask_shuffle_epi32(W, U, A, I) __extension__({\
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shuffle_epi32((A), (I)), \
+                                      (__v4si)(__m128i)(W)); })
+
+#define _mm_maskz_shuffle_epi32(U, A, I) __extension__({\
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shuffle_epi32((A), (I)), \
+                                      (__v4si)_mm_setzero_si128()); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
+              (__v2df) __A,
+              (__v2df) __W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mov_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
+              (__v2df) __A,
+              (__v2df) _mm_setzero_pd ());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
+              (__v4df) __A,
+              (__v4df) __W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
+              (__v4df) __A,
+              (__v4df) _mm256_setzero_pd ());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
+             (__v4sf) __A,
+             (__v4sf) __W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mov_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
+             (__v4sf) __A,
+             (__v4sf) _mm_setzero_ps ());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
+             (__v8sf) __A,
+             (__v8sf) __W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
+             (__v8sf) __A,
+             (__v8sf) _mm256_setzero_ps ());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+             (__v4sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION,
+                                                  (__v8hi) __W,
+                                                  (__mmask8) __U);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION,
+                                                  (__v8hi) _mm_setzero_si128 (),
+                                                  (__mmask8) __U);
+}
+
+#define _mm_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
+                                         (__v8hi)(__m128i)(W), \
+                                         (__mmask8)(U)); })
+
+#define _mm_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
+                                         (__v8hi)_mm_setzero_si128(), \
+                                         (__mmask8)(U)); })
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION,
+                                                      (__v8hi) __W,
+                                                      (__mmask8) __U);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION,
+                                                      (__v8hi) _mm_setzero_si128(),
+                                                      (__mmask8) __U);
+}
+#define _mm256_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
+                                            (__v8hi)(__m128i)(W), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
+                                            (__v8hi)_mm_setzero_si128(), \
+                                            (__mmask8)(U)); })
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avxintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avxintrin.h
new file mode 100644
index 000000000..be03ba346
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/avxintrin.h
@@ -0,0 +1,4894 @@
+/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXINTRIN_H
+#define __AVXINTRIN_H
+
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+
+/* Unsigned types */
+typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
+typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
+typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
+typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v32qs __attribute__((__vector_size__(32)));
+
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+typedef double __m256d __attribute__((__vector_size__(32)));
+typedef long long __m256i __attribute__((__vector_size__(32)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
+
+/* Arithmetic */
+/// \brief Adds two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the sums of both
+///    operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_add_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4df)__a+(__v4df)__b);
+}
+
+/// \brief Adds two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the sums of both
+///    operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_add_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8sf)__a+(__v8sf)__b);
+}
+
+/// \brief Subtracts two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the minuend.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the differences between
+///    both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4df)__a-(__v4df)__b);
+}
+
+/// \brief Subtracts two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the minuend.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the differences between
+///    both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8sf)__a-(__v8sf)__b);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the left source operand.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the right source operand.
+/// \returns A 256-bit vector of [4 x double] containing the alternating sums
+///    and differences between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_addsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the left source operand.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the right source operand.
+/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
+///    differences between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_addsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Divides two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the dividend.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the divisor.
+/// \returns A 256-bit vector of [4 x double] containing the quotients of both
+///    operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_div_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4df)__a/(__v4df)__b);
+}
+
+/// \brief Divides two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the dividend.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the divisor.
+/// \returns A 256-bit vector of [8 x float] containing the quotients of both
+///    operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_div_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8sf)__a/(__v8sf)__b);
+}
+
+/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the maximum values
+///    between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_max_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the maximum values
+///    between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_max_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the minimum values
+///    between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_min_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the minimum values
+///    between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_min_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Multiplies two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the products of both
+///    operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_mul_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4df)__a * (__v4df)__b);
+}
+
+/// \brief Multiplies two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the products of both
+///    operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_mul_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8sf)__a * (__v8sf)__b);
+}
+
+/// \brief Calculates the square roots of the values in a 256-bit vector of
+///    [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the square roots of the
+///    values in the operand.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sqrt_pd(__m256d __a)
+{
+  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+}
+
+/// \brief Calculates the square roots of the values in a 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the square roots of the
+///    values in the operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+}
+
+/// \brief Calculates the reciprocal square roots of the values in a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
+///    roots of the values in the operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rsqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
+}
+
+/// \brief Calculates the reciprocals of the values in a 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
+///    values in the operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rcp_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
+}
+
+/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
+///    by the byte operand. The source values are rounded to integer values and
+///    returned as 64-bit double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_round_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used. \n
+///      1: The PE field is not updated. \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M. \n
+///      1: Use the current MXCSR setting. \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest. \n
+///      01: Downward (toward negative infinity). \n
+///      10: Upward (toward positive infinity). \n
+///      11: Truncated.
+/// \returns A 256-bit vector of [4 x double] containing the rounded values.
+#define _mm256_round_pd(V, M) __extension__ ({ \
+    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
+
+/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
+///    specified by the byte operand. The source values are rounded to integer
+///    values and returned as floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_round_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used. \n
+///      1: The PE field is not updated. \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M. \n
+///      1: Use the current MXCSR setting. \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest. \n
+///      01: Downward (toward negative infinity). \n
+///      10: Upward (toward positive infinity). \n
+///      11: Truncated.
+/// \returns A 256-bit vector of [8 x float] containing the rounded values.
+#define _mm256_round_ps(V, M) __extension__ ({ \
+  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
+
+/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
+///    source values are rounded up to integer values and returned as 64-bit
+///    double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_ceil_pd(__m256d V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
+#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
+
+/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
+///    The source values are rounded down to integer values and returned as
+///    64-bit double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_floor_pd(__m256d V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the rounded down
+///    values.
+#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
+
+/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
+///    source values are rounded up to integer values and returned as
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_ceil_ps(__m256 V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
+#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
+
+/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
+///    source values are rounded down to integer values and returned as
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_floor_ps(__m256 V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
+#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
+
+/* Logical */
+/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+///    values between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_and_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4du)__a & (__v4du)__b);
+}
+
+/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+///    values between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_and_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8su)__a & (__v8su)__b);
+}
+
+/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
+///    the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the left source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the right source operand.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+///    values of the second operand and the one's complement of the first
+///    operand.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_andnot_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)(~(__v4du)__a & (__v4du)__b);
+}
+
+/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
+///    the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the left source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the right source operand.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+///    values of the second operand and the one's complement of the first
+///    operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_andnot_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)(~(__v8su)__a & (__v8su)__b);
+}
+
+/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VORPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
+///    values between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_or_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4du)__a | (__v4du)__b);
+}
+
+/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VORPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
+///    values between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_or_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8su)__a | (__v8su)__b);
+}
+
+/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
+///    values between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_xor_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4du)__a ^ (__v4du)__b);
+}
+
+/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
+///    values between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_xor_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8su)__a ^ (__v8su)__b);
+}
+
+/* Horizontal arithmetic */
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal sums of the values are returned in the even-indexed
+///    elements of a vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal sums of the values are returned in the odd-indexed
+///    elements of a vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
+///    both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hadd_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal sums of the values are returned in the elements with
+///    index 0, 1, 4, 5 of a vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal sums of the values are returned in the elements with
+///    index 2, 3, 6, 7 of a vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
+///    both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hadd_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    even-indexed elements of a vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    odd-indexed elements of a vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the horizontal
+///    differences of both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the horizontal
+///    differences of both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/* Vector permutations */
+/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
+///    by the 128-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __c
+///    A 128-bit integer vector operand specifying how the values are to be
+///    copied. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [65]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm_permutevar_pd(__m128d __a, __m128i __c)
+{
+  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
+}
+
+/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
+///    by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __c
+///    A 256-bit integer vector operand specifying how the values are to be
+///    copied. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [65]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///    Bit [129]: \n
+///      0: Bits [191:128] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///    Bit [193]: \n
+///      0: Bits [191:128] of the source are copied to bits [255:192] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [255:192] of the
+///    returned vector.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_permutevar_pd(__m256d __a, __m256i __c)
+{
+  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
+}
+
+/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
+///    specified by the 128-bit integer vector operand.
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __c
+///    A 128-bit integer vector operand specifying how the values are to be
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [33:32]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [65:64]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [97:96]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_permutevar_ps(__m128 __a, __m128i __c)
+{
+  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
+}
+
+/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
+///    specified by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __c
+///    A 256-bit integer vector operand specifying how the values are to be
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [33:32]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [65:64]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [97:96]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///    Bits [129:128]: \n
+///      00: Bits [159:128] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///    Bits [161:160]: \n
+///      00: Bits [159:128] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///    Bits [193:192]: \n
+///      00: Bits [159:128] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///    Bits [225:224]: \n
+///      00: Bits [159:128] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [255:224] of the
+///          returned vector.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar_ps(__m256 __a, __m256i __c)
+{
+  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
+}
+
+/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
+///    by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_permute_pd(__m128d A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param A
+///    A 128-bit vector of [2 x double].
+/// \param C
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bit [0]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
+#define _mm_permute_pd(A, C) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
+                                   (__v2df)_mm_undefined_pd(), \
+                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
+
+/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
+///    the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_permute_pd(__m256d A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param A
+///    A 256-bit vector of [4 x double].
+/// \param C
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bit [0]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///    Bit [2]: \n
+///      0: Bits [191:128] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///    Bit [3]: \n
+///      0: Bits [191:128] of the source are copied to bits [255:192] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [255:192] of the
+///         returned vector.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+#define _mm256_permute_pd(A, C) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x1), \
+                                   0 + (((C) >> 1) & 0x1), \
+                                   2 + (((C) >> 2) & 0x1), \
+                                   2 + (((C) >> 3) & 0x1)); })
+
+/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
+///    the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_permute_ps(__m128 A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param A
+///    A 128-bit vector of [4 x float].
+/// \param C
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [3:2]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [5:4]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [7:6]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
+#define _mm_permute_ps(A, C) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
+                                  (__v4sf)_mm_undefined_ps(), \
+                                  ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                  ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+
+/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
+///    the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_permute_ps(__m256 A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param A
+///    A 256-bit vector of [8 x float].
+/// \param C
+///    An immediate integer operand specifying how the values are to be \n
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [3:2]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [5:4]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [7:6]: \n
+///      00: Bits [31:qq0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///    Bits [1:0]: \n
+///      00: Bits [159:128] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///    Bits [3:2]: \n
+///      00: Bits [159:128] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///    Bits [5:4]: \n
+///      00: Bits [159:128] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///    Bits [7:6]: \n
+///      00: Bits [159:128] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [255:224] of the
+///          returned vector.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+#define _mm256_permute_ps(A, C) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
+                                  (__v8sf)_mm256_undefined_ps(), \
+                                  0 + (((C) >> 0) & 0x3), \
+                                  0 + (((C) >> 2) & 0x3), \
+                                  0 + (((C) >> 4) & 0x3), \
+                                  0 + (((C) >> 6) & 0x3), \
+                                  4 + (((C) >> 0) & 0x3), \
+                                  4 + (((C) >> 2) & 0x3), \
+                                  4 + (((C) >> 4) & 0x3), \
+                                  4 + (((C) >> 6) & 0x3)); })
+
+/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
+///    [4 x double], as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double].
+/// \param V2
+///    A 256-bit vector of [4 x double.
+/// \param M
+///    An immediate integer operand specifying how the values are to be
+///    permuted. \n
+///    Bits [1:0]: \n
+///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+///          destination. \n
+///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+///          destination. \n
+///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+///          destination. \n
+///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+///          destination. \n
+///    Bits [5:4]: \n
+///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+///          destination. \n
+///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+///          destination. \n
+///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+///          destination. \n
+///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+///          destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+                                           (__v4df)(__m256d)(V2), (M)); })
+
+/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
+///    [8 x float], as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float].
+/// \param V2
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An immediate integer operand specifying how the values are to be
+///    permuted. \n
+///    Bits [1:0]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    Bits [5:4]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+///    destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+                                          (__v8sf)(__m256)(V2), (M)); })
+
+/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
+///    as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit integer vector.
+/// \param V2
+///    A 256-bit integer vector.
+/// \param M
+///    An immediate integer operand specifying how the values are to be copied.
+///    Bits [1:0]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    Bits [5:4]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+///    destination.
+/// \returns A 256-bit integer vector containing the copied values.
+#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+                                           (__v8si)(__m256i)(V2), (M)); })
+
+/* Vector Blend */
+/// \brief Merges 64-bit double-precision data values stored in either of the
+///    two 256-bit vectors of [4 x double], as specified by the immediate
+///    integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double].
+/// \param V2
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An immediate integer operand, with mask bits [3:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
+///    element in operand \a V1 is copied to the same position in the
+///    destination. When a mask bit is 1, the corresponding 64-bit element in
+///    operand \a V2 is copied to the same position in the destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
+                                   (__v4df)(__m256d)(V2), \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+/// \brief Merges 32-bit single-precision data values stored in either of the
+///    two 256-bit vectors of [8 x float], as specified by the immediate
+///    integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float].
+/// \param V2
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An immediate integer operand, with mask bits [7:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
+///    element in operand \a V1 is copied to the same position in the
+///    destination. When a mask bit is 1, the corresponding 32-bit element in
+///    operand \a V2 is copied to the same position in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
+                                  (__v8sf)(__m256)(V2), \
+                                  (((M) & 0x01) ?  8 : 0), \
+                                  (((M) & 0x02) ?  9 : 1), \
+                                  (((M) & 0x04) ? 10 : 2), \
+                                  (((M) & 0x08) ? 11 : 3), \
+                                  (((M) & 0x10) ? 12 : 4), \
+                                  (((M) & 0x20) ? 13 : 5), \
+                                  (((M) & 0x40) ? 14 : 6), \
+                                  (((M) & 0x80) ? 15 : 7)); })
+
+/// \brief Merges 64-bit double-precision data values stored in either of the
+///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \param __c
+///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
+///    how the values are to be copied. The position of the mask bit corresponds
+///    to the most significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 64-bit element in operand \a __a is copied to the same
+///    position in the destination. When a mask bit is 1, the corresponding
+///    64-bit element in operand \a __b is copied to the same position in the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
+{
+  return (__m256d)__builtin_ia32_blendvpd256(
+    (__v4df)__a, (__v4df)__b, (__v4df)__c);
+}
+
+/// \brief Merges 32-bit single-precision data values stored in either of the
+///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \param __c
+///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
+///    and 31 specifying how the values are to be copied. The position of the
+///    mask bit corresponds to the most significant bit of a copied value. When
+///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
+///    copied to the same position in the destination. When a mask bit is 1, the
+///    corresponding 32-bit element in operand \a __b is copied to the same
+///    position in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+{
+  return (__m256)__builtin_ia32_blendvps256(
+    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
+}
+
+/* Vector Dot Product */
+/// \brief Computes two dot products in parallel, using the lower and upper
+///    halves of two [8 x float] vectors as input to the two computations, and
+///    returning the two dot products in the lower and upper halves of the
+///    [8 x float] result. The immediate integer operand controls which input
+///    elements will contribute to the dot product, and where the final results
+///    are returned. In general, for each dot product, the four corresponding
+///    elements of the input vectors are multiplied; the first two and second
+///    two products are summed, then the two sums are added to form the final
+///    result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
+///
+/// \param V1
+///    A vector of [8 x float] values, treated as two [4 x float] vectors.
+/// \param V2
+///    A vector of [8 x float] values, treated as two [4 x float] vectors.
+/// \param M
+///    An immediate integer argument. Bits [7:4] determine which elements of
+///    the input vectors are used, with bit [4] corresponding to the lowest
+///    element and bit [7] corresponding to the highest element of each [4 x
+///    float] subvector. If a bit is set, the corresponding elements from the
+///    two input vectors are used as an input for dot product; otherwise that
+///    input is treated as zero. Bits [3:0] determine which elements of the
+///    result will receive a copy of the final dot product, with bit [0]
+///    corresponding to the lowest element and bit [3] corresponding to the
+///    highest element of each [4 x float] subvector. If a bit is set, the dot
+///    product is returned in the corresponding element; otherwise that element
+///    is set to zero. The bitmask is applied in the same way to each of the
+///    two parallel dot product computations.
+/// \returns A 256-bit vector of [8 x float] containing the two dot products.
+#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+                                 (__v8sf)(__m256)(V2), (M)); })
+
+/* Vector shuffle */
+/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
+///    specified by the immediate value operand. The four selected elements in
+///    each operand are copied to the destination according to the bits
+///    specified in the immediate operand. The selected elements from the first
+///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
+///    destination, and the selected elements from the second 256-bit operand
+///    are copied to bits [127:64] and bits [255:192] of the destination. For
+///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
+///    the 256-bit destination vector would contain the following values: b[7],
+///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float]. The four selected elements in this
+///    operand are copied to bits [63:0] and bits [191:128] in the destination,
+///    according to the bits specified in the immediate operand.
+/// \param b
+///    A 256-bit vector of [8 x float]. The four selected elements in this
+///    operand are copied to bits [127:64] and bits [255:192] in the
+///    destination, according to the bits specified in the immediate operand.
+/// \param mask
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from \a a and \a b \n.
+///    Bits [3:0] specify the values copied from operand \a a. \n
+///    Bits [7:4] specify the values copied from operand \a b. \n
+///    The destinations within the 256-bit destination are assigned values as
+///    follows, according to the bit value assignments described below: \n
+///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
+///    destination. \n
+///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
+///    destination. \n
+///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
+///    destination. \n
+///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
+///    the destination. \n
+///    Bit value assignments: \n
+///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
+///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
+///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
+///    11: Bits [127:96] and [255:224] are copied from the selected operand.
+/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
+#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), \
+                                  0  + (((mask) >> 0) & 0x3), \
+                                  0  + (((mask) >> 2) & 0x3), \
+                                  8  + (((mask) >> 4) & 0x3), \
+                                  8  + (((mask) >> 6) & 0x3), \
+                                  4  + (((mask) >> 0) & 0x3), \
+                                  4  + (((mask) >> 2) & 0x3), \
+                                  12 + (((mask) >> 4) & 0x3), \
+                                  12 + (((mask) >> 6) & 0x3)); })
+
+/// \brief Selects four double-precision values from the 256-bit operands of
+///    [4 x double], as specified by the immediate value operand. The selected
+///    elements from the first 256-bit operand are copied to bits [63:0] and
+///    bits [191:128] in the destination, and the selected elements from the
+///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
+///    the destination. For example, if bits [3:0] of the immediate operand
+///    contain a value of 0xF, the 256-bit destination vector would contain the
+///    following values: b[3], a[3], b[1], a[1].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double].
+/// \param b
+///    A 256-bit vector of [4 x double].
+/// \param mask
+///    An immediate value containing 8-bit values specifying which elements to
+///    copy from \a a and \a b: \n
+///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
+///    destination. \n
+///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
+///    destination. \n
+///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
+///    destination. \n
+///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
+///    destination. \n
+///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
+///    destination. \n
+///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
+///    destination. \n
+///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
+///    destination. \n
+///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
+#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), \
+                                   0 + (((mask) >> 0) & 0x1), \
+                                   4 + (((mask) >> 1) & 0x1), \
+                                   2 + (((mask) >> 2) & 0x1), \
+                                   6 + (((mask) >> 3) & 0x1)); })
+
+/* Compare */
+#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
+#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
+#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
+#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
+#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
+#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
+#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
+#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
+#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
+#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
+#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
+#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
+#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
+#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
+#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
+#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
+#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
+#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
+#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
+#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
+#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
+#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
+
+/// \brief Compares each of the corresponding double-precision values of two
+///    128-bit vectors of [2 x double], using the operation specified by the
+///    immediate integer operand. Returns a [2 x double] vector consisting of
+///    two doubles corresponding to the two comparison results: zero if the
+///    comparison is false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
+#define _mm_cmp_pd(a, b, c) __extension__ ({ \
+  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
+
+/// \brief Compares each of the corresponding values of two 128-bit vectors of
+///    [4 x float], using the operation specified by the immediate integer
+///    operand. Returns a [4 x float] vector consisting of four floats
+///    corresponding to the four comparison results: zero if the comparison is
+///    false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ps(a, b, c) __extension__ ({ \
+  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
+
+/// \brief Compares each of the corresponding double-precision values of two
+///    256-bit vectors of [4 x double], using the operation specified by the
+///    immediate integer operand. Returns a [4 x double] vector consisting of
+///    four doubles corresponding to the four comparison results: zero if the
+///    comparison is false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double].
+/// \param b
+///    A 256-bit vector of [4 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 256-bit vector of [4 x double] containing the comparison results.
+#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
+  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), (c)); })
+
+/// \brief Compares each of the corresponding values of two 256-bit vectors of
+///    [8 x float], using the operation specified by the immediate integer
+///    operand. Returns a [8 x float] vector consisting of eight floats
+///    corresponding to the eight comparison results: zero if the comparison is
+///    false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float].
+/// \param b
+///    A 256-bit vector of [8 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 256-bit vector of [8 x float] containing the comparison results.
+#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
+  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), (c)); })
+
+/// \brief Compares each of the corresponding scalar double-precision values of
+///    two 128-bit vectors of [2 x double], using the operation specified by the
+///    immediate integer operand. If the result is true, all 64 bits of the
+///    destination vector are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
+#define _mm_cmp_sd(a, b, c) __extension__ ({ \
+  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
+
+/// \brief Compares each of the corresponding scalar values of two 128-bit
+///    vectors of [4 x float], using the operation specified by the immediate
+///    integer operand. If the result is true, all 32 bits of the destination
+///    vector are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ss(a, b, c) __extension__ ({ \
+  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
+
+/// \brief Takes a [8 x i32] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x i32].
+/// \param __imm
+///    An immediate integer operand with bits [2:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 32 bits of extended
+///    packed data.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi32(__m256i __a, const int __imm)
+{
+  __v8si __b = (__v8si)__a;
+  return __b[__imm & 7];
+}
+
+/// \brief Takes a [16 x i16] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 256-bit integer vector of [16 x i16].
+/// \param __imm
+///    An immediate integer operand with bits [3:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
+///    packed data.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi16(__m256i __a, const int __imm)
+{
+  __v16hi __b = (__v16hi)__a;
+  return (unsigned short)__b[__imm & 15];
+}
+
+/// \brief Takes a [32 x i8] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 256-bit integer vector of [32 x i8].
+/// \param __imm
+///    An immediate integer operand with bits [4:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
+///    packed data.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi8(__m256i __a, const int __imm)
+{
+  __v32qi __b = (__v32qi)__a;
+  return (unsigned char)__b[__imm & 31];
+}
+
+#ifdef __x86_64__
+/// \brief Takes a [4 x i64] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 256-bit integer vector of [4 x i64].
+/// \param __imm
+///    An immediate integer operand with bits [1:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 64-bit integer containing the extracted 64 bits of extended
+///    packed data.
+static __inline long long  __DEFAULT_FN_ATTRS
+_mm256_extract_epi64(__m256i __a, const int __imm)
+{
+  __v4di __b = (__v4di)__a;
+  return __b[__imm & 3];
+}
+#endif
+
+/// \brief Takes a [8 x i32] vector and replaces the vector element value
+///    indexed by the immediate constant operand by a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A vector of [8 x i32] to be used by the insert operation.
+/// \param __b
+///    An integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
+{
+  __v8si __c = (__v8si)__a;
+  __c[__imm & 7] = __b;
+  return (__m256i)__c;
+}
+
+
+/// \brief Takes a [16 x i16] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A vector of [16 x i16] to be used by the insert operation.
+/// \param __b
+///    An i16 integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
+{
+  __v16hi __c = (__v16hi)__a;
+  __c[__imm & 15] = __b;
+  return (__m256i)__c;
+}
+
+/// \brief Takes a [32 x i8] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A vector of [32 x i8] to be used by the insert operation.
+/// \param __b
+///    An i8 integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
+{
+  __v32qi __c = (__v32qi)__a;
+  __c[__imm & 31] = __b;
+  return (__m256i)__c;
+}
+
+#ifdef __x86_64__
+/// \brief Takes a [4 x i64] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A vector of [4 x i64] to be used by the insert operation.
+/// \param __b
+///    A 64-bit integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///     \a __imm with \a __b.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
+{
+  __v4di __c = (__v4di)__a;
+  __c[__imm & 3] = __b;
+  return (__m256i)__c;
+}
+#endif
+
+/* Conversion */
+/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_pd(__m128i __a)
+{
+  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
+}
+
+/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit vector of [8 x float] containing the converted values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_ps(__m256i __a)
+{
+  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
+}
+
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtpd_ps(__m256d __a)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
+}
+
+/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
+}
+
+/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
+///    x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtps_pd(__m128 __a)
+{
+  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
+}
+
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+///    x i32], truncating the result by rounding towards zero when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
+}
+
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+///    x i32]. When a conversion is inexact, the value returned is rounded
+///    according to the rounding control bits in the MXCSR register.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
+}
+
+/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
+///    truncating the result by rounding towards zero when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
+}
+
+static __inline double __DEFAULT_FN_ATTRS
+_mm256_cvtsd_f64(__m256d __a)
+{
+ return __a[0];
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_cvtsi256_si32(__m256i __a)
+{
+ __v8si __b = (__v8si)__a;
+ return __b[0];
+}
+
+static __inline float __DEFAULT_FN_ATTRS
+_mm256_cvtss_f32(__m256 __a)
+{
+ return __a[0];
+}
+
+/* Vector replicate */
+/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
+///    vector of [8 x float] to float values in a 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
+///    the return value. \n
+///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
+///    the return value. \n
+///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
+///    return value. \n
+///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
+///    return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+///    values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_movehdup_ps(__m256 __a)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
+}
+
+/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
+///    vector of [8 x float] to float values in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
+///    the return value. \n
+///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
+///    the return value. \n
+///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
+///    return value. \n
+///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
+///    return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+///    values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_moveldup_ps(__m256 __a)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+/// \brief Moves and duplicates double-precision floating point values from a
+///    256-bit vector of [4 x double] to double-precision values in a 256-bit
+///    vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double]. \n
+///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
+///    return value. \n
+///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
+///    the return value.
+/// \returns A 256-bit vector of [4 x double] containing the moved and
+///    duplicated values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_movedup_pd(__m256d __a)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
+}
+
+/* Unpack and Interleave */
+/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
+///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [127:64] are written to bits [63:0] of the return value. \n
+///    Bits [255:192] are written to bits [191:128] of the return value. \n
+/// \param __b
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [127:64] are written to bits [127:64] of the return value. \n
+///    Bits [255:192] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpackhi_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
+}
+
+/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
+///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [63:0] are written to bits [63:0] of the return value. \n
+///    Bits [191:128] are written to bits [191:128] of the return value.
+/// \param __b
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [63:0] are written to bits [127:64] of the return value. \n
+///    Bits [191:128] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpacklo_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
+}
+
+/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
+///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [95:64] are written to bits [31:0] of the return value. \n
+///    Bits [127:96] are written to bits [95:64] of the return value. \n
+///    Bits [223:192] are written to bits [159:128] of the return value. \n
+///    Bits [255:224] are written to bits [223:192] of the return value.
+/// \param __b
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [95:64] are written to bits [63:32] of the return value. \n
+///    Bits [127:96] are written to bits [127:96] of the return value. \n
+///    Bits [223:192] are written to bits [191:160] of the return value. \n
+///    Bits [255:224] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpackhi_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
+}
+
+/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
+///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [31:0] are written to bits [31:0] of the return value. \n
+///    Bits [63:32] are written to bits [95:64] of the return value. \n
+///    Bits [159:128] are written to bits [159:128] of the return value. \n
+///    Bits [191:160] are written to bits [223:192] of the return value.
+/// \param __b
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [31:0] are written to bits [63:32] of the return value. \n
+///    Bits [63:32] are written to bits [127:96] of the return value. \n
+///    Bits [159:128] are written to bits [191:160] of the return value. \n
+///    Bits [191:160] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpacklo_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
+}
+
+/* Bit Test */
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+///    element-by-element comparison of the double-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns the ZF flag in the EFLAGS register.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testz_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+///    element-by-element comparison of the double-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns the CF flag in the EFLAGS register.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+///    element-by-element comparison of the double-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testnzc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testz_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testnzc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+///    element-by-element comparison of the double-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+///    element-by-element comparison of the double-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+///    element-by-element comparison of the double-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+///    element-by-element comparison of the single-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+///    of the two source vectors and update the EFLAGS register as follows: \n
+///    If there is at least one pair of bits where both bits are 1, the ZF flag
+///    is set to 0. Otherwise the ZF flag is set to 1. \n
+///    If there is at least one pair of bits where the bit from the first source
+///    vector is 0 and the bit from the second source vector is 1, the CF flag
+///    is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __b
+///    A 256-bit integer vector.
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
+}
+
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+///    of the two source vectors and update the EFLAGS register as follows: \n
+///    If there is at least one pair of bits where both bits are 1, the ZF flag
+///    is set to 0. Otherwise the ZF flag is set to 1. \n
+///    If there is at least one pair of bits where the bit from the first source
+///    vector is 0 and the bit from the second source vector is 1, the CF flag
+///    is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __b
+///    A 256-bit integer vector.
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
+}
+
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+///    of the two source vectors and update the EFLAGS register as follows: \n
+///    If there is at least one pair of bits where both bits are 1, the ZF flag
+///    is set to 0. Otherwise the ZF flag is set to 1. \n
+///    If there is at least one pair of bits where the bit from the first source
+///    vector is 0 and the bit from the second source vector is 1, the CF flag
+///    is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __b
+///    A 256-bit integer vector.
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
+}
+
+/* Vector extract sign mask */
+/// \brief Extracts the sign bits of double-precision floating point elements
+///    in a 256-bit vector of [4 x double] and writes them to the lower order
+///    bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the double-precision
+///    floating point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [3:0].
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_pd(__m256d __a)
+{
+  return __builtin_ia32_movmskpd256((__v4df)__a);
+}
+
+/// \brief Extracts the sign bits of double-precision floating point elements
+///    in a 256-bit vector of [8 x float] and writes them to the lower order
+///    bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the double-precision floating
+///    point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [7:0].
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_ps(__m256 __a)
+{
+  return __builtin_ia32_movmskps256((__v8sf)__a);
+}
+
+/* Vector __zero */
+/// \brief Zeroes the contents of all XMM or YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_zeroall(void)
+{
+  __builtin_ia32_vzeroall();
+}
+
+/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_zeroupper(void)
+{
+  __builtin_ia32_vzeroupper();
+}
+
+/* Vector load with broadcast */
+/// \brief Loads a scalar single-precision floating point value from the
+///    specified address pointed to by \a __a and broadcasts it to the elements
+///    of a [4 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+///    The single-precision floating point value to be broadcast.
+/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
+///    equal to the broadcast value.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m128)(__v4sf){ __f, __f, __f, __f };
+}
+
+/// \brief Loads a scalar double-precision floating point value from the
+///    specified address pointed to by \a __a and broadcasts it to the elements
+///    of a [4 x double] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
+///
+/// \param __a
+///    The double-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
+///    equal to the broadcast value.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_sd(double const *__a)
+{
+  double __d = *__a;
+  return (__m256d)(__v4df){ __d, __d, __d, __d };
+}
+
+/// \brief Loads a scalar single-precision floating point value from the
+///    specified address pointed to by \a __a and broadcasts it to the elements
+///    of a [8 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+///    The single-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
+///    equal to the broadcast value.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
+/// \brief Loads the data from a 128-bit vector of [2 x double] from the
+///    specified address pointed to by \a __a and broadcasts it to 128-bit
+///    elements in a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+///    The 128-bit vector of [2 x double] to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
+///    equal to the broadcast value.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_pd(__m128d const *__a)
+{
+  return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
+}
+
+/// \brief Loads the data from a 128-bit vector of [4 x float] from the
+///    specified address pointed to by \a __a and broadcasts it to 128-bit
+///    elements in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+///    The 128-bit vector of [4 x float] to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
+///    equal to the broadcast value.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ps(__m128 const *__a)
+{
+  return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
+}
+
+/* SIMD load ops */
+/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
+///    memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location containing
+///    double-precision floating point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_load_pd(double const *__p)
+{
+  return *(__m256d *)__p;
+}
+
+/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
+///    memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location containing float values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_load_ps(float const *__p)
+{
+  return *(__m256 *)__p;
+}
+
+/// \brief Loads 4 double-precision floating point values from an unaligned
+///    memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing double-precision floating
+///    point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m256d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+/// \brief Loads 8 single-precision floating point values from an unaligned
+///    memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing single-precision floating
+///    point values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m256 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
+///    location pointed to by \a __p into elements of a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
+///    values.
+/// \returns A 256-bit integer vector containing the moved values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_load_si256(__m256i const *__p)
+{
+  return *__p;
+}
+
+/// \brief Loads 256 bits of integer data from an unaligned memory location
+///    pointed to by \a __p into a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu_si256(__m256i const *__p)
+{
+  struct __loadu_si256 {
+    __m256i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si256*)__p)->__v;
+}
+
+/// \brief Loads 256 bits of integer data from an unaligned memory location
+///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
+///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
+///    line boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_lddqu_si256(__m256i const *__p)
+{
+  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
+}
+
+/* SIMD store ops */
+/// \brief Stores double-precision floating point values from a 256-bit vector
+///    of [4 x double] to a 32-byte aligned memory location pointed to by
+///    \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will receive the
+///    double-precision floaing point values.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_pd(double *__p, __m256d __a)
+{
+  *(__m256d *)__p = __a;
+}
+
+/// \brief Stores single-precision floating point values from a 256-bit vector
+///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will receive the
+///    float values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_ps(float *__p, __m256 __a)
+{
+  *(__m256 *)__p = __a;
+}
+
+/// \brief Stores double-precision floating point values from a 256-bit vector
+///    of [4 x double] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the double-precision
+///    floating point values.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_pd(double *__p, __m256d __a)
+{
+  struct __storeu_pd {
+    __m256d __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_pd*)__p)->__v = __a;
+}
+
+/// \brief Stores single-precision floating point values from a 256-bit vector
+///    of [8 x float] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_ps(float *__p, __m256 __a)
+{
+  struct __storeu_ps {
+    __m256 __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_ps*)__p)->__v = __a;
+}
+
+/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
+///    aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will receive the
+///    integer values.
+/// \param __a
+///    A 256-bit integer vector containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_si256(__m256i *__p, __m256i __a)
+{
+  *__p = __a;
+}
+
+/// \brief Stores integer values from a 256-bit integer vector to an unaligned
+///    memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer values.
+/// \param __a
+///    A 256-bit integer vector containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_si256(__m256i *__p, __m256i __a)
+{
+  struct __storeu_si256 {
+    __m256i __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si256*)__p)->__v = __a;
+}
+
+/* Conditional load ops */
+/// \brief Conditionally loads double-precision floating point elements from a
+///    memory location pointed to by \a __p into a 128-bit vector of
+///    [2 x double], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the double-precision
+///    floating point values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each data element represents the mask bits. If a mask bit is zero, the
+///    corresponding value in the memory location is not loaded and the
+///    corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm_maskload_pd(double const *__p, __m128i __m)
+{
+  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
+}
+
+/// \brief Conditionally loads double-precision floating point elements from a
+///    memory location pointed to by \a __p into a 256-bit vector of
+///    [4 x double], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the double-precision
+///    floating point values.
+/// \param __m
+///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
+///    significant bit of each quadword element represents the mask bits. If a
+///    mask bit is zero, the corresponding value in the memory location is not
+///    loaded and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [4 x double] containing the loaded values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_maskload_pd(double const *__p, __m256i __m)
+{
+  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
+                                               (__v4di)__m);
+}
+
+/// \brief Conditionally loads single-precision floating point elements from a
+///    memory location pointed to by \a __p into a 128-bit vector of
+///    [4 x float], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the single-precision
+///    floating point values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each data element represents the mask bits. If a mask bit is zero, the
+///    corresponding value in the memory location is not loaded and the
+///    corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_maskload_ps(float const *__p, __m128i __m)
+{
+  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
+}
+
+/// \brief Conditionally loads single-precision floating point elements from a
+///    memory location pointed to by \a __p into a 256-bit vector of
+///    [8 x float], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the single-precision
+///    floating point values.
+/// \param __m
+///    A 256-bit integer vector of [8 x dword] containing the mask. The most
+///    significant bit of each dword element represents the mask bits. If a mask
+///    bit is zero, the corresponding value in the memory location is not loaded
+///    and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [8 x float] containing the loaded values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_maskload_ps(float const *__p, __m256i __m)
+{
+  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
+}
+
+/* Conditional store ops */
+/// \brief Moves single-precision floating point values from a 256-bit vector
+///    of [8 x float] to a memory location pointed to by \a __p, according to
+///    the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 256-bit integer vector of [8 x dword] containing the mask. The most
+///    significant bit of each dword element in the mask vector represents the
+///    mask bits. If a mask bit is zero, the corresponding value from vector
+///    \a __a is not stored and the corresponding field in the memory location
+///    pointed to by \a __p is not changed.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
+{
+  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
+}
+
+/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
+///    to a memory location pointed to by \a __p, according to the specified
+///    mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each field in the mask vector represents the mask bits. If a mask bit is
+///    zero, the corresponding value from vector \a __a is not stored and the
+///    corresponding field in the memory location pointed to by \a __p is not
+///    changed.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
+{
+  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
+}
+
+/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
+///    to a memory location pointed to by \a __p, according to the specified
+///    mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
+///    significant bit of each quadword element in the mask vector represents
+///    the mask bits. If a mask bit is zero, the corresponding value from vector
+///    __a is not stored and the corresponding field in the memory location
+///    pointed to by \a __p is not changed.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
+{
+  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
+}
+
+/// \brief Moves single-precision floating point values from a 128-bit vector
+///    of [4 x float] to a memory location pointed to by \a __p, according to
+///    the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each field in the mask vector represents the mask bits. If a mask bit is
+///    zero, the corresponding value from vector __a is not stored and the
+///    corresponding field in the memory location pointed to by \a __p is not
+///    changed.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
+{
+  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
+}
+
+/* Cacheability support ops */
+/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
+///    aligned memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
+///
+/// \param __a
+///    A pointer to a 32-byte aligned memory location that will receive the
+///    integer values.
+/// \param __b
+///    A 256-bit integer vector containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_si256(__m256i *__a, __m256i __b)
+{
+  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
+}
+
+/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
+///    to a 32-byte aligned memory location. To minimize caching, the data is
+///    flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
+///
+/// \param __a
+///    A pointer to a 32-byte aligned memory location that will receive the
+///    integer values.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_pd(double *__a, __m256d __b)
+{
+  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
+}
+
+/// \brief Moves single-precision floating point values from a 256-bit vector
+///    of [8 x float] to a 32-byte aligned memory location. To minimize
+///    caching, the data is flagged as non-temporal (unlikely to be used again
+///    soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-byte aligned memory location that will receive the
+///    single-precision floating point values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_ps(float *__p, __m256 __a)
+{
+  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
+}
+
+/* Create vectors */
+/// \brief Create a 256-bit vector of [4 x double] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [4 x double] containing undefined values.
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_undefined_pd(void)
+{
+  return (__m256d)__builtin_ia32_undef256();
+}
+
+/// \brief Create a 256-bit vector of [8 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [8 x float] containing undefined values.
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_undefined_ps(void)
+{
+  return (__m256)__builtin_ia32_undef256();
+}
+
+/// \brief Create a 256-bit integer vector with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit integer vector containing undefined values.
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_undefined_si256(void)
+{
+  return (__m256i)__builtin_ia32_undef256();
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [4 x double]
+///    initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A double-precision floating-point value used to initialize bits [255:192]
+///    of the result.
+/// \param __b
+///    A double-precision floating-point value used to initialize bits [191:128]
+///    of the result.
+/// \param __c
+///    A double-precision floating-point value used to initialize bits [127:64]
+///    of the result.
+/// \param __d
+///    A double-precision floating-point value used to initialize bits [63:0]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __d, __c, __b, __a };
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
+///    with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __a
+///    A single-precision floating-point value used to initialize bits [255:224]
+///    of the result.
+/// \param __b
+///    A single-precision floating-point value used to initialize bits [223:192]
+///    of the result.
+/// \param __c
+///    A single-precision floating-point value used to initialize bits [191:160]
+///    of the result.
+/// \param __d
+///    A single-precision floating-point value used to initialize bits [159:128]
+///    of the result.
+/// \param __e
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \param __f
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __g
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __h
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_ps(float __a, float __b, float __c, float __d,
+              float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+///    32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i4
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i5
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i6
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i7
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
+                 int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+///    16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __w15
+///    A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \param __w14
+///    A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w13
+///    A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w12
+///    A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w11
+///    A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w10
+///    A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w09
+///    A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w08
+///    A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w07
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w06
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w05
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w04
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w03
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w02
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w01
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w00
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
+                 short __w11, short __w10, short __w09, short __w08,
+                 short __w07, short __w06, short __w05, short __w04,
+                 short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
+    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+///    8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __b31
+///    An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \param __b30
+///    An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b29
+///    An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b28
+///    An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b27
+///    An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b26
+///    An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b25
+///    An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b24
+///    An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b23
+///    An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b22
+///    An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b21
+///    An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b20
+///    An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b19
+///    An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b18
+///    An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b17
+///    An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b16
+///    An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b09
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b08
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b07
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b06
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b05
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b04
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b03
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b02
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b01
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b00
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
+                char __b27, char __b26, char __b25, char __b24,
+                char __b23, char __b22, char __b21, char __b20,
+                char __b19, char __b18, char __b17, char __b16,
+                char __b15, char __b14, char __b13, char __b12,
+                char __b11, char __b10, char __b09, char __b08,
+                char __b07, char __b06, char __b05, char __b04,
+                char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
+  };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+///    64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \param __b
+///    A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __c
+///    A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __d
+///    A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __d, __c, __b, __a };
+}
+
+/* Create vectors with elements in reverse order */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double],
+///    initialized in reverse order with the specified double-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A double-precision floating-point value used to initialize bits [63:0]
+///    of the result.
+/// \param __b
+///    A double-precision floating-point value used to initialize bits [127:64]
+///    of the result.
+/// \param __c
+///    A double-precision floating-point value used to initialize bits [191:128]
+///    of the result.
+/// \param __d
+///    A double-precision floating-point value used to initialize bits [255:192]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __a, __b, __c, __d };
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float],
+///    initialized in reverse order with the specified single-precision
+///    float-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __a
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \param __b
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __c
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __d
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \param __e
+///    A single-precision floating-point value used to initialize bits [159:128]
+///    of the result.
+/// \param __f
+///    A single-precision floating-point value used to initialize bits [191:160]
+///    of the result.
+/// \param __g
+///    A single-precision floating-point value used to initialize bits [223:192]
+///    of the result.
+/// \param __h
+///    A single-precision floating-point value used to initialize bits [255:224]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_ps(float __a, float __b, float __c, float __d,
+               float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
+}
+
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i4
+///    A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i5
+///    A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i6
+///    A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i7
+///    A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
+                  int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
+}
+
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __w15
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w14
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w13
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w12
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w11
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w10
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w09
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w08
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w07
+///    A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w06
+///    A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w05
+///    A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w04
+///    A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w03
+///    A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w02
+///    A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w01
+///    A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w00
+///    A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
+       short __w11, short __w10, short __w09, short __w08,
+       short __w07, short __w06, short __w05, short __w04,
+       short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
+    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
+}
+
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __b31
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b30
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b29
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b28
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b27
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b26
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b25
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b24
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b23
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b22
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b21
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b20
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b19
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b18
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b17
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b16
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b09
+///    An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b08
+///    An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b07
+///    An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b06
+///    An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b05
+///    An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b04
+///    An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b03
+///    An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b02
+///    An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b01
+///    An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b00
+///    An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
+                 char __b27, char __b26, char __b25, char __b24,
+                 char __b23, char __b22, char __b21, char __b20,
+                 char __b19, char __b18, char __b17, char __b16,
+                 char __b15, char __b14, char __b13, char __b12,
+                 char __b11, char __b10, char __b09, char __b08,
+                 char __b07, char __b06, char __b05, char __b04,
+                 char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
+    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
+    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
+    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
+}
+
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \param __b
+///    A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __c
+///    A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __d
+///    A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __a, __b, __c, __d };
+}
+
+/* Create vectors with repeated elements */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
+///    of the four double-precision floating-point vector elements set to the
+///    specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set1_pd(double __w)
+{
+  return (__m256d){ __w, __w, __w, __w };
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
+///    of the eight single-precision floating-point vector elements set to the
+///    specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set1_ps(float __w)
+{
+  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
+///    32-bit integral vector elements set to the specified 32-bit integral
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __i
+///    A 32-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [8 x i32].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi32(int __i)
+{
+  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
+}
+
+/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
+///    16-bit integral vector elements set to the specified 16-bit integral
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __w
+///    A 16-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [16 x i16].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi16(short __w)
+{
+  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w };
+}
+
+/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
+///    8-bit integral vector elements set to the specified 8-bit integral value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __b
+///    An 8-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [32 x i8].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi8(char __b)
+{
+  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b };
+}
+
+/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
+///    64-bit integral vector elements set to the specified 64-bit integral
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __q
+///    A 64-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [4 x i64].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi64x(long long __q)
+{
+  return (__m256i)(__v4di){ __q, __q, __q, __q };
+}
+
+/* Create __zeroed vectors */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
+///    vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setzero_pd(void)
+{
+  return (__m256d){ 0, 0, 0, 0 };
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
+///    vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setzero_ps(void)
+{
+  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit integer vector initialized to zero.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setzero_si256(void)
+{
+  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
+}
+
+/* Cast between vector types */
+/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+///    floating-point vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+///    bitwise pattern as the parameter.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castpd_ps(__m256d __a)
+{
+  return (__m256)__a;
+}
+
+/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castpd_si256(__m256d __a)
+{
+  return (__m256i)__a;
+}
+
+/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+///    floating-point vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+///    bitwise pattern as the parameter.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castps_pd(__m256 __a)
+{
+  return (__m256d)__a;
+}
+
+/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castps_si256(__m256 __a)
+{
+  return (__m256i)__a;
+}
+
+/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
+///    of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+///    bitwise pattern as the parameter.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castsi256_ps(__m256i __a)
+{
+  return (__m256)__a;
+}
+
+/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
+///    of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+///    bitwise pattern as the parameter.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castsi256_pd(__m256i __a)
+{
+  return (__m256d)__a;
+}
+
+/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
+///    [4 x double] as a 128-bit floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the
+///    lower 128 bits of the parameter.
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm256_castpd256_pd128(__m256d __a)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
+}
+
+/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
+///    [8 x float] as a 128-bit floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the
+///    lower 128 bits of the parameter.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_castps256_ps128(__m256 __a)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
+}
+
+/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 128-bit integer vector containing the lower 128 bits of the
+///    parameter.
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_castsi256_si128(__m256i __a)
+{
+  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
+///    128-bit floating-point vector of [2 x double]. The lower 128 bits
+///    contain the value of the source vector. The contents of the upper 128
+///    bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+///    contain the value of the parameter. The contents of the upper 128 bits
+///    are undefined.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
+///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+///    the value of the source vector. The contents of the upper 128 bits are
+///    undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+///    contain the value of the parameter. The contents of the upper 128 bits
+///    are undefined.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
+///    The lower 128 bits contain the value of the source vector. The contents
+///    of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+///    the parameter. The contents of the upper 128 bits are undefined.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
+}
+
+/*
+   Vector insert.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
+///    a 256-bit vector of [8 x float] given in the first parameter, and then
+///    replacing either the upper or the lower 128 bits with the contents of a
+///    128-bit vector of [4 x float] in the second parameter. The immediate
+///    integer parameter determines between the upper or the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float]. This vector is copied to the result
+///    first, and then either the upper or the lower 128 bits of the result will
+///    be replaced by the contents of \a V2.
+/// \param V2
+///    A 128-bit vector of [4 x float]. The contents of this parameter are
+///    written to either the upper or the lower 128 bits of the result depending
+///    on the value of parameter \a M.
+/// \param M
+///    An immediate integer. The least significant bit determines how the values
+///    from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
+///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
+///    result. \n
+///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
+///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+///    result.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_shufflevector( \
+    (__v8sf)(__m256)(V1), \
+    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
+    (((M) & 1) ?  0 :  8), \
+    (((M) & 1) ?  1 :  9), \
+    (((M) & 1) ?  2 : 10), \
+    (((M) & 1) ?  3 : 11), \
+    (((M) & 1) ?  8 :  4), \
+    (((M) & 1) ?  9 :  5), \
+    (((M) & 1) ? 10 :  6), \
+    (((M) & 1) ? 11 :  7) );})
+
+/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
+///    a 256-bit vector of [4 x double] given in the first parameter, and then
+///    replacing either the upper or the lower 128 bits with the contents of a
+///    128-bit vector of [2 x double] in the second parameter. The immediate
+///    integer parameter determines between the upper or the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double]. This vector is copied to the result
+///    first, and then either the upper or the lower 128 bits of the result will
+///    be replaced by the contents of \a V2.
+/// \param V2
+///    A 128-bit vector of [2 x double]. The contents of this parameter are
+///    written to either the upper or the lower 128 bits of the result depending
+///    on the value of parameter \a M.
+/// \param M
+///    An immediate integer. The least significant bit determines how the values
+///    from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
+///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
+///    result. \n
+///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
+///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+///    result.
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector( \
+    (__v4df)(__m256d)(V1), \
+    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+/// \brief Constructs a new 256-bit integer vector by first duplicating a
+///    256-bit integer vector given in the first parameter, and then replacing
+///    either the upper or the lower 128 bits with the contents of a 128-bit
+///    integer vector in the second parameter. The immediate integer parameter
+///    determines between the upper or the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit integer vector. This vector is copied to the result first, and
+///    then either the upper or the lower 128 bits of the result will be
+///    replaced by the contents of \a V2.
+/// \param V2
+///    A 128-bit integer vector. The contents of this parameter are written to
+///    either the upper or the lower 128 bits of the result depending on the
+///     value of parameter \a M.
+/// \param M
+///    An immediate integer. The least significant bit determines how the values
+///    from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
+///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
+///    result. \n
+///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
+///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+///    result.
+/// \returns A 256-bit integer vector containing the interleaved values.
+#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector( \
+    (__v4di)(__m256i)(V1), \
+    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+/*
+   Vector extract.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
+///    of [8 x float], as determined by the immediate integer parameter, and
+///    returns the extracted bits as a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An immediate integer. The least significant bit determines which bits are
+///    extracted from the first parameter: \n
+///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+///    result. \n
+///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
+#define _mm256_extractf128_ps(V, M) __extension__ ({ \
+  (__m128)__builtin_shufflevector( \
+    (__v8sf)(__m256)(V), \
+    (__v8sf)(_mm256_undefined_ps()), \
+    (((M) & 1) ? 4 : 0), \
+    (((M) & 1) ? 5 : 1), \
+    (((M) & 1) ? 6 : 2), \
+    (((M) & 1) ? 7 : 3) );})
+
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
+///    of [4 x double], as determined by the immediate integer parameter, and
+///    returns the extracted bits as a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An immediate integer. The least significant bit determines which bits are
+///    extracted from the first parameter: \n
+///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+///    result. \n
+///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
+#define _mm256_extractf128_pd(V, M) __extension__ ({ \
+  (__m128d)__builtin_shufflevector( \
+    (__v4df)(__m256d)(V), \
+    (__v4df)(_mm256_undefined_pd()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
+///    integer vector, as determined by the immediate integer parameter, and
+///    returns the extracted bits as a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+///    A 256-bit integer vector.
+/// \param M
+///    An immediate integer. The least significant bit determines which bits are
+///    extracted from the first parameter:  \n
+///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+///    result. \n
+///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit integer vector containing the extracted bits.
+#define _mm256_extractf128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector( \
+    (__v4di)(__m256i)(V), \
+    (__v4di)(_mm256_undefined_si256()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+/* SIMD load ops (unaligned) */
+/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
+///    unaligned memory locations and constructs a 256-bit floating-point vector
+///    of [8 x float] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+///   <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location containing 4 consecutive
+///    single-precision floating-point values. These values are to be copied to
+///    bits[255:128] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location containing 4 consecutive
+///    single-precision floating-point values. These values are to be copied to
+///    bits[127:0] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+///    concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
+{
+  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
+  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
+}
+
+/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
+///    unaligned memory locations and constructs a 256-bit floating-point vector
+///    of [4 x double] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+///   <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location containing two consecutive
+///    double-precision floating-point values. These values are to be copied to
+///    bits[255:128] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location containing two consecutive
+///    double-precision floating-point values. These values are to be copied to
+///    bits[127:0] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+///    concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
+{
+  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
+  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
+}
+
+/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
+///    constructs a 256-bit integer vector by concatenating the two 128-bit
+///    vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+///   <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location containing a 128-bit integer
+///    vector. This vector is to be copied to bits[255:128] of the result. The
+///    address of the memory location does not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location containing a 128-bit integer
+///    vector. This vector is to be copied to bits[127:0] of the result. The
+///    address of the memory location does not have to be aligned.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
+{
+  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
+  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
+}
+
+/* SIMD store ops (unaligned) */
+/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
+///    vector of [8 x float] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+///   store instructions.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
+{
+  __m128 __v128;
+
+  __v128 = _mm256_castps256_ps128(__a);
+  _mm_storeu_ps(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_ps(__a, 1);
+  _mm_storeu_ps(__addr_hi, __v128);
+}
+
+/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
+///    vector of [4 x double] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+///   store instructions.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
+{
+  __m128d __v128;
+
+  __v128 = _mm256_castpd256_pd128(__a);
+  _mm_storeu_pd(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_pd(__a, 1);
+  _mm_storeu_pd(__addr_hi, __v128);
+}
+
+/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
+///    two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+///   store instructions.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __a
+///    A 256-bit integer vector.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
+{
+  __m128i __v128;
+
+  __v128 = _mm256_castsi256_si128(__a);
+  _mm_storeu_si128(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_si256(__a, 1);
+  _mm_storeu_si128(__addr_hi, __v128);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
+///    concatenating two 128-bit floating-point vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
+///    128 bits of the result.
+/// \param __lo
+///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+///    concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_m128 (__m128 __hi, __m128 __lo)
+{
+  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
+///    concatenating two 128-bit floating-point vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
+///    128 bits of the result.
+/// \param __lo
+///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+///    concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_m128d (__m128d __hi, __m128d __lo)
+{
+  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
+///    integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+///    A 128-bit integer vector to be copied to the upper 128 bits of the
+///    result.
+/// \param __lo
+///    A 128-bit integer vector to be copied to the lower 128 bits of the
+///    result.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_m128i (__m128i __hi, __m128i __lo)
+{
+  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
+///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
+///    similar to _mm256_set_m128, but the order of the input parameters is
+///    swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
+///    128 bits of the result.
+/// \param __hi
+///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+///    concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_m128 (__m128 __lo, __m128 __hi)
+{
+  return _mm256_set_m128(__hi, __lo);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
+///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
+///    similar to _mm256_set_m128d, but the order of the input parameters is
+///    swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
+///    128 bits of the result.
+/// \param __hi
+///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+///    concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_m128d (__m128d __lo, __m128d __hi)
+{
+  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
+///    integer vectors. This is similar to _mm256_set_m128i, but the order of
+///    the input parameters is swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+///    A 128-bit integer vector to be copied to the lower 128 bits of the
+///    result.
+/// \param __hi
+///    A 128-bit integer vector to be copied to the upper 128 bits of the
+///    result.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_m128i (__m128i __lo, __m128i __hi)
+{
+  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVXINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/bmi2intrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/bmi2intrin.h
new file mode 100644
index 000000000..fdae82cf2
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/bmi2intrin.h
@@ -0,0 +1,95 @@
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si(__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __BMI2INTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/bmiintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/bmiintrin.h
new file mode 100644
index 000000000..488eb2dbd
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/bmiintrin.h
@@ -0,0 +1,548 @@
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _tzcnt_u16(unsigned short a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param a
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _andn_u32(unsigned int a, unsigned int b);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param a
+///    An unsigned integer containing one of the operands.
+/// \param b
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+
+/* _bextr_u32 != __bextr_u32 */
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsi_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param a
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
+#define _blsi_u32(a)      (__blsi_u32((a)))
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsmsk_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param a
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+
+/// \brief Clears the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsr_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param a
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
+#define _blsr_u32(a)      (__blsr_u32((a)))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _tzcnt_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param a
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
+
+/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
+   instruction behaves as BSF on non-BMI targets, there is code that expects
+   to use it as a potentially faster version of BSF. */
+#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned short __RELAXED_FN_ATTRS
+__tzcnt_u16(unsigned short __X)
+{
+  return __X ? __builtin_ctzs(__X) : 16;
+}
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+///    An unsigned integer containing one of the operands.
+/// \param __Y
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
+///    specify the index of the least significant bit. Bits [15:8] specify the
+///    number of bits to be extracted.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+/// \brief Clears the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned int __RELAXED_FN_ATTRS
+__tzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An 32-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ int __RELAXED_FN_ATTRS
+_mm_tzcnt_32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+#ifdef __x86_64__
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param b
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+
+/* _bextr_u64 != __bextr_u64 */
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsi_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
+#define _blsi_u64(a)      (__blsi_u64((a)))
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsmsk_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns A unsigned 64-bit integer containing the newly created mask.
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+
+/// \brief Clears the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsr_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
+#define _blsr_u64(a)      (__blsr_u64((a)))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _tzcnt_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param __Y
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
+///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
+///    the number of bits to be extracted.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///     in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns A unsigned 64-bit integer containing the newly created mask.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+/// \brief Clears the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned long long __RELAXED_FN_ATTRS
+__tzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An 64-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ long long __RELAXED_FN_ATTRS
+_mm_tzcnt_64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+#undef __RELAXED_FN_ATTRS
+
+#endif /* __BMIINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/clflushoptintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/clflushoptintrin.h
new file mode 100644
index 000000000..60e0ead76
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/clflushoptintrin.h
@@ -0,0 +1,41 @@
+/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLFLUSHOPTINTRIN_H
+#define __CLFLUSHOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clflushopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clflushopt(char * __m) {
+  __builtin_ia32_clflushopt(__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cpuid.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cpuid.h
new file mode 100644
index 000000000..400dcfacd
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cpuid.h
@@ -0,0 +1,215 @@
+/*===---- cpuid.h - X86 cpu model detection --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !(__x86_64__ || __i386__)
+#error this header is for x86 only
+#endif
+
+/* Responses identification request with %eax 0 */
+/* AMD:     "AuthenticAMD" */
+#define signature_AMD_ebx 0x68747541
+#define signature_AMD_edx 0x69746e65
+#define signature_AMD_ecx 0x444d4163
+/* CENTAUR: "CentaurHauls" */
+#define signature_CENTAUR_ebx 0x746e6543
+#define signature_CENTAUR_edx 0x48727561
+#define signature_CENTAUR_ecx 0x736c7561
+/* CYRIX:   "CyrixInstead" */
+#define signature_CYRIX_ebx 0x69727943
+#define signature_CYRIX_edx 0x736e4978
+#define signature_CYRIX_ecx 0x64616574
+/* INTEL:   "GenuineIntel" */
+#define signature_INTEL_ebx 0x756e6547
+#define signature_INTEL_edx 0x49656e69
+#define signature_INTEL_ecx 0x6c65746e
+/* TM1:     "TransmetaCPU" */
+#define signature_TM1_ebx 0x6e617254
+#define signature_TM1_edx 0x74656d73
+#define signature_TM1_ecx 0x55504361
+/* TM2:     "GenuineTMx86" */
+#define signature_TM2_ebx 0x756e6547
+#define signature_TM2_edx 0x54656e69
+#define signature_TM2_ecx 0x3638784d
+/* NSC:     "Geode by NSC" */
+#define signature_NSC_ebx 0x646f6547
+#define signature_NSC_edx 0x43534e20
+#define signature_NSC_ecx 0x79622065
+/* NEXGEN:  "NexGenDriven" */
+#define signature_NEXGEN_ebx 0x4778654e
+#define signature_NEXGEN_edx 0x72446e65
+#define signature_NEXGEN_ecx 0x6e657669
+/* RISE:    "RiseRiseRise" */
+#define signature_RISE_ebx 0x65736952
+#define signature_RISE_edx 0x65736952
+#define signature_RISE_ecx 0x65736952
+/* SIS:     "SiS SiS SiS " */
+#define signature_SIS_ebx 0x20536953
+#define signature_SIS_edx 0x20536953
+#define signature_SIS_ecx 0x20536953
+/* UMC:     "UMC UMC UMC " */
+#define signature_UMC_ebx 0x20434d55
+#define signature_UMC_edx 0x20434d55
+#define signature_UMC_ecx 0x20434d55
+/* VIA:     "VIA VIA VIA " */
+#define signature_VIA_ebx 0x20414956
+#define signature_VIA_edx 0x20414956
+#define signature_VIA_ecx 0x20414956
+/* VORTEX:  "Vortex86 SoC" */
+#define signature_VORTEX_ebx 0x74726f56
+#define signature_VORTEX_edx 0x36387865
+#define signature_VORTEX_ecx 0x436f5320
+
+/* Features in %ecx for level 1 */
+#define bit_SSE3        0x00000001
+#define bit_PCLMULQDQ   0x00000002
+#define bit_PCLMUL      bit_PCLMULQDQ   /* for gcc compat */
+#define bit_DTES64      0x00000004
+#define bit_MONITOR     0x00000008
+#define bit_DSCPL       0x00000010
+#define bit_VMX         0x00000020
+#define bit_SMX         0x00000040
+#define bit_EIST        0x00000080
+#define bit_TM2         0x00000100
+#define bit_SSSE3       0x00000200
+#define bit_CNXTID      0x00000400
+#define bit_FMA         0x00001000
+#define bit_CMPXCHG16B  0x00002000
+#define bit_xTPR        0x00004000
+#define bit_PDCM        0x00008000
+#define bit_PCID        0x00020000
+#define bit_DCA         0x00040000
+#define bit_SSE41       0x00080000
+#define bit_SSE4_1      bit_SSE41       /* for gcc compat */
+#define bit_SSE42       0x00100000
+#define bit_SSE4_2      bit_SSE42       /* for gcc compat */
+#define bit_x2APIC      0x00200000
+#define bit_MOVBE       0x00400000
+#define bit_POPCNT      0x00800000
+#define bit_TSCDeadline 0x01000000
+#define bit_AESNI       0x02000000
+#define bit_AES         bit_AESNI       /* for gcc compat */
+#define bit_XSAVE       0x04000000
+#define bit_OSXSAVE     0x08000000
+#define bit_AVX         0x10000000
+#define bit_F16C        0x20000000
+#define bit_RDRND       0x40000000
+
+/* Features in %edx for level 1 */
+#define bit_FPU         0x00000001
+#define bit_VME         0x00000002
+#define bit_DE          0x00000004
+#define bit_PSE         0x00000008
+#define bit_TSC         0x00000010
+#define bit_MSR         0x00000020
+#define bit_PAE         0x00000040
+#define bit_MCE         0x00000080
+#define bit_CX8         0x00000100
+#define bit_CMPXCHG8B   bit_CX8         /* for gcc compat */
+#define bit_APIC        0x00000200
+#define bit_SEP         0x00000800
+#define bit_MTRR        0x00001000
+#define bit_PGE         0x00002000
+#define bit_MCA         0x00004000
+#define bit_CMOV        0x00008000
+#define bit_PAT         0x00010000
+#define bit_PSE36       0x00020000
+#define bit_PSN         0x00040000
+#define bit_CLFSH       0x00080000
+#define bit_DS          0x00200000
+#define bit_ACPI        0x00400000
+#define bit_MMX         0x00800000
+#define bit_FXSR        0x01000000
+#define bit_FXSAVE      bit_FXSR        /* for gcc compat */
+#define bit_SSE         0x02000000
+#define bit_SSE2        0x04000000
+#define bit_SS          0x08000000
+#define bit_HTT         0x10000000
+#define bit_TM          0x20000000
+#define bit_PBE         0x80000000
+
+/* Features in %ebx for level 7 sub-leaf 0 */
+#define bit_FSGSBASE    0x00000001
+#define bit_SMEP        0x00000080
+#define bit_ENH_MOVSB   0x00000200
+
+#if __i386__
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level), "2"(__count))
+#else
+/* x86-64 uses %rbx as the base register, so preserve it. */
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level), "2"(__count))
+#endif
+
+static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx) {
+    __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+#if __i386__
+    int __cpuid_supported;
+
+    __asm("  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   %%eax,%%ecx\n"
+          "  xorl   $0x00200000,%%eax\n"
+          "  pushl  %%eax\n"
+          "  popfl\n"
+          "  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   $0,%0\n"
+          "  cmpl   %%eax,%%ecx\n"
+          "  je     1f\n"
+          "  movl   $1,%0\n"
+          "1:"
+        : "=r" (__cpuid_supported) : : "eax", "ecx");
+    if (!__cpuid_supported)
+        return 0;
+#endif
+
+    __cpuid(__level, __eax, __ebx, __ecx, __edx);
+    if (__sig)
+        *__sig = __ebx;
+    return __eax;
+}
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cuda_wrappers/algorithm b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cuda_wrappers/algorithm
new file mode 100644
index 000000000..95d9beb73
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cuda_wrappers/algorithm
@@ -0,0 +1,96 @@
+/*===---- complex - CUDA wrapper for <algorithm> ----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
+#define __CLANG_CUDA_WRAPPERS_ALGORITHM
+
+// This header defines __device__ overloads of std::min/max, but only if we're
+// <= C++11.  In C++14, these functions are constexpr, and so are implicitly
+// __host__ __device__.
+//
+// We don't support the initializer_list overloads because
+// initializer_list::begin() and end() are not __host__ __device__ functions.
+//
+// When compiling in C++14 mode, we could force std::min/max to have different
+// implementations for host and device, by declaring the device overloads
+// before the constexpr overloads appear.  We choose not to do this because
+
+//  a) why write our own implementation when we can use one from the standard
+//     library? and
+//  b) libstdc++ is evil and declares min/max inside a header that is included
+//     *before* we include <algorithm>.  So we'd have to unconditionally
+//     declare our __device__ overloads of min/max, but that would pollute
+//     things for people who choose not to include <algorithm>.
+
+#include_next <algorithm>
+
+#if __cplusplus <= 201103L
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__a, __b) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b) {
+  return __a < __b ? __b : __a;
+}
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__b, __a) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b) {
+  return __a < __b ? __b : __a;
+}
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#endif // __cplusplus <= 201103L
+#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cuda_wrappers/complex b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cuda_wrappers/complex
new file mode 100644
index 000000000..11d40a82a
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cuda_wrappers/complex
@@ -0,0 +1,82 @@
+/*===---- complex - CUDA wrapper for <complex> ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
+#define __CLANG_CUDA_WRAPPERS_COMPLEX
+
+// Wrapper around <complex> that forces its functions to be __host__
+// __device__.
+
+// First, include host-only headers we think are likely to be included by
+// <complex>, so that the pragma below only applies to <complex> itself.
+#if __cplusplus >= 201103L
+#include <type_traits>
+#endif
+#include <stdexcept>
+#include <cmath>
+#include <sstream>
+
+// Next, include our <algorithm> wrapper, to ensure that device overloads of
+// std::min/max are available.
+#include <algorithm>
+
+#pragma clang force_cuda_host_device begin
+
+// When compiling for device, ask libstdc++ to use its own implements of
+// complex functions, rather than calling builtins (which resolve to library
+// functions that don't exist when compiling CUDA device code).
+//
+// This is a little dicey, because it causes libstdc++ to define a different
+// set of overloads on host and device.
+//
+//   // Present only when compiling for host.
+//   __host__ __device__ void complex<float> sin(const complex<float>& x) {
+//     return __builtin_csinf(x);
+//   }
+//
+//   // Present when compiling for host and for device.
+//   template <typename T>
+//   void __host__ __device__ complex<T> sin(const complex<T>& x) {
+//     return complex<T>(sin(x.real()) * cosh(x.imag()),
+//                       cos(x.real()), sinh(x.imag()));
+//   }
+//
+// This is safe because when compiling for device, all function calls in
+// __host__ code to sin() will still resolve to *something*, even if they don't
+// resolve to the same function as they resolve to when compiling for host.  We
+// don't care that they don't resolve to the right function because we won't
+// codegen this host code when compiling for device.
+
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#define _GLIBCXX_USE_C99_COMPLEX 0
+#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
+
+#include_next <complex>
+
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
+
+#pragma clang force_cuda_host_device end
+
+#endif // include guard
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cuda_wrappers/new b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cuda_wrappers/new
new file mode 100644
index 000000000..b77131af0
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/cuda_wrappers/new
@@ -0,0 +1,47 @@
+/*===---- complex - CUDA wrapper for <new> ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_NEW
+#define __CLANG_CUDA_WRAPPERS_NEW
+
+#include_next <new>
+
+// Device overrides for placement new and delete.
+#pragma push_macro("CUDA_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define CUDA_NOEXCEPT noexcept
+#else
+#define CUDA_NOEXCEPT
+#endif
+
+__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
+__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
+#pragma pop_macro("CUDA_NOEXCEPT")
+
+#endif // include guard
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/emmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/emmintrin.h
new file mode 100644
index 000000000..1512f9f0b
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/emmintrin.h
@@ -0,0 +1,4791 @@
+/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __EMMINTRIN_H
+#define __EMMINTRIN_H
+
+#include <xmmintrin.h>
+
+typedef double __m128d __attribute__((__vector_size__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16)));
+
+/* Type defines.  */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+/* Unsigned types */
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
+typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v16qs __attribute__((__vector_size__(16)));
+
+#include <f16cintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+
+/// \brief Adds lower double-precision values in both operands and returns the
+///    sum in the lower 64 bits of the result. The upper 64 bits of the result
+///    are copied from the upper double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
+///    from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_sd(__m128d __a, __m128d __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+/// \brief Adds two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the sums of both
+///    operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2df)__a + (__v2df)__b);
+}
+
+/// \brief Subtracts the lower double-precision value of the second operand
+///    from the lower double-precision value of the first operand and returns
+///    the difference in the lower 64 bits of the result. The upper 64 bits of
+///    the result are copied from the upper double-precision value of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    difference of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_sd(__m128d __a, __m128d __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+/// \brief Subtracts two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the differences between
+///    both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2df)__a - (__v2df)__b);
+}
+
+/// \brief Multiplies lower double-precision values in both operands and returns
+///    the product in the lower 64 bits of the result. The upper 64 bits of the
+///    result are copied from the upper double-precision value of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    product of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_sd(__m128d __a, __m128d __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+/// \brief Multiplies two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the products of both
+///    operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2df)__a * (__v2df)__b);
+}
+
+/// \brief Divides the lower double-precision value of the first operand by the
+///    lower double-precision value of the second operand and returns the
+///    quotient in the lower 64 bits of the result. The upper 64 bits of the
+///    result are copied from the upper double-precision value of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing divisor.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    quotient of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_sd(__m128d __a, __m128d __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+/// \brief Performs an element-by-element division of two 128-bit vectors of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] containing the quotients of both
+///    operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2df)__a / (__v2df)__b);
+}
+
+/// \brief Calculates the square root of the lower double-precision value of
+///    the second operand and returns it in the lower 64 bits of the result.
+///    The upper 64 bits of the result are copied from the upper double-
+///    precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    upper 64 bits of this operand are copied to the upper 64 bits of the
+///    result.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    square root is calculated using the lower 64 bits of this operand.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    square root of the lower 64 bits of operand \a __b, and whose upper 64
+///    bits are copied from the upper 64 bits of operand \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Calculates the square root of the each of two values stored in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [2 x double] containing the square roots of the
+///    values in the operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_pd(__m128d __a)
+{
+  return __builtin_ia32_sqrtpd((__v2df)__a);
+}
+
+/// \brief Compares lower 64-bit double-precision values of both operands, and
+///    returns the lesser of the pair of values in the lower 64-bits of the
+///    result. The upper 64 bits of the result are copied from the upper double-
+///    precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    minimum value between both operands. The upper 64 bits are copied from
+///    the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Performs element-by-element comparison of the two 128-bit vectors of
+///    [2 x double] and returns the vector containing the lesser of each pair of
+///    values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the minimum values
+///    between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares lower 64-bits double-precision values of both operands, and
+///    returns the greater of the pair of values in the lower 64-bits of the
+///    result. The upper 64 bits of the result are copied from the upper double-
+///    precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    maximum value between both operands. The upper 64 bits are copied from
+///    the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Performs element-by-element comparison of the two 128-bit vectors of
+///    [2 x double] and returns the vector containing the greater of each pair
+///    of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the maximum values
+///    between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+///    values between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_and_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2du)__a & (__v2du)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using
+///    the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+///    values in the second operand and the one's complement of the first
+///    operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_andnot_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)(~(__v2du)__a & (__v2du)__b);
+}
+
+/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
+///    values between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_or_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2du)__a | (__v2du)__b);
+}
+
+/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
+///    values between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_xor_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2du)__a ^ (__v2du)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] for equality. Each comparison yields 0h
+///    for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are less than those in the second operand. Each comparison
+///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are less than or equal to those in the second operand. Each
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are greater than those in the second operand. Each comparison
+///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are greater than or equal to those in the second operand. Each
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are ordered with respect to those in the second operand. A pair
+///    of double-precision values are "ordered" with respect to each other if
+///    neither value is a NaN. Each comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are unordered with respect to those in the second operand. A pair
+///    of double-precision values are "unordered" with respect to each other if
+///    one or both values are NaN. Each comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are unequal to those in the second operand. Each comparison
+///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not less than those in the second operand. Each comparison
+///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not less than or equal to those in the second operand. Each
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not greater than those in the second operand. Each
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not greater than or equal to those in the second operand.
+///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] for equality. The
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than the corresponding value in
+///    the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0h for
+///    false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than the corresponding value
+///    in the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __b.
+/// \param __b
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0h for
+///    false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is "ordered" with respect to the
+///    corresponding value in the second parameter. The comparison yields 0h for
+///    false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
+///    "ordered" with respect to each other if neither value is a NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is "unordered" with respect to the
+///    corresponding value in the second parameter. The comparison yields 0h
+///    for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
+///    are "unordered" with respect to each other if one or both values are NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is unequal to the corresponding value in
+///    the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not less than the corresponding
+///    value in the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not less than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0h
+///    for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns  A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not greater than the corresponding
+///    value in the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not greater than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0h
+///    for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] for equality. The
+///    comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than the corresponding value in
+///    the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0 for
+///    false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than the corresponding value
+///    in the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0 for
+///    false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is unequal to the corresponding value in
+///    the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] for equality. The
+///    comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than the corresponding value in
+///    the second parameter. The comparison yields 0 for false, 1 for true. If
+///    either of the two lower double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0 for
+///    false, 1 for true. If either of the two lower double-precision values is
+///    NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower double-precision values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than the corresponding value
+///    in the second parameter. The comparison yields 0 for false, 1 for true.
+///    If either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0 for
+///    false, 1 for true.  If either of the two lower double-precision values
+///    is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is unequal to the corresponding value in
+///    the second parameter. The comparison yields 0 for false, 1 for true. If
+///    either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison result. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two single-precision floating-point
+///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
+///    The upper 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted values. The upper 64 bits are set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpd_ps(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2ps((__v2df)__a);
+}
+
+/// \brief Converts the lower two single-precision floating-point elements of a
+///    128-bit vector of [4 x float] into two double-precision floating-point
+///    values, returned in a 128-bit vector of [2 x double]. The upper two
+///    elements of the input vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower two single-precision
+///    floating-point elements are converted to double-precision values. The
+///    upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtps_pd(__m128 __a)
+{
+  return (__m128d) __builtin_convertvector(
+      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
+}
+
+/// \brief Converts the lower two integer elements of a 128-bit vector of
+///    [4 x i32] into two double-precision floating-point values, returned in a
+///    128-bit vector of [2 x double]. The upper two elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
+///    converted to double-precision values. The upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi32_pd(__m128i __a)
+{
+  return (__m128d) __builtin_convertvector(
+      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
+///    64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+///    converted values. The upper 64 bits are set to zero.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi32(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2dq((__v2df)__a);
+}
+
+/// \brief Converts the low-order element of a 128-bit vector of [2 x double]
+///    into a 32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 32-bit signed integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsd_si32(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si((__v2df)__a);
+}
+
+/// \brief Converts the lower double-precision floating-point element of a
+///    128-bit vector of [2 x double], in the second parameter, into a
+///    single-precision floating-point value, returned in the lower 32 bits of a
+///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
+///    copied from the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
+///    copied to the upper 96 bits of the result.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision
+///    floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
+///    converted value from the second parameter. The upper 96 bits are copied
+///    from the upper 96 bits of the first parameter.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsd_ss(__m128 __a, __m128d __b)
+{
+  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
+}
+
+/// \brief Converts a 32-bit signed integer value, in the second parameter, into
+///    a double-precision floating-point value, returned in the lower 64 bits of
+///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+///    are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+///    copied to the upper 64 bits of the result.
+/// \param __b
+///    A 32-bit signed integer containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+///    converted value from the second parameter. The upper 64 bits are copied
+///    from the upper 64 bits of the first parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi32_sd(__m128d __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+/// \brief Converts the lower single-precision floating-point element of a
+///    128-bit vector of [4 x float], in the second parameter, into a
+///    double-precision floating-point value, returned in the lower 64 bits of
+///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+///    are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+///    copied to the upper 64 bits of the result.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower single-precision
+///    floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+///    converted value from the second parameter. The upper 64 bits are copied
+///    from the upper 64 bits of the first parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtss_sd(__m128d __a, __m128 __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
+///    result of either conversion is inexact, the result is truncated (rounded
+///    towards zero) regardless of the current MXCSR setting. The upper 64 bits
+///    of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+///    converted values. The upper 64 bits are set to zero.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi32(__m128d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
+}
+
+/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit
+///    signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 32-bit signed integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttsd_si32(__m128d __a)
+{
+  return __builtin_ia32_cvttsd2si((__v2df)__a);
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in a 64-bit vector of [2 x i32]. If the result of either
+///    conversion is inexact, the result is truncated (rounded towards zero)
+///    regardless of the current MXCSR setting.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvttpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
+}
+
+/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of
+///    [2 x i32] into two double-precision floating-point values, returned in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32].
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtpi32_pd(__m64 __a)
+{
+  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+}
+
+/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as
+///    a double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
+/// \returns A double-precision floating-point value copied from the lower 64
+///    bits of \a __a.
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm_cvtsd_f64(__m128d __a)
+{
+  return __a[0];
+}
+
+/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 16-byte aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_pd(double const *__dp)
+{
+  return *(__m128d*)__dp;
+}
+
+/// \brief Loads a double-precision floating-point value from a specified memory
+///    location and duplicates it to both vector elements of a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
+///
+/// \param __dp
+///    A pointer to a memory location containing a double-precision value.
+/// \returns A 128-bit vector of [2 x double] containing the loaded and
+///    duplicated values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load1_pd(double const *__dp)
+{
+  struct __mm_load1_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __u };
+}
+
+#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
+
+/// \brief Loads two double-precision values, in reverse order, from an aligned
+///    memory location into a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
+/// needed shuffling instructions. In AVX mode, the shuffling may be combined
+/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
+///
+/// \param __dp
+///    A 16-byte aligned pointer to an array of double-precision values to be
+///    loaded in reverse order.
+/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
+///    values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadr_pd(double const *__dp)
+{
+  __m128d __u = *(__m128d*)__dp;
+  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
+}
+
+/// \brief Loads a 128-bit floating-point vector of [2 x double] from an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadu_pd(double const *__dp)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__dp)->__v;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si64(void const *__a)
+{
+  struct __loadu_si64 {
+    long long __v;
+  } __attribute__((__packed__, __may_alias__));
+  long long __u = ((struct __loadu_si64*)__a)->__v;
+  return (__m128i){__u, 0L};
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_sd(double const *__dp)
+{
+  struct __mm_load_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
+  return (__m128d){ __u, 0 };
+}
+
+/// \brief Loads a double-precision value into the high-order bits of a 128-bit
+///    vector of [2 x double]. The low-order bits are copied from the low-order
+///    bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [63:0] are written to bits [63:0] of the result.
+/// \param __dp
+///    A pointer to a 64-bit memory location containing a double-precision
+///    floating-point value that is loaded. The loaded value is written to bits
+///    [127:64] of the result. The address of the memory location does not have
+///    to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadh_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
+  return (__m128d){ __a[0], __u };
+}
+
+/// \brief Loads a double-precision value into the low-order bits of a 128-bit
+///    vector of [2 x double]. The high-order bits are copied from the
+///    high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [127:64] are written to bits [127:64] of the result.
+/// \param __dp
+///    A pointer to a 64-bit memory location containing a double-precision
+///    floating-point value that is loaded. The loaded value is written to bits
+///    [63:0] of the result. The address of the memory location does not have to
+///    be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadl_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadl_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __a[1] };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] with
+///    unspecified content. This could be used as an argument to another
+///    intrinsic function where the argument is required but the value is not
+///    actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
+///    content.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_undefined_pd(void)
+{
+  return (__m128d)__builtin_ia32_undef128();
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
+///    64 bits of the vector are initialized with the specified double-precision
+///    floating-point value. The upper 64 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize the lower 64
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
+///    lower 64 bits contain the value of the parameter. The upper 64 bits are
+///    set to zero.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_sd(double __w)
+{
+  return (__m128d){ __w, 0 };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
+///    of the two double-precision floating-point vector elements set to the
+///    specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set1_pd(double __w)
+{
+  return (__m128d){ __w, __w };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]
+///    initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize the upper 64
+///    bits of the result.
+/// \param __x
+///    A double-precision floating-point value used to initialize the lower 64
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_pd(double __w, double __x)
+{
+  return (__m128d){ __x, __w };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double],
+///    initialized in reverse order with the specified double-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize the lower 64
+///    bits of the result.
+/// \param __x
+///    A double-precision floating-point value used to initialize the upper 64
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setr_pd(double __w, double __x)
+{
+  return (__m128d){ __w, __x };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]
+///    initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [2 x double] with
+///    all elements set to zero.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setzero_pd(void)
+{
+  return (__m128d){ 0, 0 };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
+///    64 bits are set to the lower 64 bits of the second parameter. The upper
+///    64 bits are set to the upper 64 bits of the first parameter.
+//
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
+///    upper 64 bits of the result.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
+///    lower 64 bits of the result.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_move_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d){ __b[0], __a[1] };
+}
+
+/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_sd(double *__dp, __m128d __a)
+{
+  struct __mm_store_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_pd(double *__dp, __m128d __a)
+{
+  *(__m128d*)__dp = __a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_pd(double *__dp, __m128d __a)
+{
+  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+  _mm_store_pd(__dp, __a);
+}
+
+/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
+///    location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 16-byte aligned.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_pd1(double *__dp, __m128d __a)
+{
+  return _mm_store1_pd(__dp, __a);
+}
+
+/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory
+///    location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_pd(double *__dp, __m128d __a)
+{
+  struct __storeu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_pd*)__dp)->__v = __a;
+}
+
+/// \brief Stores two double-precision values, in reverse order, from a 128-bit
+///    vector of [2 x double] to a 16-byte aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to a shuffling instruction followed by a
+/// <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 16-byte aligned memory location that can store two
+///    double-precision values.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be reversed and
+///    stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_pd(double *__dp, __m128d __a)
+{
+  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
+  *(__m128d *)__dp = __a;
+}
+
+/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
+}
+
+/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
+}
+
+/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
+///    saving the lower 8 bits of each sum in the corresponding element of a
+///    128-bit result vector of [16 x i8]. The integer elements of both
+///    parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the sums of both
+///    parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qu)__a + (__v16qu)__b);
+}
+
+/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
+///    saving the lower 16 bits of each sum in the corresponding element of a
+///    128-bit result vector of [8 x i16]. The integer elements of both
+///    parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+/// \returns A 128-bit vector of [8 x i16] containing the sums of both
+///    parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hu)__a + (__v8hu)__b);
+}
+
+/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
+///    saving the lower 32 bits of each sum in the corresponding element of a
+///    128-bit result vector of [4 x i32]. The integer elements of both
+///    parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+/// \param __b
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the sums of both
+///    parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4su)__a + (__v4su)__b);
+}
+
+/// \brief Adds two signed or unsigned 64-bit integer values, returning the
+///    lower 64 bits of the sum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer.
+/// \param __b
+///    A 64-bit integer.
+/// \returns A 64-bit integer containing the sum of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_si64(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
+}
+
+/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
+///    saving the lower 64 bits of each sum in the corresponding element of a
+///    128-bit result vector of [2 x i64]. The integer elements of both
+///    parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64].
+/// \param __b
+///    A 128-bit vector of [2 x i64].
+/// \returns A 128-bit vector of [2 x i64] containing the sums of both
+///    parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a + (__v2du)__b);
+}
+
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+///    signed [16 x i8] vectors, saving each sum in the corresponding element of
+///    a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are
+///    saturated to 7Fh. Negative sums less than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [16 x i8] vector.
+/// \param __b
+///    A 128-bit signed [16 x i8] vector.
+/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
+///    both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+///    signed [8 x i16] vectors, saving each sum in the corresponding element of
+///    a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh
+///    are saturated to 7FFFh. Negative sums less than 8000h are saturated to
+///    8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
+///    both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
+///    of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh
+///    are saturated to FFh. Negative sums are saturated to 00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
+///    of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
+///    of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh
+///    are saturated to FFFFh. Negative sums are saturated to 0000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+///    A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
+///    of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Computes the rounded avarages of corresponding elements of two
+///    128-bit unsigned [16 x i8] vectors, saving each result in the
+///    corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
+///    averages of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Computes the rounded avarages of corresponding elements of two
+///    128-bit unsigned [8 x i16] vectors, saving each result in the
+///    corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+///    A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
+///    averages of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
+///    vectors, producing eight intermediate 32-bit signed integer products, and
+///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
+///    [4 x i32] vector. For example, bits [15:0] of both parameters are
+///    multiplied producing a 32-bit product, bits [31:16] of both parameters
+///    are multiplied producing a 32-bit product, and the sum of those two
+///    products becomes bits [31:0] of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
+///    of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
+///    vectors, saving the greater value from each comparison in the
+///    corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
+///    each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
+///    vectors, saving the greater value from each comparison in the
+///    corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
+///    each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
+///    vectors, saving the smaller value from each comparison in the
+///    corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
+///    each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
+///    vectors, saving the smaller value from each comparison in the
+///    corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
+///    each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Multiplies the corresponding elements of two signed [8 x i16]
+///    vectors, saving the upper 16 bits of each 32-bit product in the
+///    corresponding element of a 128-bit signed [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
+///    each of the eight 32-bit products.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Multiplies the corresponding elements of two unsigned [8 x i16]
+///    vectors, saving the upper 16 bits of each 32-bit product in the
+///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+///    A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
+///    of each of the eight 32-bit products.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Multiplies the corresponding elements of two signed [8 x i16]
+///    vectors, saving the lower 16 bits of each 32-bit product in the
+///    corresponding element of a 128-bit signed [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
+///    each of the eight 32-bit products.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hu)__a * (__v8hu)__b);
+}
+
+/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
+///    of the two 64-bit integer vectors and returns the 64-bit unsigned
+///    product.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer containing one of the source operands.
+/// \param __b
+///    A 64-bit integer containing one of the source operands.
+/// \returns A 64-bit integer vector containing the product of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mul_su32(__m64 __a, __m64 __b)
+{
+  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+}
+
+/// \brief Multiplies 32-bit unsigned integer values contained in the lower
+///    bits of the corresponding elements of two [2 x i64] vectors, and returns
+///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
+///
+/// \param __a
+///    A [2 x i64] vector containing one of the source operands.
+/// \param __b
+///    A [2 x i64] vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the product of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epu32(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief Computes the absolute differences of corresponding 8-bit integer
+///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
+///    separately sums the second 8 absolute differences. Packss these two
+///    unsigned 16-bit integer sums into the upper and lower elements of a
+///    [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the sums of the sets of absolute
+///    differences between both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sad_epu8(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Subtracts the corresponding 8-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qu)__a - (__v16qu)__b);
+}
+
+/// \brief Subtracts the corresponding 16-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hu)__a - (__v8hu)__b);
+}
+
+/// \brief Subtracts the corresponding 32-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4su)__a - (__v4su)__b);
+}
+
+/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
+///    difference to the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the minuend.
+/// \param __b
+///    A 64-bit integer vector containing the subtrahend.
+/// \returns A 64-bit integer vector containing the difference of the values in
+///    the operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_si64(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
+}
+
+/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a - (__v2du)__b);
+}
+
+/// \brief Subtracts corresponding 8-bit signed integer values in the input and
+///    returns the differences in the corresponding bytes in the destination.
+///    Differences greater than 7Fh are saturated to 7Fh, and differences less
+///    than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Subtracts corresponding 16-bit signed integer values in the input and
+///    returns the differences in the corresponding bytes in the destination.
+///    Differences greater than 7FFFh are saturated to 7FFFh, and values less
+///    than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
+///    and returns the differences in the corresponding bytes in the
+///    destination. Differences less than 00h are saturated to 00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+///    differences of the values in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
+///    and returns the differences in the corresponding bytes in the
+///    destination. Differences less than 0000h are saturated to 0000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+///    differences of the values in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise AND of the values
+///    in both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_and_si128(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a & (__v2du)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
+///    one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing the left source operand. The one's complement
+///    of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector containing the right source operand.
+/// \returns A 128-bit integer vector containing the bitwise AND of the one's
+///    complement of the first operand and the values in the second operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_andnot_si128(__m128i __a, __m128i __b)
+{
+  return (__m128i)(~(__v2du)__a & (__v2du)__b);
+}
+/// \brief Performs a bitwise OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise OR of the values
+///    in both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_or_si128(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a | (__v2du)__b);
+}
+
+/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
+///    values in both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_xor_si128(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a ^ (__v2du)__b);
+}
+
+/// \brief Left-shifts the 128-bit integer vector operand by the specified
+///    number of bytes. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_slli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to left-shift operand
+///    \a a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
+#define _mm_slli_si128(a, imm) __extension__ ({                              \
+  (__m128i)__builtin_shufflevector(                                          \
+                                 (__v16qi)_mm_setzero_si128(),               \
+                                 (__v16qi)(__m128i)(a),                      \
+                                 ((char)(imm)&0xF0) ?  0 : 16 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  1 : 17 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  2 : 18 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  3 : 19 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  4 : 20 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  5 : 21 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  6 : 22 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  7 : 23 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  8 : 24 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  9 : 25 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
+
+#define _mm_bslli_si128(a, imm) \
+  _mm_slli_si128((a), (imm))
+
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
+}
+
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
+}
+
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
+}
+
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
+}
+
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psllqi128((__v2di)__a, __count);
+}
+
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
+}
+
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
+}
+
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
+}
+
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
+}
+
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
+}
+
+/// \brief Right-shifts the 128-bit integer vector operand by the specified
+///    number of bytes. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_srli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to right-shift operand
+///    \a a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
+#define _mm_srli_si128(a, imm) __extension__ ({                              \
+  (__m128i)__builtin_shufflevector(                                          \
+                                 (__v16qi)(__m128i)(a),                      \
+                                 (__v16qi)_mm_setzero_si128(),               \
+                                 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
+                                 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
+                                 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
+                                 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
+                                 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
+                                 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
+                                 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
+                                 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
+                                 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
+                                 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
+                                 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
+                                 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
+                                 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
+                                 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
+                                 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
+                                 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
+
+#define _mm_bsrli_si128(a, imm) \
+  _mm_srli_si128((a), (imm))
+
+/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
+}
+
+/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
+}
+
+/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
+}
+
+/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
+}
+
+/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
+}
+
+/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
+}
+
+/// \brief Compares each of the corresponding 8-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false, FFh
+///    for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a == (__v16qi)__b);
+}
+
+/// \brief Compares each of the corresponding 16-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
+///    for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a == (__v8hi)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false,
+///    FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a == (__v4si)__b);
+}
+
+/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are
+///    greater than those in the second operand. Each comparison yields 0h for
+///    false, FFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi8(__m128i __a, __m128i __b)
+{
+  /* This function always performs a signed comparison, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)((__v16qs)__a > (__v16qs)__b);
+}
+
+/// \brief Compares each of the corresponding signed 16-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are greater than those in the second operand. Each comparison yields 0h
+///    for false, FFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a > (__v8hi)__b);
+}
+
+/// \brief Compares each of the corresponding signed 32-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are greater than those in the second operand. Each comparison yields 0h
+///    for false, FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a > (__v4si)__b);
+}
+
+/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are less
+///    than those in the second operand. Each comparison yields 0h for false,
+///    FFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi8(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi8(__b, __a);
+}
+
+/// \brief Compares each of the corresponding signed 16-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are less than those in the second operand. Each comparison yields 0h for
+///    false, FFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi16(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi16(__b, __a);
+}
+
+/// \brief Compares each of the corresponding signed 32-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are less than those in the second operand. Each comparison yields 0h for
+///    false, FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi32(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi32(__b, __a);
+}
+
+#ifdef __x86_64__
+/// \brief Converts a 64-bit signed integer value from the second operand into a
+///    double-precision value and returns it in the lower element of a [2 x
+///    double] vector; the upper element of the returned vector is copied from
+///    the upper element of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
+///    copied to the upper 64 bits of the destination.
+/// \param __b
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    converted value of the second operand. The upper 64 bits are copied from
+///    the upper 64 bits of the first operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi64_sd(__m128d __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+///    64-bit signed integer value, according to the current rounding mode.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 64-bit signed integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsd_si64(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si64((__v2df)__a);
+}
+
+/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+///    64-bit signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 64-bit signed integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_si64(__m128d __a)
+{
+  return __builtin_ia32_cvttsd2si64((__v2df)__a);
+}
+#endif
+
+/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi32_ps(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2ps((__v4si)__a);
+}
+
+/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit integer vector of [4 x i32] containing the converted
+///    values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
+}
+
+/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
+///    truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x i32] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
+}
+
+/// \brief Returns a vector of [4 x i32] where the lowest element is the input
+///    operand and the remaining elements are zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __a
+///    A 32-bit signed integer operand.
+/// \returns A 128-bit vector of [4 x i32].
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si128(int __a)
+{
+  return (__m128i)(__v4si){ __a, 0, 0, 0 };
+}
+
+#ifdef __x86_64__
+/// \brief Returns a vector of [2 x i64] where the lower element is the input
+///    operand and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x i64] containing the converted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si128(long long __a)
+{
+  return (__m128i){ __a, 0 };
+}
+#endif
+
+/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
+///    32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __a
+///    A vector of [4 x i32]. The least significant 32 bits are moved to the
+///    destination.
+/// \returns A 32-bit signed integer containing the moved value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si32(__m128i __a)
+{
+  __v4si __b = (__v4si)__a;
+  return __b[0];
+}
+
+#ifdef __x86_64__
+/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
+///    64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+///    A vector of [2 x i64]. The least significant 64 bits are moved to the
+///    destination.
+/// \returns A 64-bit signed integer containing the moved value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si64(__m128i __a)
+{
+  return __a[0];
+}
+#endif
+
+/// \brief Moves packed integer values from an aligned 128-bit memory location
+///    to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
+///
+/// \param __p
+///    An aligned pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_load_si128(__m128i const *__p)
+{
+  return *__p;
+}
+
+/// \brief Moves packed integer values from an unaligned 128-bit memory location
+///    to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si128(__m128i const *__p)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si128*)__p)->__v;
+}
+
+/// \brief Returns a vector of [2 x i64] where the lower element is taken from
+///    the lower element of the operand, and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __p
+///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
+///    the destination.
+/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
+///    moved value. The higher order bits are cleared.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadl_epi64(__m128i const *__p)
+{
+  struct __mm_loadl_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+}
+
+/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
+///    This could be used as an argument to another intrinsic function where the
+///    argument is required but the value is not actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit vector of [4 x i32] with unspecified content.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_undefined_si128(void)
+{
+  return (__m128i)__builtin_ia32_undef128();
+}
+
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits of the
+///    destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits of the
+///    destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64x(long long __q1, long long __q0)
+{
+  return (__m128i){ __q0, __q1 };
+}
+
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits of the
+///    destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits of the
+///    destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64(__m64 __q1, __m64 __q0)
+{
+  return (__m128i){ (long long)__q0, (long long)__q1 };
+}
+
+/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
+///    the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i3
+///    A 32-bit integer value used to initialize bits [127:96] of the
+///    destination vector.
+/// \param __i2
+///    A 32-bit integer value used to initialize bits [95:64] of the destination
+///    vector.
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [63:32] of the destination
+///    vector.
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [31:0] of the destination
+///    vector.
+/// \returns An initialized 128-bit vector of [4 x i32] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
+{
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+}
+
+/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
+///    the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w7
+///    A 16-bit integer value used to initialize bits [127:112] of the
+///    destination vector.
+/// \param __w6
+///    A 16-bit integer value used to initialize bits [111:96] of the
+///    destination vector.
+/// \param __w5
+///    A 16-bit integer value used to initialize bits [95:80] of the destination
+///    vector.
+/// \param __w4
+///    A 16-bit integer value used to initialize bits [79:64] of the destination
+///    vector.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of the destination
+///    vector.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of the destination
+///    vector.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of the destination
+///    vector.
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the destination
+///    vector.
+/// \returns An initialized 128-bit vector of [8 x i16] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
+{
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+}
+
+/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
+///    the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b15
+///    Initializes bits [127:120] of the destination vector.
+/// \param __b14
+///    Initializes bits [119:112] of the destination vector.
+/// \param __b13
+///    Initializes bits [111:104] of the destination vector.
+/// \param __b12
+///    Initializes bits [103:96] of the destination vector.
+/// \param __b11
+///    Initializes bits [95:88] of the destination vector.
+/// \param __b10
+///    Initializes bits [87:80] of the destination vector.
+/// \param __b9
+///    Initializes bits [79:72] of the destination vector.
+/// \param __b8
+///    Initializes bits [71:64] of the destination vector.
+/// \param __b7
+///    Initializes bits [63:56] of the destination vector.
+/// \param __b6
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b5
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b4
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b3
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b2
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b1
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b0
+///    Initializes bits [7:0] of the destination vector.
+/// \returns An initialized 128-bit vector of [16 x i8] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
+{
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+/// \brief Initializes both values in a 128-bit integer vector with the
+///    specified 64-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q
+///    Integer value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit integer vector of [2 x i64] with both
+///    elements containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64x(long long __q)
+{
+  return (__m128i){ __q, __q };
+}
+
+/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
+///    specified 64-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q
+///    A 64-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [2 x i64] with all elements
+///    containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64(__m64 __q)
+{
+  return (__m128i){ (long long)__q, (long long)__q };
+}
+
+/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
+///    specified 32-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i
+///    A 32-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [4 x i32] with all elements
+///    containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi32(int __i)
+{
+  return (__m128i)(__v4si){ __i, __i, __i, __i };
+}
+
+/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
+///    specified 16-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w
+///    A 16-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [8 x i16] with all elements
+///    containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi16(short __w)
+{
+  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
+///    specified 8-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b
+///    An 8-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [16 x i8] with all elements
+///    containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi8(char __b)
+{
+  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
+}
+
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+///   instruction.
+///
+/// \param __q0
+///    A 64-bit integral value used to initialize the lower 64 bits of the
+///    result.
+/// \param __q1
+///    A 64-bit integral value used to initialize the upper 64 bits of the
+///    result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi64(__m64 __q0, __m64 __q1)
+{
+  return (__m128i){ (long long)__q0, (long long)__q1 };
+}
+
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
+{
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+}
+
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w0
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w1
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w2
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w3
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w4
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w5
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w6
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w7
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
+{
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+}
+
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b0
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b1
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b2
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b3
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b4
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b5
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b6
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b7
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b8
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b9
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
+{
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+/// \brief Creates a 128-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit integer vector with all elements set to
+///    zero.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0LL, 0LL };
+}
+
+/// \brief Stores a 128-bit integer vector to a memory location aligned on a
+///    128-bit boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location that will receive the integer
+///    values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_si128(__m128i *__p, __m128i __b)
+{
+  *__p = __b;
+}
+
+/// \brief Stores a 128-bit integer vector to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si128(__m128i *__p, __m128i __b)
+{
+  struct __storeu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si128*)__p)->__v = __b;
+}
+
+/// \brief Moves bytes selected by the mask from the first operand to the
+///    specified unaligned memory location. When a mask bit is 1, the
+///    corresponding byte is written, otherwise it is not written. To minimize
+///    caching, the date is flagged as non-temporal (unlikely to be used again
+///    soon). Exception and trap behavior for elements not selected for storage
+///    to memory are implementation dependent.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
+///   instruction.
+///
+/// \param __d
+///    A 128-bit integer vector containing the values to be moved.
+/// \param __n
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each byte represents the mask bits.
+/// \param __p
+///    A pointer to an unaligned 128-bit memory location where the specified
+///    values are moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
+{
+  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
+}
+
+/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
+///    a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location that will receive the lower 64 bits
+///    of the integer vector parameter.
+/// \param __a
+///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
+///    value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_epi64(__m128i *__p, __m128i __a)
+{
+  struct __mm_storel_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
+}
+
+/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
+///    aligned memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A vector of [2 x double] containing the 64-bit values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_pd(double *__p, __m128d __a)
+{
+  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
+}
+
+/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A 128-bit integer vector containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si128(__m128i *__p, __m128i __a)
+{
+  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
+}
+
+/// \brief Stores a 32-bit integer value in the specified memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
+///
+/// \param __p
+///    A pointer to the 32-bit memory location used to store the value.
+/// \param __a
+///    A 32-bit integer containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si32(int *__p, int __a)
+{
+  __builtin_ia32_movnti(__p, __a);
+}
+
+#ifdef __x86_64__
+/// \brief Stores a 64-bit integer value in the specified memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
+///
+/// \param __p
+///    A pointer to the 64-bit memory location used to store the value.
+/// \param __a
+///    A 64-bit integer containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si64(long long *__p, long long __a)
+{
+  __builtin_ia32_movnti64(__p, __a);
+}
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// \brief The cache line containing \a __p is flushed and invalidated from all
+///    caches in the coherency domain.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
+///
+/// \param __p
+///    A pointer to the memory location used to identify the cache line to be
+///    flushed.
+void _mm_clflush(void const *);
+
+/// \brief Forces strong memory ordering (serialization) between load
+///    instructions preceding this instruction and load instructions following
+///    this instruction, ensuring the system completes all previous loads before
+///    executing subsequent loads.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
+///
+void _mm_lfence(void);
+
+/// \brief Forces strong memory ordering (serialization) between load and store
+///    instructions preceding this instruction and load and store instructions
+///    following this instruction, ensuring that the system completes all
+///    previous memory accesses before executing subsequent memory accesses.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
+///
+void _mm_mfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
+///
+/// \param __a
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to a 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the lower 64 bits of the result.
+/// \param __b
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to a 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+///    operands into 16-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit unsigned integers, and packs the results into the
+///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
+///    the immediate-value parameter as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __imm
+///    An immediate value. Bits [3:0] selects values from \a __a to be assigned
+///    to bits[15:0] of the result. \n
+///    000: assign values from bits [15:0] of \a __a. \n
+///    001: assign values from bits [31:16] of \a __a. \n
+///    010: assign values from bits [47:32] of \a __a. \n
+///    011: assign values from bits [63:48] of \a __a. \n
+///    100: assign values from bits [79:64] of \a __a. \n
+///    101: assign values from bits [95:80] of \a __a. \n
+///    110: assign values from bits [111:96] of \a __a. \n
+///    111: assign values from bits [127:112] of \a __a.
+/// \returns An integer, whose lower 16 bits are selected from the 128-bit
+///    integer vector parameter and the remaining bits are assigned zeros.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_extract_epi16(__m128i __a, int __imm)
+{
+  __v8hi __b = (__v8hi)__a;
+  return (unsigned short)__b[__imm & 7];
+}
+
+/// \brief Constructs a 128-bit integer vector by first making a copy of the
+///    128-bit integer vector parameter, and then inserting the lower 16 bits
+///    of an integer parameter into an offset specified by the immediate-value
+///    parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
+///    result and then one of the eight elements in the result is replaced by
+///    the lower 16 bits of \a __b.
+/// \param __b
+///    An integer. The lower 16 bits of this parameter are written to the
+///    result beginning at an offset specified by \a __imm.
+/// \param __imm
+///    An immediate value specifying the bit offset in the result at which the
+///    lower 16 bits of \a __b are written.
+/// \returns A 128-bit integer vector containing the constructed values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_epi16(__m128i __a, int __b, int __imm)
+{
+  __v8hi __c = (__v8hi)__a;
+  __c[__imm & 7] = __b;
+  return (__m128i)__c;
+}
+
+/// \brief Copies the values of the most significant bits from each 8-bit
+///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
+///    value, zero-extends the value, and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bits from each 8-bit element in \a __a,
+///    written to bits [15:0]. The other bits are assigned zeros.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_epi8(__m128i __a)
+{
+  return __builtin_ia32_pmovmskb128((__v16qi)__a);
+}
+
+/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit
+///    elements of a 128-bit integer vector parameter, using the immediate-value
+///    parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param imm
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from a. The destinations within the 128-bit destination are assigned
+///    values as follows: \n
+///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
+///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
+///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
+///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
+///    Bit value assignments: \n
+///    00: assign values from bits [31:0] of \a a. \n
+///    01: assign values from bits [63:32] of \a a. \n
+///    10: assign values from bits [95:64] of \a a. \n
+///    11: assign values from bits [127:96] of \a a.
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
+                                   (__v4si)_mm_undefined_si128(), \
+                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
+                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
+
+/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
+///    [127:64] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from \a a. \n
+///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
+///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
+///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
+///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
+///    Bit value assignments: \n
+///    00: assign values from bits [15:0] of \a a. \n
+///    01: assign values from bits [31:16] of \a a. \n
+///    10: assign values from bits [47:32] of \a a. \n
+///    11: assign values from bits [63:48] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_undefined_si128(), \
+                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
+                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
+                                   4, 5, 6, 7); })
+
+/// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
+///    [63:0] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from \a a. \n
+///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
+///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
+///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
+///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
+///    Bit value assignments: \n
+///    00: assign values from bits [79:64] of \a a. \n
+///    01: assign values from bits [95:80] of \a a. \n
+///    10: assign values from bits [111:96] of \a a. \n
+///    11: assign values from bits [127:112] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_undefined_si128(), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) >> 0) & 0x3), \
+                                   4 + (((imm) >> 2) & 0x3), \
+                                   4 + (((imm) >> 4) & 0x3), \
+                                   4 + (((imm) >> 6) & 0x3)); })
+
+/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
+///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+///    Bits [71:64] are written to bits [7:0] of the result. \n
+///    Bits [79:72] are written to bits [23:16] of the result. \n
+///    Bits [87:80] are written to bits [39:32] of the result. \n
+///    Bits [95:88] are written to bits [55:48] of the result. \n
+///    Bits [103:96] are written to bits [71:64] of the result. \n
+///    Bits [111:104] are written to bits [87:80] of the result. \n
+///    Bits [119:112] are written to bits [103:96] of the result. \n
+///    Bits [127:120] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8]. \n
+///    Bits [71:64] are written to bits [15:8] of the result. \n
+///    Bits [79:72] are written to bits [31:24] of the result. \n
+///    Bits [87:80] are written to bits [47:40] of the result. \n
+///    Bits [95:88] are written to bits [63:56] of the result. \n
+///    Bits [103:96] are written to bits [79:72] of the result. \n
+///    Bits [111:104] are written to bits [95:88] of the result. \n
+///    Bits [119:112] are written to bits [111:104] of the result. \n
+///    Bits [127:120] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
+///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [15:0] of the result. \n
+///    Bits [95:80] are written to bits [47:32] of the result. \n
+///    Bits [111:96] are written to bits [79:64] of the result. \n
+///    Bits [127:112] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [31:16] of the result. \n
+///    Bits [95:80] are written to bits [63:48] of the result. \n
+///    Bits [111:96] are written to bits [95:80] of the result. \n
+///    Bits [127:112] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+}
+
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [95:64] are written to bits [31:0] of the destination. \n
+///    Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [95:64] are written to bits [64:32] of the destination. \n
+///    Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
+}
+
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
+}
+
+/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
+///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8]. \n
+///    Bits [7:0] are written to bits [7:0] of the result. \n
+///    Bits [15:8] are written to bits [23:16] of the result. \n
+///    Bits [23:16] are written to bits [39:32] of the result. \n
+///    Bits [31:24] are written to bits [55:48] of the result. \n
+///    Bits [39:32] are written to bits [71:64] of the result. \n
+///    Bits [47:40] are written to bits [87:80] of the result. \n
+///    Bits [55:48] are written to bits [103:96] of the result. \n
+///    Bits [63:56] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+///    Bits [7:0] are written to bits [15:8] of the result. \n
+///    Bits [15:8] are written to bits [31:24] of the result. \n
+///    Bits [23:16] are written to bits [47:40] of the result. \n
+///    Bits [31:24] are written to bits [63:56] of the result. \n
+///    Bits [39:32] are written to bits [79:72] of the result. \n
+///    Bits [47:40] are written to bits [95:88] of the result. \n
+///    Bits [55:48] are written to bits [111:104] of the result. \n
+///    Bits [63:56] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
+}
+
+/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
+///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
+///    [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [15:0] of the result. \n
+///    Bits [31:16] are written to bits [47:32] of the result. \n
+///    Bits [47:32] are written to bits [79:64] of the result. \n
+///    Bits [63:48] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [31:16] of the result. \n
+///    Bits [31:16] are written to bits [63:48] of the result. \n
+///    Bits [47:32] are written to bits [95:80] of the result. \n
+///    Bits [63:48] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+}
+
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [31:0] are written to bits [31:0] of the destination. \n
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [31:0] are written to bits [64:32] of the destination. \n
+///    Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
+}
+
+/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
+///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [63:0] are written to bits [63:0] of the destination. \n
+/// \param __b
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [63:0] are written to bits [127:64] of the destination. \n
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
+}
+
+/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
+///    integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_movepi64_pi64(__m128i __a)
+{
+  return (__m64)__a[0];
+}
+
+/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+///    upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
+///
+/// \param __a
+///    A 64-bit value.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movpi64_epi64(__m64 __a)
+{
+  return (__m128i){ (long long)__a, 0 };
+}
+
+/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
+///    integer vector, zeroing the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_move_epi64(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
+}
+
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpackhi_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
+}
+
+/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpacklo_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
+}
+
+/// \brief Extracts the sign bits of the double-precision values in the 128-bit
+///    vector of [2 x double], zero-extends the value, and writes it to the
+///    low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values with sign bits to
+///    be extracted.
+/// \returns The sign bits from each of the double-precision elements in \a __a,
+///    written to bits [1:0]. The remaining bits are assigned values of zero.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_pd(__m128d __a)
+{
+  return __builtin_ia32_movmskpd((__v2df)__a);
+}
+
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two
+///    128-bit vector parameters of [2 x double], using the immediate-value
+///     parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param i
+///    An 8-bit immediate value. The least significant two bits specify which
+///    elements to copy from a and b: \n
+///    Bit[0] = 0: lower element of a copied to lower element of result. \n
+///    Bit[0] = 1: upper element of a copied to lower element of result. \n
+///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
+///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
+/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
+#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
+                                   0 + (((i) >> 0) & 0x1), \
+                                   2 + (((i) >> 1) & 0x1)); })
+
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castpd_ps(__m128d __a)
+{
+  return (__m128)__a;
+}
+
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castpd_si128(__m128d __a)
+{
+  return (__m128i)__a;
+}
+
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castps_pd(__m128 __a)
+{
+  return (__m128d)__a;
+}
+
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castps_si128(__m128 __a)
+{
+  return (__m128i)__a;
+}
+
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castsi128_ps(__m128i __a)
+{
+  return (__m128)__a;
+}
+
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castsi128_pd(__m128i __a)
+{
+  return (__m128d)__a;
+}
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// \brief Indicates that a spin loop is being executed for the purposes of
+///    optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
+///
+void _mm_pause(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+#undef __DEFAULT_FN_ATTRS
+
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+
+#endif /* __EMMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/f16cintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/f16cintrin.h
new file mode 100644
index 000000000..180712ffc
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/f16cintrin.h
@@ -0,0 +1,124 @@
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
+#endif
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+
+/// \brief Converts a 16-bit half-precision float value into a 32-bit float
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 16-bit half-precision float value.
+/// \returns The converted 32-bit float value.
+static __inline float __DEFAULT_FN_ATTRS
+_cvtsh_ss(unsigned short __a)
+{
+  __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
+  __v4sf r = __builtin_ia32_vcvtph2ps(v);
+  return r[0];
+}
+
+/// \brief Converts a 32-bit single-precision float value to a 16-bit
+///    half-precision float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _cvtss_sh(float a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 32-bit single-precision float value to be converted to a 16-bit
+///    half-precision float value.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns The converted 16-bit half-precision float value.
+#define _cvtss_sh(a, imm)  \
+  ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+                                                      (imm)))[0]))
+
+/// \brief Converts a 128-bit vector containing 32-bit float values into a
+///    128-bit vector containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 128-bit vector containing 32-bit float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing converted 16-bit half-precision float
+///    values. The lower 64 bits are used to store the converted 16-bit
+///    half-precision floating-point values.
+#define _mm_cvtps_ph(a, imm) \
+  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
+
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 128-bit vector containing 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values. The lower
+///    64 bits are used in the conversion.
+/// \returns A 128-bit vector of [4 x float] containing converted float values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_cvtph_ps(__m128i __a)
+{
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __F16CINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/float.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/float.h
new file mode 100644
index 000000000..0f453d87c
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/float.h
@@ -0,0 +1,137 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* If we're on MinGW, fall back to the system's float.h, which might have
+ * additional definitions provided for Windows.
+ * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ *
+ * Also fall back on Darwin to allow additional definitions and
+ * implementation-defined values.
+ */
+#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
+    __STDC_HOSTED__ && __has_include_next(<float.h>)
+#  include_next <float.h>
+
+/* Undefine anything that we'll be redefining below. */
+#  undef FLT_EVAL_METHOD
+#  undef FLT_ROUNDS
+#  undef FLT_RADIX
+#  undef FLT_MANT_DIG
+#  undef DBL_MANT_DIG
+#  undef LDBL_MANT_DIG
+#  if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#    undef DECIMAL_DIG
+#  endif
+#  undef FLT_DIG
+#  undef DBL_DIG
+#  undef LDBL_DIG
+#  undef FLT_MIN_EXP
+#  undef DBL_MIN_EXP
+#  undef LDBL_MIN_EXP
+#  undef FLT_MIN_10_EXP
+#  undef DBL_MIN_10_EXP
+#  undef LDBL_MIN_10_EXP
+#  undef FLT_MAX_EXP
+#  undef DBL_MAX_EXP
+#  undef LDBL_MAX_EXP
+#  undef FLT_MAX_10_EXP
+#  undef DBL_MAX_10_EXP
+#  undef LDBL_MAX_10_EXP
+#  undef FLT_MAX
+#  undef DBL_MAX
+#  undef LDBL_MAX
+#  undef FLT_EPSILON
+#  undef DBL_EPSILON
+#  undef LDBL_EPSILON
+#  undef FLT_MIN
+#  undef DBL_MIN
+#  undef LDBL_MIN
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#    undef FLT_TRUE_MIN
+#    undef DBL_TRUE_MIN
+#    undef LDBL_TRUE_MIN
+#    undef FLT_DECIMAL_DIG
+#    undef DBL_DECIMAL_DIG
+#    undef LDBL_DECIMAL_DIG
+#  endif
+#endif
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#  define DECIMAL_DIG __DECIMAL_DIG__
+#endif
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#  define FLT_TRUE_MIN __FLT_DENORM_MIN__
+#  define DBL_TRUE_MIN __DBL_DENORM_MIN__
+#  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#  define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
+#  define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
+#  define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
+#endif
+
+#endif /* __FLOAT_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/fma4intrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/fma4intrin.h
new file mode 100644
index 000000000..11aa8ceac
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/fma4intrin.h
@@ -0,0 +1,230 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMA4INTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/fmaintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/fmaintrin.h
new file mode 100644
index 000000000..0e2ef0b17
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/fmaintrin.h
@@ -0,0 +1,228 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMAINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/fxsrintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/fxsrintrin.h
new file mode 100644
index 000000000..786081ca8
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/fxsrintrin.h
@@ -0,0 +1,105 @@
+/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FXSRINTRIN_H
+#define __FXSRINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))
+
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave(void *__p)
+{
+  return __builtin_ia32_fxsave(__p);
+}
+
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor(void *__p)
+{
+  return __builtin_ia32_fxrstor(__p);
+}
+
+#ifdef __x86_64__
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave64(void *__p)
+{
+  return __builtin_ia32_fxsave64(__p);
+}
+
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor64(void *__p)
+{
+  return __builtin_ia32_fxrstor64(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/htmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/htmintrin.h
new file mode 100644
index 000000000..69c8d7bb5
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/htmintrin.h
@@ -0,0 +1,226 @@
+/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMINTRIN_H
+#define __HTMINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#ifdef __powerpc__
+
+#include <stdint.h>
+
+typedef uint64_t texasr_t;
+typedef uint32_t texasru_t;
+typedef uint32_t texasrl_t;
+typedef uintptr_t tfiar_t;
+typedef uintptr_t tfhar_t;
+
+#define _HTM_STATE(CR0) ((CR0 >> 1) & 0x3)
+#define _HTM_NONTRANSACTIONAL 0x0
+#define _HTM_SUSPENDED        0x1
+#define _HTM_TRANSACTIONAL    0x2
+
+#define _TEXASR_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (63-(BITNUM))) & ((1<<(SIZE))-1))
+#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1))
+
+#define _TEXASR_FAILURE_CODE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 8)
+#define _TEXASRU_FAILURE_CODE(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 8)
+
+#define _TEXASR_FAILURE_PERSISTENT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 1)
+#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1)
+
+#define _TEXASR_DISALLOWED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 8, 1)
+#define _TEXASRU_DISALLOWED(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 8, 1)
+
+#define _TEXASR_NESTING_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 9, 1)
+#define _TEXASRU_NESTING_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 9, 1)
+
+#define _TEXASR_FOOTPRINT_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 10, 1)
+#define _TEXASRU_FOOTPRINT_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 10, 1)
+
+#define _TEXASR_SELF_INDUCED_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 11, 1)
+#define _TEXASRU_SELF_INDUCED_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 11, 1)
+
+#define _TEXASR_NON_TRANSACTIONAL_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 12, 1)
+#define _TEXASRU_NON_TRANSACTIONAL_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 12, 1)
+
+#define _TEXASR_TRANSACTION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 13, 1)
+#define _TEXASRU_TRANSACTION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 13, 1)
+
+#define _TEXASR_TRANSLATION_INVALIDATION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 14, 1)
+#define _TEXASRU_TRANSLATION_INVALIDATION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 14, 1)
+
+#define _TEXASR_IMPLEMENTAION_SPECIFIC(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 15, 1)
+#define _TEXASRU_IMPLEMENTAION_SPECIFIC(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 15, 1)
+
+#define _TEXASR_INSTRUCTION_FETCH_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 16, 1)
+#define _TEXASRU_INSTRUCTION_FETCH_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 16, 1)
+
+#define _TEXASR_ABORT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 31, 1)
+#define _TEXASRU_ABORT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 31, 1)
+
+
+#define _TEXASR_SUSPENDED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 32, 1)
+
+#define _TEXASR_PRIVILEGE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 35, 2)
+
+#define _TEXASR_FAILURE_SUMMARY(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 36, 1)
+
+#define _TEXASR_TFIAR_EXACT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 37, 1)
+
+#define _TEXASR_ROT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 38, 1)
+
+#define _TEXASR_TRANSACTION_LEVEL(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 63, 12)
+
+#endif /* __powerpc */
+
+#ifdef __s390__
+
+/* Condition codes generated by tbegin  */
+#define _HTM_TBEGIN_STARTED       0
+#define _HTM_TBEGIN_INDETERMINATE 1
+#define _HTM_TBEGIN_TRANSIENT     2
+#define _HTM_TBEGIN_PERSISTENT    3
+
+/* The abort codes below this threshold are reserved for machine use.  */
+#define _HTM_FIRST_USER_ABORT_CODE 256
+
+/* The transaction diagnostic block is it is defined in the Principles
+   of Operation chapter 5-91.  */
+
+struct __htm_tdb {
+  unsigned char format;                /*   0 */
+  unsigned char flags;
+  unsigned char reserved1[4];
+  unsigned short nesting_depth;
+  unsigned long long abort_code;       /*   8 */
+  unsigned long long conflict_token;   /*  16 */
+  unsigned long long atia;             /*  24 */
+  unsigned char eaid;                  /*  32 */
+  unsigned char dxc;
+  unsigned char reserved2[2];
+  unsigned int program_int_id;
+  unsigned long long exception_id;     /*  40 */
+  unsigned long long bea;              /*  48 */
+  unsigned char reserved3[72];         /*  56 */
+  unsigned long long gprs[16];         /* 128 */
+} __attribute__((__packed__, __aligned__ (8)));
+
+
+/* Helper intrinsics to retry tbegin in case of transient failure.  */
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_null (int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_tdb (void *__tdb, int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_null(retry) : \
+   __builtin_tbegin_retry_tdb(tdb, retry))
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_null (int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_tdb (void *__tdb, int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry_nofloat(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_nofloat_null(retry) : \
+   __builtin_tbegin_retry_nofloat_tdb(tdb, retry))
+
+#endif /* __s390__ */
+
+#endif /* __HTMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/htmxlintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/htmxlintrin.h
new file mode 100644
index 000000000..16dc7056c
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/htmxlintrin.h
@@ -0,0 +1,363 @@
+/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMXLINTRIN_H
+#define __HTMXLINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#include <htmintrin.h>
+
+#ifdef __powerpc__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define _TEXASR_PTR(TM_BUF) \
+  ((texasr_t *)((TM_BUF)+0))
+#define _TEXASRU_PTR(TM_BUF) \
+  ((texasru_t *)((TM_BUF)+0))
+#define _TEXASRL_PTR(TM_BUF) \
+  ((texasrl_t *)((TM_BUF)+4))
+#define _TFIAR_PTR(TM_BUF) \
+  ((tfiar_t *)((TM_BUF)+8))
+
+typedef char TM_buff_type[16];
+
+/* This macro can be used to determine whether a transaction was successfully
+   started from the __TM_begin() and __TM_simple_begin() intrinsic functions
+   below.  */
+#define _HTM_TBEGIN_STARTED     1
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_simple_begin (void)
+{
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_begin (void* const __TM_buff)
+{
+  *_TEXASRL_PTR (__TM_buff) = 0;
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+#ifdef __powerpc64__
+  *_TEXASR_PTR (__TM_buff) = __builtin_get_texasr ();
+#else
+  *_TEXASRU_PTR (__TM_buff) = __builtin_get_texasru ();
+  *_TEXASRL_PTR (__TM_buff) = __builtin_get_texasr ();
+#endif
+  *_TFIAR_PTR (__TM_buff) = __builtin_get_tfiar ();
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_end (void)
+{
+  if (__builtin_expect (__builtin_tend (0), 1))
+    return 1;
+  return 0;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_abort (void)
+{
+  __builtin_tabort (0);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_named_abort (unsigned char const __code)
+{
+  __builtin_tabort (__code);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_resume (void)
+{
+  __builtin_tresume ();
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_suspend (void)
+{
+  __builtin_tsuspend ();
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_user_abort (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_named_user_abort (void* const __TM_buff, unsigned char *__code)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+
+  *__code = _TEXASRU_FAILURE_CODE (texasru);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_illegal (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_DISALLOWED (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_footprint_exceeded (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_nesting_depth (void* const __TM_buff)
+{
+  texasrl_t texasrl;
+
+  if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
+    {
+      texasrl = *_TEXASRL_PTR (__TM_buff);
+      if (!_TEXASR_FAILURE_SUMMARY (texasrl))
+        texasrl = 0;
+    }
+  else
+    texasrl = (texasrl_t) __builtin_get_texasr ();
+
+  return _TEXASR_TRANSACTION_LEVEL (texasrl);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_nested_too_deep(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_NESTING_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_conflict(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  /* Return TEXASR bits 11 (Self-Induced Conflict) through
+     14 (Translation Invalidation Conflict).  */
+  return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_failure_persistent(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_FAILURE_PERSISTENT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_address(void* const __TM_buff)
+{
+  return *_TFIAR_PTR (__TM_buff);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_code(void* const __TM_buff)
+{
+  return *_TEXASR_PTR (__TM_buff);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+#include <stdint.h>
+
+/* These intrinsics are being made available for compatibility with
+   the IBM XL compiler.  For documentation please see the "z/OS XL
+   C/C++ Programming Guide" publically available on the web.  */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_simple_begin ()
+{
+  return __builtin_tbegin_nofloat (0);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_begin (void* const __tdb)
+{
+  return __builtin_tbegin_nofloat (__tdb);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_end ()
+{
+  return __builtin_tend ();
+}
+
+static __inline void __attribute__((__always_inline__))
+__TM_abort ()
+{
+  return __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_named_abort (unsigned char const __code)
+{
+  return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + __code);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_non_transactional_store (void* const __addr, long long const __value)
+{
+  __builtin_non_tx_store ((uint64_t*)__addr, (uint64_t)__value);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_nesting_depth (void* const __tdb_ptr)
+{
+  int depth = __builtin_tx_nesting_depth ();
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (depth != 0)
+    return depth;
+
+  if (tdb->format != 1)
+    return 0;
+  return tdb->nesting_depth;
+}
+
+/* Transaction failure diagnostics */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_user_abort (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  return !!(tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_named_user_abort (void* const __tdb_ptr, unsigned char* __code)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
+    {
+      *__code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
+      return 1;
+    }
+  return 0;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_illegal (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 4 /* unfiltered program interruption */
+	      || tdb->abort_code == 11 /* restricted instruction */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_footprint_exceeded (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 7 /* fetch overflow */
+	      || tdb->abort_code == 8 /* store overflow */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_nested_too_deep (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_conflict (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 9 /* fetch conflict */
+	      || tdb->abort_code == 10 /* store conflict */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_failure_persistent (long const __result)
+{
+  return __result == _HTM_TBEGIN_PERSISTENT;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_address (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+  return tdb->atia;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_code (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return tdb->abort_code;
+}
+
+#endif /* __s390__ */
+
+#endif /* __HTMXLINTRIN_H  */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/ia32intrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/ia32intrin.h
new file mode 100644
index 000000000..492830010
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/ia32intrin.h
@@ -0,0 +1,73 @@
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  return __builtin_ia32_readeflags_u64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned long long __f)
+{
+  __builtin_ia32_writeeflags_u64(__f);
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  return __builtin_ia32_readeflags_u32();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned int __f)
+{
+  __builtin_ia32_writeeflags_u32(__f);
+}
+#endif /* !__x86_64__ */
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdpmc(int __A) {
+  return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtscp */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtscp(unsigned int *__A) {
+  return __builtin_ia32_rdtscp(__A);
+}
+
+#define _rdtsc() __rdtsc()
+
+#define _rdpmc(A) __rdpmc(A)
+
+#endif /* __IA32INTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/immintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/immintrin.h
new file mode 100644
index 000000000..7f91d49fb
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/immintrin.h
@@ -0,0 +1,318 @@
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MMX__)
+#include <mmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE3__)
+#include <pmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSSE3__)
+#include <tmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__SSE4_2__) || defined(__SSE4_1__))
+#include <smmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AES__) || defined(__PCLMUL__))
+#include <wmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLFLUSHOPT__)
+#include <clflushoptintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
+#include <avxintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
+#include <avx2intrin.h>
+
+/* The 256-bit versions of functions in f16cintrin.h.
+   Intel documents these as being in immintrin.h, and
+   they depend on typedefs from avxintrin.h. */
+
+/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
+///    containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 256-bit vector containing 32-bit single-precision float values to be
+///    converted to 16-bit half-precision float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision
+///    float values.
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
+
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values to be
+///    converted to 32-bit single-precision float values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+///    single-precision float values.
+static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+#endif /* __AVX2__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
+#include <bmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
+#include <bmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
+#include <lzcntintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
+#include <fmaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512F__)
+#include <avx512fintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VL__)
+#include <avx512vlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BW__)
+#include <avx512bwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
+#include <avx512cdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
+#include <avx512dqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512BW__))
+#include <avx512vlbwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512CD__))
+#include <avx512vlcdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512DQ__))
+#include <avx512vldqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512ER__)
+#include <avx512erintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512IFMA__)
+#include <avx512ifmaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
+#include <avx512ifmavlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI__)
+#include <avx512vbmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
+#include <avx512vbmivlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
+#include <avx512pfintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
+#include <pkuintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdrand32_step(__p);
+}
+
+/* __bit_scan_forward */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_forward(int __A) {
+  return __builtin_ctz(__A);
+}
+
+/* __bit_scan_reverse */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_reverse(int __A) {
+  return 31 - __builtin_clz(__A);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
+#ifdef __x86_64__
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u32(void)
+{
+  return __builtin_ia32_rdfsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u64(void)
+{
+  return __builtin_ia32_rdfsbase64();
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u32(void)
+{
+  return __builtin_ia32_rdgsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u64(void)
+{
+  return __builtin_ia32_rdgsbase64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrfsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrfsbase64(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrgsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrgsbase64(__V);
+}
+
+#endif
+#endif /* __FSGSBASE__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__)
+#include <rtmintrin.h>
+#include <xtestintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHA__)
+#include <shaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FXSR__)
+#include <fxsrintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__)
+#include <xsaveintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
+#include <xsaveoptintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEC__)
+#include <xsavecintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVES__)
+#include <xsavesintrin.h>
+#endif
+
+/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
+ * whereas others are also available at all times. */
+#include <adxintrin.h>
+
+#endif /* __IMMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/intrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/intrin.h
new file mode 100644
index 000000000..a35262af8
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/intrin.h
@@ -0,0 +1,1021 @@
+/* ===-------- intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+#if defined(__arm__)
+#include <armintr.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+float _m_to_float(__m64);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+static __inline__
+__int64 __emul(int, int);
+static __inline__
+unsigned __int64 __emulu(unsigned int, unsigned int);
+void __cdecl __fastfail(unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __int2c(void);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+void __llwpcb(void *);
+unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
+void __lwpval32(unsigned int, unsigned int, unsigned int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+static __inline__
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+void *__slwpcb(void);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+void __ud2(void);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+static __inline__
+unsigned char _interlockedbittestandset(long volatile *, long);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+__int64 _InterlockedcompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void);
+unsigned __int32 xbegin(void);
+void _xend(void);
+static __inline__
+#define _XCR_XFEATURE_ENABLED_MASK 0
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
+void __lwpval64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_CompareandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+static __inline__
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
+static __inline__
+__int64 _mul128(__int64, __int64, __int64*);
+static __inline__
+unsigned __int64 _umul128(unsigned __int64,
+                          unsigned __int64,
+                          unsigned __int64*);
+
+#endif /* __x86_64__ */
+
+#if defined(__x86_64__) || defined(__arm__)
+
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
+
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest(long const *_BitBase, long _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_ACQUIRE);
+  return (_PrevVal >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED);
+  return (_PrevVal >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
+  long long _PrevVal =
+      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_acq(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_nf(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_rel(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_acq(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_nf(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_rel(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_acq(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_nf(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_rel(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_acq(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_nf(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_rel(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_acq(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_nf(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_rel(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_acq(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_nf(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_rel(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_acq(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_nf(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_rel(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_acq(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_nf(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_rel(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_acq(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_nf(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_rel(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_acq(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_nf(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_rel(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_acq(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_nf(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_rel(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_acq(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_nf(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_rel(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* readfs, readgs
+|* (Pointers in address space #256 and #257 are relative to the GS and FS
+|* segment registers, respectively.)
+\*----------------------------------------------------------------------------*/
+#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
+    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
+    (__offset))
+
+#ifdef __i386__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readfsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned char, __offset);
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readfsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned short, __offset);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readfsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readgsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned char, __offset);
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readgsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned short, __offset);
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readgsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned long, __offset);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readgsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
+}
+#endif
+#undef __ptr_to_addr_space
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx));
+}
+static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax;
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__halt(void) {
+  __asm__ volatile ("hlt");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__nop(void) {
+  __asm__ volatile ("nop");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readcr3(void) {
+  unsigned long __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+__writecr3(unsigned int __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/inttypes.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/inttypes.h
new file mode 100644
index 000000000..1d8eabab0
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/inttypes.h
@@ -0,0 +1,106 @@
+/*===---- inttypes.h - Standard header for integer printf macros ----------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_INTTYPES_H
+#define __CLANG_INTTYPES_H
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#error MSVC does not have inttypes.h prior to Visual Studio 2013
+#endif
+
+#include_next <inttypes.h>
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+/* MSVC headers define int32_t as int, but PRIx32 as "lx" instead of "x".
+ * This triggers format warnings, so fix it up here. */
+#undef PRId32
+#undef PRIdLEAST32
+#undef PRIdFAST32
+#undef PRIi32
+#undef PRIiLEAST32
+#undef PRIiFAST32
+#undef PRIo32
+#undef PRIoLEAST32
+#undef PRIoFAST32
+#undef PRIu32
+#undef PRIuLEAST32
+#undef PRIuFAST32
+#undef PRIx32
+#undef PRIxLEAST32
+#undef PRIxFAST32
+#undef PRIX32
+#undef PRIXLEAST32
+#undef PRIXFAST32
+
+#undef SCNd32
+#undef SCNdLEAST32
+#undef SCNdFAST32
+#undef SCNi32
+#undef SCNiLEAST32
+#undef SCNiFAST32
+#undef SCNo32
+#undef SCNoLEAST32
+#undef SCNoFAST32
+#undef SCNu32
+#undef SCNuLEAST32
+#undef SCNuFAST32
+#undef SCNx32
+#undef SCNxLEAST32
+#undef SCNxFAST32
+
+#define PRId32 "d"
+#define PRIdLEAST32 "d"
+#define PRIdFAST32 "d"
+#define PRIi32 "i"
+#define PRIiLEAST32 "i"
+#define PRIiFAST32 "i"
+#define PRIo32 "o"
+#define PRIoLEAST32 "o"
+#define PRIoFAST32 "o"
+#define PRIu32 "u"
+#define PRIuLEAST32 "u"
+#define PRIuFAST32 "u"
+#define PRIx32 "x"
+#define PRIxLEAST32 "x"
+#define PRIxFAST32 "x"
+#define PRIX32 "X"
+#define PRIXLEAST32 "X"
+#define PRIXFAST32 "X"
+
+#define SCNd32 "d"
+#define SCNdLEAST32 "d"
+#define SCNdFAST32 "d"
+#define SCNi32 "i"
+#define SCNiLEAST32 "i"
+#define SCNiFAST32 "i"
+#define SCNo32 "o"
+#define SCNoLEAST32 "o"
+#define SCNoFAST32 "o"
+#define SCNu32 "u"
+#define SCNuLEAST32 "u"
+#define SCNuFAST32 "u"
+#define SCNx32 "x"
+#define SCNxLEAST32 "x"
+#define SCNxFAST32 "x"
+#endif
+
+#endif /* __CLANG_INTTYPES_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/iso646.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/iso646.h
new file mode 100644
index 000000000..dca13c5ba
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/iso646.h
@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/limits.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/limits.h
new file mode 100644
index 000000000..f04187ced
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/limits.h
@@ -0,0 +1,118 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>.
+   Include it if we're hosted. */
+#if __STDC_HOSTED__ && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#endif
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long.
+   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
+ */
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/lzcntintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/lzcntintrin.h
new file mode 100644
index 000000000..3d2769da3
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/lzcntintrin.h
@@ -0,0 +1,118 @@
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__lzcnt16(unsigned short __X)
+{
+  return __X ? __builtin_clzs(__X) : 16;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__lzcnt32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_lzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+#ifdef __x86_64__
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__lzcnt64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_lzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LZCNTINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mm3dnow.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mm3dnow.h
new file mode 100644
index 000000000..294866c1d
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mm3dnow.h
@@ -0,0 +1,171 @@
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_m_femms(void) {
+  __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2id(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfadd(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmax(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmin(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmul(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcp(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrt(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsub(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fd(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/* Handle the 3dnowa instructions here. */
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2iw(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fw(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsf(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsi(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mm_malloc.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mm_malloc.h
new file mode 100644
index 000000000..305afd31a
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mm_malloc.h
@@ -0,0 +1,75 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+                                       __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+  if (__align == 1) {
+    return malloc(__size);
+  }
+
+  if (!(__align & (__align - 1)) && __align < sizeof(void *))
+    __align = sizeof(void *);
+
+  void *__mallocedMemory;
+#if defined(__MINGW32__)
+  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+  __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+  if (posix_memalign(&__mallocedMemory, __align, __size))
+    return 0;
+#endif
+
+  return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+  free(__p);
+}
+#endif
+
+#endif /* __MM_MALLOC_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mmintrin.h
new file mode 100644
index 000000000..e0c277a65
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mmintrin.h
@@ -0,0 +1,1549 @@
+/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MMINTRIN_H
+#define __MMINTRIN_H
+
+typedef long long __m64 __attribute__((__vector_size__(8)));
+
+typedef long long __v1di __attribute__((__vector_size__(8)));
+typedef int __v2si __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
+
+/// \brief Clears the MMX state by setting the state of the x87 stack registers
+///    to empty.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> EMMS </c> instruction.
+///
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_empty(void)
+{
+    __builtin_ia32_emms();
+}
+
+/// \brief Constructs a 64-bit integer vector, setting the lower 32 bits to the
+///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __i
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
+///    parameter. The upper 32 bits are set to 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si64(int __i)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+}
+
+/// \brief Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
+///    signed integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector.
+/// \returns A 32-bit signed integer value containing the lower 32 bits of the
+///    parameter.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si32(__m64 __m)
+{
+    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+}
+
+/// \brief Casts a 64-bit signed integer value into a 64-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
+///
+/// \param __i
+///    A 64-bit signed integer.
+/// \returns A 64-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_m64(long long __i)
+{
+    return (__m64)__i;
+}
+
+/// \brief Casts a 64-bit integer vector into a 64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector.
+/// \returns A 64-bit signed integer containing the same bitwise pattern as the
+///    parameter.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtm64_si64(__m64 __m)
+{
+    return (long long)__m;
+}
+
+/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
+///    a 64-bit integer vector of [8 x i8] as the result. Positive values
+///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
+///    are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit signed integer with
+///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80. The converted
+///    [4 x i8] values are written to the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit signed integer with
+///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80. The converted
+///    [4 x i8] values are written to the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the converted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Converts 32-bit signed integers from both 64-bit integer vector
+///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
+///    a 64-bit integer vector of [4 x i16] as the result. Positive values
+///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
+///    0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
+///    32-bit signed integer and is converted to a 16-bit signed integer with
+///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000. The converted
+///    [2 x i16] values are written to the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
+///    32-bit signed integer and is converted to a 16-bit signed integer with
+///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000. The converted
+///    [2 x i16] values are written to the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the converted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+///    parameters of [4 x i16] into 8-bit unsigned integer values, and
+///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
+///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
+///    to 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0 are saturated to 0. The converted [4 x i8] values are written to
+///    the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0 are saturated to 0. The converted [4 x i8] values are written to
+///    the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the converted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
+///    and interleaves them into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8]. \n 
+///    Bits [39:32] are written to bits [7:0] of the result. \n
+///    Bits [47:40] are written to bits [23:16] of the result. \n
+///    Bits [55:48] are written to bits [39:32] of the result. \n
+///    Bits [63:56] are written to bits [55:48] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [39:32] are written to bits [15:8] of the result. \n
+///    Bits [47:40] are written to bits [31:24] of the result. \n
+///    Bits [55:48] are written to bits [47:40] of the result. \n
+///    Bits [63:56] are written to bits [63:56] of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [47:32] are written to bits [15:0] of the result. \n
+///    Bits [63:48] are written to bits [47:32] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [47:32] are written to bits [31:16] of the result. \n
+///    Bits [63:48] are written to bits [63:48] of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
+///    the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
+///    the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
+///    and interleaves them into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [7:0] are written to bits [7:0] of the result. \n
+///    Bits [15:8] are written to bits [23:16] of the result. \n
+///    Bits [23:16] are written to bits [39:32] of the result. \n
+///    Bits [31:24] are written to bits [55:48] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [7:0] are written to bits [15:8] of the result. \n
+///    Bits [15:8] are written to bits [31:24] of the result. \n
+///    Bits [23:16] are written to bits [47:40] of the result. \n
+///    Bits [31:24] are written to bits [63:56] of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [15:0] are written to bits [15:0] of the result. \n
+///    Bits [31:16] are written to bits [47:32] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [15:0] are written to bits [31:16] of the result. \n
+///    Bits [31:16] are written to bits [63:48] of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
+///    the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
+///    the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Adds each 8-bit integer element of the first 64-bit integer vector
+///    of [8 x i8] to the corresponding 8-bit integer element of the second
+///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
+///    packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Adds each 16-bit integer element of the first 64-bit integer vector
+///    of [4 x i16] to the corresponding 16-bit integer element of the second
+///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
+///    packed into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Adds each 32-bit integer element of the first 64-bit integer vector
+///    of [2 x i32] to the corresponding 32-bit integer element of the second
+///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
+///    packed into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32].
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Adds each 8-bit signed integer element of the first 64-bit integer
+///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
+///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
+///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
+///    0x80. The results are packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
+///    of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Adds each 16-bit signed integer element of the first 64-bit integer
+///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
+///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
+///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
+///    saturated to 0x8000. The results are packed into a 64-bit integer vector
+///    of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
+///    of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Adds each 8-bit unsigned integer element of the first 64-bit integer
+///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
+///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
+///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
+///    [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+///    unsigned sums of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Adds each 16-bit unsigned integer element of the first 64-bit integer
+///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
+///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
+///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
+///    integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+///    unsigned sums of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Subtracts each 8-bit integer element of the second 64-bit integer
+///    vector of [8 x i8] from the corresponding 8-bit integer element of the
+///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
+///    are packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
+///    both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Subtracts each 16-bit integer element of the second 64-bit integer
+///    vector of [4 x i16] from the corresponding 16-bit integer element of the
+///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
+///    results are packed into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
+///    both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Subtracts each 32-bit integer element of the second 64-bit integer
+///    vector of [2 x i32] from the corresponding 32-bit integer element of the
+///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
+///    results are packed into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
+/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
+///    both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Subtracts each 8-bit signed integer element of the second 64-bit
+///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
+///    element of the first 64-bit integer vector of [8 x i8]. Positive results
+///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
+///    are saturated to 0x80. The results are packed into a 64-bit integer
+///    vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+///    differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Subtracts each 16-bit signed integer element of the second 64-bit
+///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
+///    element of the first 64-bit integer vector of [4 x i16]. Positive results
+///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
+///    0x8000 are saturated to 0x8000. The results are packed into a 64-bit
+///    integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+///    differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
+///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
+///    element of the first 64-bit integer vector of [8 x i8]. If an element of
+///    the first vector is less than the corresponding element of the second
+///    vector, the result is saturated to 0. The results are packed into a
+///    64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+///    differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
+///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
+///    integer element of the first 64-bit integer vector of [4 x i16]. If an
+///    element of the first vector is less than the corresponding element of the
+///    second vector, the result is saturated to 0. The results are packed into
+///    a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+///    differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
+///    element of the second 64-bit integer vector of [4 x i16] and get four
+///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
+///    The lower 32 bits of these two sums are packed into a 64-bit integer
+///    vector of [2 x i32]. For example, bits [15:0] of both parameters are
+///    multiplied, bits [31:16] of both parameters are multiplied, and the sum
+///    of both results is written to bits [31:0] of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
+///    products of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
+///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
+///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
+///    of the products of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
+///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
+///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
+///    of the products of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Left-shifts each 16-bit signed integer element of the first
+///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
+///    of bits specified by the second parameter, which is a 64-bit integer. The
+///    lower 16 bits of the results are packed into a 64-bit integer vector of
+///    [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
+///    values. If \a __count is greater or equal to 16, the result is set to all
+///    0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+}
+
+/// \brief Left-shifts each 16-bit signed integer element of a 64-bit integer
+///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
+///    The lower 16 bits of the results are packed into a 64-bit integer vector
+///    of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
+///    values. If \a __count is greater or equal to 16, the result is set to all
+///    0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+}
+
+/// \brief Left-shifts each 32-bit signed integer element of the first
+///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
+///    of bits specified by the second parameter, which is a 64-bit integer. The
+///    lower 32 bits of the results are packed into a 64-bit integer vector of
+///    [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
+///    values. If \a __count is greater or equal to 32, the result is set to all
+///    0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+}
+
+/// \brief Left-shifts each 32-bit signed integer element of a 64-bit integer
+///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
+///    The lower 32 bits of the results are packed into a 64-bit integer vector
+///    of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
+///    values. If \a __count is greater or equal to 32, the result is set to all
+///    0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+}
+
+/// \brief Left-shifts the first 64-bit integer parameter by the number of bits
+///    specified by the second 64-bit integer parameter. The lower 64 bits of
+///    result are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector containing the left-shifted value. If
+///     \a __count is greater or equal to 64, the result is set to 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
+}
+
+/// \brief Left-shifts the first parameter, which is a 64-bit integer, by the
+///    number of bits specified by the second parameter, which is a 32-bit
+///    integer. The lower 64 bits of result are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector containing the left-shifted value. If
+///     \a __count is greater or equal to 64, the result is set to 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
+}
+
+/// \brief Right-shifts each 16-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [4 x i16], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are filled with the sign bit of the initial value of each 16-bit
+///    element. The 16-bit results are packed into a 64-bit integer vector of
+///    [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+}
+
+/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///    High-order bits are filled with the sign bit of the initial value of each
+///    16-bit element. The 16-bit results are packed into a 64-bit integer
+///    vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+}
+
+/// \brief Right-shifts each 32-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [2 x i32], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are filled with the sign bit of the initial value of each 32-bit
+///    element. The 32-bit results are packed into a 64-bit integer vector of
+///    [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+}
+
+/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///    High-order bits are filled with the sign bit of the initial value of each
+///    32-bit element. The 32-bit results are packed into a 64-bit integer
+///    vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+}
+
+/// \brief Right-shifts each 16-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [4 x i16], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are cleared. The 16-bit results are packed into a 64-bit integer
+///    vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+}
+
+/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
+///    integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+}
+
+/// \brief Right-shifts each 32-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [2 x i32], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are cleared. The 32-bit results are packed into a 64-bit integer
+///    vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+}
+
+/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
+///    integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+}
+
+/// \brief Right-shifts the first 64-bit integer parameter by the number of bits
+///    specified by the second 64-bit integer parameter. High-order bits are
+///    cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector containing the right-shifted value.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
+}
+
+/// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
+///    number of bits specified by the second parameter, which is a 32-bit
+///    integer. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector containing the right-shifted value.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
+}
+
+/// \brief Performs a bitwise AND of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAND </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise AND of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
+}
+
+/// \brief Performs a bitwise NOT of the first 64-bit integer vector, and then
+///    performs a bitwise AND of the intermediate result and the second 64-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PANDN </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector. The one's complement of this parameter is used
+///    in the bitwise AND.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise AND of the second
+///    parameter and the one's complement of the first parameter.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
+}
+
+/// \brief Performs a bitwise OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POR </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise OR of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
+}
+
+/// \brief Performs a bitwise exclusive OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PXOR </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
+}
+
+/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+///    [8 x i8] to determine if the element of the first vector is equal to the
+///    corresponding element of the second vector. The comparison yields 0 for
+///    false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+///    [4 x i16] to determine if the element of the first vector is equal to the
+///    corresponding element of the second vector. The comparison yields 0 for
+///    false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+///    [2 x i32] to determine if the element of the first vector is equal to the
+///    corresponding element of the second vector. The comparison yields 0 for
+///    false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32].
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+///    [8 x i8] to determine if the element of the first vector is greater than
+///    the corresponding element of the second vector. The comparison yields 0
+///    for false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+///    [4 x i16] to determine if the element of the first vector is greater than
+///    the corresponding element of the second vector. The comparison yields 0
+///    for false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+///    [2 x i32] to determine if the element of the first vector is greater than
+///    the corresponding element of the second vector. The comparison yields 0
+///    for false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32].
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Constructs a 64-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 64-bit integer vector with all elements set to zero.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setzero_si64(void)
+{
+    return (__m64){ 0LL };
+}
+
+/// \brief Constructs a 64-bit integer vector initialized with the specified
+///    32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i1
+///    A 32-bit integer value used to initialize the upper 32 bits of the
+///    result.
+/// \param __i0
+///    A 32-bit integer value used to initialize the lower 32 bits of the
+///    result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi32(int __i1, int __i0)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+}
+
+/// \brief Constructs a 64-bit integer vector initialized with the specified
+///    16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __s3
+///    A 16-bit integer value used to initialize bits [63:48] of the result.
+/// \param __s2
+///    A 16-bit integer value used to initialize bits [47:32] of the result.
+/// \param __s1
+///    A 16-bit integer value used to initialize bits [31:16] of the result.
+/// \param __s0
+///    A 16-bit integer value used to initialize bits [15:0] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+}
+
+/// \brief Constructs a 64-bit integer vector initialized with the specified
+///    8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b7
+///    An 8-bit integer value used to initialize bits [63:56] of the result.
+/// \param __b6
+///    An 8-bit integer value used to initialize bits [55:48] of the result.
+/// \param __b5
+///    An 8-bit integer value used to initialize bits [47:40] of the result.
+/// \param __b4
+///    An 8-bit integer value used to initialize bits [39:32] of the result.
+/// \param __b3
+///    An 8-bit integer value used to initialize bits [31:24] of the result.
+/// \param __b2
+///    An 8-bit integer value used to initialize bits [23:16] of the result.
+/// \param __b1
+///    An 8-bit integer value used to initialize bits [15:8] of the result.
+/// \param __b0
+///    An 8-bit integer value used to initialize bits [7:0] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+            char __b1, char __b0)
+{
+    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
+                                               __b4, __b5, __b6, __b7);
+}
+
+/// \brief Constructs a 64-bit integer vector of [2 x i32], with each of the
+///    32-bit integer vector elements set to the specified 32-bit integer
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
+///
+/// \param __i
+///    A 32-bit integer value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 64-bit integer vector of [2 x i32].
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi32(int __i)
+{
+    return _mm_set_pi32(__i, __i);
+}
+
+/// \brief Constructs a 64-bit integer vector of [4 x i16], with each of the
+///    16-bit integer vector elements set to the specified 16-bit integer
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
+///
+/// \param __w
+///    A 16-bit integer value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 64-bit integer vector of [4 x i16].
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi16(short __w)
+{
+    return _mm_set_pi16(__w, __w, __w, __w);
+}
+
+/// \brief Constructs a 64-bit integer vector of [8 x i8], with each of the
+///    8-bit integer vector elements set to the specified 8-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLBW + VPSHUFLW / PUNPCKLBW +
+///    PSHUFLW </c> instruction.
+///
+/// \param __b
+///    An 8-bit integer value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 64-bit integer vector of [8 x i8].
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi8(char __b)
+{
+    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+///    the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i0
+///    A 32-bit integer value used to initialize the lower 32 bits of the
+///    result.
+/// \param __i1
+///    A 32-bit integer value used to initialize the upper 32 bits of the
+///    result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi32(int __i0, int __i1)
+{
+    return _mm_set_pi32(__i1, __i0);
+}
+
+/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+///    the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the result.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of the result.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of the result.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
+{
+    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+///    the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b0
+///    An 8-bit integer value used to initialize bits [7:0] of the result.
+/// \param __b1
+///    An 8-bit integer value used to initialize bits [15:8] of the result.
+/// \param __b2
+///    An 8-bit integer value used to initialize bits [23:16] of the result.
+/// \param __b3
+///    An 8-bit integer value used to initialize bits [31:24] of the result.
+/// \param __b4
+///    An 8-bit integer value used to initialize bits [39:32] of the result.
+/// \param __b5
+///    An 8-bit integer value used to initialize bits [47:40] of the result.
+/// \param __b6
+///    An 8-bit integer value used to initialize bits [55:48] of the result.
+/// \param __b7
+///    An 8-bit integer value used to initialize bits [63:56] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
+             char __b6, char __b7)
+{
+    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+/* Aliases for compatibility. */
+#define _m_empty _mm_empty
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_from_int64 _mm_cvtsi64_m64
+#define _m_to_int _mm_cvtsi64_si32
+#define _m_to_int64 _mm_cvtm64_si64
+#define _m_packsswb _mm_packs_pi16
+#define _m_packssdw _mm_packs_pi32
+#define _m_packuswb _mm_packs_pu16
+#define _m_punpckhbw _mm_unpackhi_pi8
+#define _m_punpckhwd _mm_unpackhi_pi16
+#define _m_punpckhdq _mm_unpackhi_pi32
+#define _m_punpcklbw _mm_unpacklo_pi8
+#define _m_punpcklwd _mm_unpacklo_pi16
+#define _m_punpckldq _mm_unpacklo_pi32
+#define _m_paddb _mm_add_pi8
+#define _m_paddw _mm_add_pi16
+#define _m_paddd _mm_add_pi32
+#define _m_paddsb _mm_adds_pi8
+#define _m_paddsw _mm_adds_pi16
+#define _m_paddusb _mm_adds_pu8
+#define _m_paddusw _mm_adds_pu16
+#define _m_psubb _mm_sub_pi8
+#define _m_psubw _mm_sub_pi16
+#define _m_psubd _mm_sub_pi32
+#define _m_psubsb _mm_subs_pi8
+#define _m_psubsw _mm_subs_pi16
+#define _m_psubusb _mm_subs_pu8
+#define _m_psubusw _mm_subs_pu16
+#define _m_pmaddwd _mm_madd_pi16
+#define _m_pmulhw _mm_mulhi_pi16
+#define _m_pmullw _mm_mullo_pi16
+#define _m_psllw _mm_sll_pi16
+#define _m_psllwi _mm_slli_pi16
+#define _m_pslld _mm_sll_pi32
+#define _m_pslldi _mm_slli_pi32
+#define _m_psllq _mm_sll_si64
+#define _m_psllqi _mm_slli_si64
+#define _m_psraw _mm_sra_pi16
+#define _m_psrawi _mm_srai_pi16
+#define _m_psrad _mm_sra_pi32
+#define _m_psradi _mm_srai_pi32
+#define _m_psrlw _mm_srl_pi16
+#define _m_psrlwi _mm_srli_pi16
+#define _m_psrld _mm_srl_pi32
+#define _m_psrldi _mm_srli_pi32
+#define _m_psrlq _mm_srl_si64
+#define _m_psrlqi _mm_srli_si64
+#define _m_pand _mm_and_si64
+#define _m_pandn _mm_andnot_si64
+#define _m_por _mm_or_si64
+#define _m_pxor _mm_xor_si64
+#define _m_pcmpeqb _mm_cmpeq_pi8
+#define _m_pcmpeqw _mm_cmpeq_pi16
+#define _m_pcmpeqd _mm_cmpeq_pi32
+#define _m_pcmpgtb _mm_cmpgt_pi8
+#define _m_pcmpgtw _mm_cmpgt_pi16
+#define _m_pcmpgtd _mm_cmpgt_pi32
+
+#endif /* __MMINTRIN_H */
+
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/module.modulemap b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/module.modulemap
new file mode 100644
index 000000000..11ef2f902
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/module.modulemap
@@ -0,0 +1,166 @@
+/*===---- module.modulemap - intrinsics module map -------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+module _Builtin_intrinsics [system] [extern_c] {
+  explicit module altivec {
+    requires altivec
+    header "altivec.h"
+  }
+
+  explicit module arm {
+    requires arm
+
+    explicit module acle {
+      header "arm_acle.h"
+      export *
+    }
+
+    explicit module neon {
+      requires neon
+      header "arm_neon.h"
+      export *
+    }
+  }
+
+  explicit module intel {
+    requires x86
+    export *
+
+    header "immintrin.h"
+    textual header "f16cintrin.h"
+    textual header "avxintrin.h"
+    textual header "avx2intrin.h"
+    textual header "avx512fintrin.h"
+    textual header "avx512erintrin.h"
+    textual header "fmaintrin.h"
+
+    header "x86intrin.h"
+    textual header "bmiintrin.h"
+    textual header "bmi2intrin.h"
+    textual header "lzcntintrin.h"
+    textual header "xopintrin.h"
+    textual header "fma4intrin.h"
+    textual header "mwaitxintrin.h"
+
+    explicit module mm_malloc {
+      requires !freestanding
+      header "mm_malloc.h"
+      export * // note: for <stdlib.h> dependency
+    }
+
+    explicit module cpuid {
+      requires gnuinlineasm
+      header "cpuid.h"
+    }
+
+    explicit module mmx {
+      header "mmintrin.h"
+    }
+
+    explicit module sse {
+      export mm_malloc
+      export mmx
+      export sse2 // note: for hackish <emmintrin.h> dependency
+      header "xmmintrin.h"
+    }
+
+    explicit module sse2 {
+      export sse
+      header "emmintrin.h"
+    }
+
+    explicit module sse3 {
+      export sse2
+      header "pmmintrin.h"
+    }
+
+    explicit module ssse3 {
+      export sse3
+      header "tmmintrin.h"
+    }
+
+    explicit module sse4_1 {
+      export ssse3
+      header "smmintrin.h"
+    }
+
+    explicit module sse4_2 {
+      export sse4_1
+      header "nmmintrin.h"
+    }
+
+    explicit module sse4a {
+      export sse3
+      header "ammintrin.h"
+    }
+
+    explicit module popcnt {
+      header "popcntintrin.h"
+    }
+
+    explicit module mm3dnow {
+      header "mm3dnow.h"
+    }
+
+    explicit module aes_pclmul {
+      header "wmmintrin.h"
+      export aes
+      export pclmul
+    }
+
+    explicit module aes {
+      header "__wmmintrin_aes.h"
+    }
+
+    explicit module pclmul {
+      header "__wmmintrin_pclmul.h"
+    }
+  }
+
+  explicit module systemz {
+    requires systemz
+    export *
+
+    header "s390intrin.h"
+
+    explicit module htm {
+      requires htm
+      header "htmintrin.h"
+      header "htmxlintrin.h"
+    }
+
+    explicit module zvector {
+      requires zvector, vx
+      header "vecintrin.h"
+    }
+  }
+}
+
+module _Builtin_stddef_max_align_t [system] [extern_c] {
+  header "__stddef_max_align_t.h"
+}
+
+module opencl_c {
+  requires opencl
+  header "opencl-c.h"
+}
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/msa.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/msa.h
new file mode 100644
index 000000000..da680f5ca
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/msa.h
@@ -0,0 +1,583 @@
+/*===---- msa.h - MIPS MSA intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MSA_H
+#define _MSA_H 1
+
+#if defined(__mips_msa)
+typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
+typedef signed char v16i8_b __attribute__((vector_size(16), aligned(1)));
+typedef unsigned char v16u8 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned char v16u8_b __attribute__((vector_size(16), aligned(1)));
+typedef short v8i16 __attribute__((vector_size(16), aligned(16)));
+typedef short v8i16_h __attribute__((vector_size(16), aligned(2)));
+typedef unsigned short v8u16 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned short v8u16_h __attribute__((vector_size(16), aligned(2)));
+typedef int v4i32 __attribute__((vector_size(16), aligned(16)));
+typedef int v4i32_w __attribute__((vector_size(16), aligned(4)));
+typedef unsigned int v4u32 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned int v4u32_w __attribute__((vector_size(16), aligned(4)));
+typedef long long v2i64 __attribute__((vector_size(16), aligned(16)));
+typedef long long v2i64_d __attribute__((vector_size(16), aligned(8)));
+typedef unsigned long long v2u64 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned long long v2u64_d __attribute__((vector_size(16), aligned(8)));
+typedef float v4f32 __attribute__((vector_size(16), aligned(16)));
+typedef float v4f32_w __attribute__((vector_size(16), aligned(4)));
+typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
+typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
+
+#define __msa_sll_b __builtin_msa_sll_b
+#define __msa_sll_h __builtin_msa_sll_h
+#define __msa_sll_w __builtin_msa_sll_w
+#define __msa_sll_d __builtin_msa_sll_d
+#define __msa_slli_b __builtin_msa_slli_b
+#define __msa_slli_h __builtin_msa_slli_h
+#define __msa_slli_w __builtin_msa_slli_w
+#define __msa_slli_d __builtin_msa_slli_d
+#define __msa_sra_b __builtin_msa_sra_b
+#define __msa_sra_h __builtin_msa_sra_h
+#define __msa_sra_w __builtin_msa_sra_w
+#define __msa_sra_d __builtin_msa_sra_d
+#define __msa_srai_b __builtin_msa_srai_b
+#define __msa_srai_h __builtin_msa_srai_h
+#define __msa_srai_w __builtin_msa_srai_w
+#define __msa_srai_d __builtin_msa_srai_d
+#define __msa_srar_b __builtin_msa_srar_b
+#define __msa_srar_h __builtin_msa_srar_h
+#define __msa_srar_w __builtin_msa_srar_w
+#define __msa_srar_d __builtin_msa_srar_d
+#define __msa_srari_b __builtin_msa_srari_b
+#define __msa_srari_h __builtin_msa_srari_h
+#define __msa_srari_w __builtin_msa_srari_w
+#define __msa_srari_d __builtin_msa_srari_d
+#define __msa_srl_b __builtin_msa_srl_b
+#define __msa_srl_h __builtin_msa_srl_h
+#define __msa_srl_w __builtin_msa_srl_w
+#define __msa_srl_d __builtin_msa_srl_d
+#define __msa_srli_b __builtin_msa_srli_b
+#define __msa_srli_h __builtin_msa_srli_h
+#define __msa_srli_w __builtin_msa_srli_w
+#define __msa_srli_d __builtin_msa_srli_d
+#define __msa_srlr_b __builtin_msa_srlr_b
+#define __msa_srlr_h __builtin_msa_srlr_h
+#define __msa_srlr_w __builtin_msa_srlr_w
+#define __msa_srlr_d __builtin_msa_srlr_d
+#define __msa_srlri_b __builtin_msa_srlri_b
+#define __msa_srlri_h __builtin_msa_srlri_h
+#define __msa_srlri_w __builtin_msa_srlri_w
+#define __msa_srlri_d __builtin_msa_srlri_d
+#define __msa_bclr_b __builtin_msa_bclr_b
+#define __msa_bclr_h __builtin_msa_bclr_h
+#define __msa_bclr_w __builtin_msa_bclr_w
+#define __msa_bclr_d __builtin_msa_bclr_d
+#define __msa_bclri_b __builtin_msa_bclri_b
+#define __msa_bclri_h __builtin_msa_bclri_h
+#define __msa_bclri_w __builtin_msa_bclri_w
+#define __msa_bclri_d __builtin_msa_bclri_d
+#define __msa_bset_b __builtin_msa_bset_b
+#define __msa_bset_h __builtin_msa_bset_h
+#define __msa_bset_w __builtin_msa_bset_w
+#define __msa_bset_d __builtin_msa_bset_d
+#define __msa_bseti_b __builtin_msa_bseti_b
+#define __msa_bseti_h __builtin_msa_bseti_h
+#define __msa_bseti_w __builtin_msa_bseti_w
+#define __msa_bseti_d __builtin_msa_bseti_d
+#define __msa_bneg_b __builtin_msa_bneg_b
+#define __msa_bneg_h __builtin_msa_bneg_h
+#define __msa_bneg_w __builtin_msa_bneg_w
+#define __msa_bneg_d __builtin_msa_bneg_d
+#define __msa_bnegi_b __builtin_msa_bnegi_b
+#define __msa_bnegi_h __builtin_msa_bnegi_h
+#define __msa_bnegi_w __builtin_msa_bnegi_w
+#define __msa_bnegi_d __builtin_msa_bnegi_d
+#define __msa_binsl_b __builtin_msa_binsl_b
+#define __msa_binsl_h __builtin_msa_binsl_h
+#define __msa_binsl_w __builtin_msa_binsl_w
+#define __msa_binsl_d __builtin_msa_binsl_d
+#define __msa_binsli_b __builtin_msa_binsli_b
+#define __msa_binsli_h __builtin_msa_binsli_h
+#define __msa_binsli_w __builtin_msa_binsli_w
+#define __msa_binsli_d __builtin_msa_binsli_d
+#define __msa_binsr_b __builtin_msa_binsr_b
+#define __msa_binsr_h __builtin_msa_binsr_h
+#define __msa_binsr_w __builtin_msa_binsr_w
+#define __msa_binsr_d __builtin_msa_binsr_d
+#define __msa_binsri_b __builtin_msa_binsri_b
+#define __msa_binsri_h __builtin_msa_binsri_h
+#define __msa_binsri_w __builtin_msa_binsri_w
+#define __msa_binsri_d __builtin_msa_binsri_d
+#define __msa_addv_b __builtin_msa_addv_b
+#define __msa_addv_h __builtin_msa_addv_h
+#define __msa_addv_w __builtin_msa_addv_w
+#define __msa_addv_d __builtin_msa_addv_d
+#define __msa_addvi_b __builtin_msa_addvi_b
+#define __msa_addvi_h __builtin_msa_addvi_h
+#define __msa_addvi_w __builtin_msa_addvi_w
+#define __msa_addvi_d __builtin_msa_addvi_d
+#define __msa_subv_b __builtin_msa_subv_b
+#define __msa_subv_h __builtin_msa_subv_h
+#define __msa_subv_w __builtin_msa_subv_w
+#define __msa_subv_d __builtin_msa_subv_d
+#define __msa_subvi_b __builtin_msa_subvi_b
+#define __msa_subvi_h __builtin_msa_subvi_h
+#define __msa_subvi_w __builtin_msa_subvi_w
+#define __msa_subvi_d __builtin_msa_subvi_d
+#define __msa_max_s_b __builtin_msa_max_s_b
+#define __msa_max_s_h __builtin_msa_max_s_h
+#define __msa_max_s_w __builtin_msa_max_s_w
+#define __msa_max_s_d __builtin_msa_max_s_d
+#define __msa_maxi_s_b __builtin_msa_maxi_s_b
+#define __msa_maxi_s_h __builtin_msa_maxi_s_h
+#define __msa_maxi_s_w __builtin_msa_maxi_s_w
+#define __msa_maxi_s_d __builtin_msa_maxi_s_d
+#define __msa_max_u_b __builtin_msa_max_u_b
+#define __msa_max_u_h __builtin_msa_max_u_h
+#define __msa_max_u_w __builtin_msa_max_u_w
+#define __msa_max_u_d __builtin_msa_max_u_d
+#define __msa_maxi_u_b __builtin_msa_maxi_u_b
+#define __msa_maxi_u_h __builtin_msa_maxi_u_h
+#define __msa_maxi_u_w __builtin_msa_maxi_u_w
+#define __msa_maxi_u_d __builtin_msa_maxi_u_d
+#define __msa_min_s_b __builtin_msa_min_s_b
+#define __msa_min_s_h __builtin_msa_min_s_h
+#define __msa_min_s_w __builtin_msa_min_s_w
+#define __msa_min_s_d __builtin_msa_min_s_d
+#define __msa_mini_s_b __builtin_msa_mini_s_b
+#define __msa_mini_s_h __builtin_msa_mini_s_h
+#define __msa_mini_s_w __builtin_msa_mini_s_w
+#define __msa_mini_s_d __builtin_msa_mini_s_d
+#define __msa_min_u_b __builtin_msa_min_u_b
+#define __msa_min_u_h __builtin_msa_min_u_h
+#define __msa_min_u_w __builtin_msa_min_u_w
+#define __msa_min_u_d __builtin_msa_min_u_d
+#define __msa_mini_u_b __builtin_msa_mini_u_b
+#define __msa_mini_u_h __builtin_msa_mini_u_h
+#define __msa_mini_u_w __builtin_msa_mini_u_w
+#define __msa_mini_u_d __builtin_msa_mini_u_d
+#define __msa_max_a_b __builtin_msa_max_a_b
+#define __msa_max_a_h __builtin_msa_max_a_h
+#define __msa_max_a_w __builtin_msa_max_a_w
+#define __msa_max_a_d __builtin_msa_max_a_d
+#define __msa_min_a_b __builtin_msa_min_a_b
+#define __msa_min_a_h __builtin_msa_min_a_h
+#define __msa_min_a_w __builtin_msa_min_a_w
+#define __msa_min_a_d __builtin_msa_min_a_d
+#define __msa_ceq_b __builtin_msa_ceq_b
+#define __msa_ceq_h __builtin_msa_ceq_h
+#define __msa_ceq_w __builtin_msa_ceq_w
+#define __msa_ceq_d __builtin_msa_ceq_d
+#define __msa_ceqi_b __builtin_msa_ceqi_b
+#define __msa_ceqi_h __builtin_msa_ceqi_h
+#define __msa_ceqi_w __builtin_msa_ceqi_w
+#define __msa_ceqi_d __builtin_msa_ceqi_d
+#define __msa_clt_s_b __builtin_msa_clt_s_b
+#define __msa_clt_s_h __builtin_msa_clt_s_h
+#define __msa_clt_s_w __builtin_msa_clt_s_w
+#define __msa_clt_s_d __builtin_msa_clt_s_d
+#define __msa_clti_s_b __builtin_msa_clti_s_b
+#define __msa_clti_s_h __builtin_msa_clti_s_h
+#define __msa_clti_s_w __builtin_msa_clti_s_w
+#define __msa_clti_s_d __builtin_msa_clti_s_d
+#define __msa_clt_u_b __builtin_msa_clt_u_b
+#define __msa_clt_u_h __builtin_msa_clt_u_h
+#define __msa_clt_u_w __builtin_msa_clt_u_w
+#define __msa_clt_u_d __builtin_msa_clt_u_d
+#define __msa_clti_u_b __builtin_msa_clti_u_b
+#define __msa_clti_u_h __builtin_msa_clti_u_h
+#define __msa_clti_u_w __builtin_msa_clti_u_w
+#define __msa_clti_u_d __builtin_msa_clti_u_d
+#define __msa_cle_s_b __builtin_msa_cle_s_b
+#define __msa_cle_s_h __builtin_msa_cle_s_h
+#define __msa_cle_s_w __builtin_msa_cle_s_w
+#define __msa_cle_s_d __builtin_msa_cle_s_d
+#define __msa_clei_s_b __builtin_msa_clei_s_b
+#define __msa_clei_s_h __builtin_msa_clei_s_h
+#define __msa_clei_s_w __builtin_msa_clei_s_w
+#define __msa_clei_s_d __builtin_msa_clei_s_d
+#define __msa_cle_u_b __builtin_msa_cle_u_b
+#define __msa_cle_u_h __builtin_msa_cle_u_h
+#define __msa_cle_u_w __builtin_msa_cle_u_w
+#define __msa_cle_u_d __builtin_msa_cle_u_d
+#define __msa_clei_u_b __builtin_msa_clei_u_b
+#define __msa_clei_u_h __builtin_msa_clei_u_h
+#define __msa_clei_u_w __builtin_msa_clei_u_w
+#define __msa_clei_u_d __builtin_msa_clei_u_d
+#define __msa_ld_b __builtin_msa_ld_b
+#define __msa_ld_h __builtin_msa_ld_h
+#define __msa_ld_w __builtin_msa_ld_w
+#define __msa_ld_d __builtin_msa_ld_d
+#define __msa_st_b __builtin_msa_st_b
+#define __msa_st_h __builtin_msa_st_h
+#define __msa_st_w __builtin_msa_st_w
+#define __msa_st_d __builtin_msa_st_d
+#define __msa_sat_s_b __builtin_msa_sat_s_b
+#define __msa_sat_s_h __builtin_msa_sat_s_h
+#define __msa_sat_s_w __builtin_msa_sat_s_w
+#define __msa_sat_s_d __builtin_msa_sat_s_d
+#define __msa_sat_u_b __builtin_msa_sat_u_b
+#define __msa_sat_u_h __builtin_msa_sat_u_h
+#define __msa_sat_u_w __builtin_msa_sat_u_w
+#define __msa_sat_u_d __builtin_msa_sat_u_d
+#define __msa_add_a_b __builtin_msa_add_a_b
+#define __msa_add_a_h __builtin_msa_add_a_h
+#define __msa_add_a_w __builtin_msa_add_a_w
+#define __msa_add_a_d __builtin_msa_add_a_d
+#define __msa_adds_a_b __builtin_msa_adds_a_b
+#define __msa_adds_a_h __builtin_msa_adds_a_h
+#define __msa_adds_a_w __builtin_msa_adds_a_w
+#define __msa_adds_a_d __builtin_msa_adds_a_d
+#define __msa_adds_s_b __builtin_msa_adds_s_b
+#define __msa_adds_s_h __builtin_msa_adds_s_h
+#define __msa_adds_s_w __builtin_msa_adds_s_w
+#define __msa_adds_s_d __builtin_msa_adds_s_d
+#define __msa_adds_u_b __builtin_msa_adds_u_b
+#define __msa_adds_u_h __builtin_msa_adds_u_h
+#define __msa_adds_u_w __builtin_msa_adds_u_w
+#define __msa_adds_u_d __builtin_msa_adds_u_d
+#define __msa_ave_s_b __builtin_msa_ave_s_b
+#define __msa_ave_s_h __builtin_msa_ave_s_h
+#define __msa_ave_s_w __builtin_msa_ave_s_w
+#define __msa_ave_s_d __builtin_msa_ave_s_d
+#define __msa_ave_u_b __builtin_msa_ave_u_b
+#define __msa_ave_u_h __builtin_msa_ave_u_h
+#define __msa_ave_u_w __builtin_msa_ave_u_w
+#define __msa_ave_u_d __builtin_msa_ave_u_d
+#define __msa_aver_s_b __builtin_msa_aver_s_b
+#define __msa_aver_s_h __builtin_msa_aver_s_h
+#define __msa_aver_s_w __builtin_msa_aver_s_w
+#define __msa_aver_s_d __builtin_msa_aver_s_d
+#define __msa_aver_u_b __builtin_msa_aver_u_b
+#define __msa_aver_u_h __builtin_msa_aver_u_h
+#define __msa_aver_u_w __builtin_msa_aver_u_w
+#define __msa_aver_u_d __builtin_msa_aver_u_d
+#define __msa_subs_s_b __builtin_msa_subs_s_b
+#define __msa_subs_s_h __builtin_msa_subs_s_h
+#define __msa_subs_s_w __builtin_msa_subs_s_w
+#define __msa_subs_s_d __builtin_msa_subs_s_d
+#define __msa_subs_u_b __builtin_msa_subs_u_b
+#define __msa_subs_u_h __builtin_msa_subs_u_h
+#define __msa_subs_u_w __builtin_msa_subs_u_w
+#define __msa_subs_u_d __builtin_msa_subs_u_d
+#define __msa_subsuu_s_b __builtin_msa_subsuu_s_b
+#define __msa_subsuu_s_h __builtin_msa_subsuu_s_h
+#define __msa_subsuu_s_w __builtin_msa_subsuu_s_w
+#define __msa_subsuu_s_d __builtin_msa_subsuu_s_d
+#define __msa_subsus_u_b __builtin_msa_subsus_u_b
+#define __msa_subsus_u_h __builtin_msa_subsus_u_h
+#define __msa_subsus_u_w __builtin_msa_subsus_u_w
+#define __msa_subsus_u_d __builtin_msa_subsus_u_d
+#define __msa_asub_s_b __builtin_msa_asub_s_b
+#define __msa_asub_s_h __builtin_msa_asub_s_h
+#define __msa_asub_s_w __builtin_msa_asub_s_w
+#define __msa_asub_s_d __builtin_msa_asub_s_d
+#define __msa_asub_u_b __builtin_msa_asub_u_b
+#define __msa_asub_u_h __builtin_msa_asub_u_h
+#define __msa_asub_u_w __builtin_msa_asub_u_w
+#define __msa_asub_u_d __builtin_msa_asub_u_d
+#define __msa_mulv_b __builtin_msa_mulv_b
+#define __msa_mulv_h __builtin_msa_mulv_h
+#define __msa_mulv_w __builtin_msa_mulv_w
+#define __msa_mulv_d __builtin_msa_mulv_d
+#define __msa_maddv_b __builtin_msa_maddv_b
+#define __msa_maddv_h __builtin_msa_maddv_h
+#define __msa_maddv_w __builtin_msa_maddv_w
+#define __msa_maddv_d __builtin_msa_maddv_d
+#define __msa_msubv_b __builtin_msa_msubv_b
+#define __msa_msubv_h __builtin_msa_msubv_h
+#define __msa_msubv_w __builtin_msa_msubv_w
+#define __msa_msubv_d __builtin_msa_msubv_d
+#define __msa_div_s_b __builtin_msa_div_s_b
+#define __msa_div_s_h __builtin_msa_div_s_h
+#define __msa_div_s_w __builtin_msa_div_s_w
+#define __msa_div_s_d __builtin_msa_div_s_d
+#define __msa_div_u_b __builtin_msa_div_u_b
+#define __msa_div_u_h __builtin_msa_div_u_h
+#define __msa_div_u_w __builtin_msa_div_u_w
+#define __msa_div_u_d __builtin_msa_div_u_d
+#define __msa_hadd_s_h __builtin_msa_hadd_s_h
+#define __msa_hadd_s_w __builtin_msa_hadd_s_w
+#define __msa_hadd_s_d __builtin_msa_hadd_s_d
+#define __msa_hadd_u_h __builtin_msa_hadd_u_h
+#define __msa_hadd_u_w __builtin_msa_hadd_u_w
+#define __msa_hadd_u_d __builtin_msa_hadd_u_d
+#define __msa_hsub_s_h __builtin_msa_hsub_s_h
+#define __msa_hsub_s_w __builtin_msa_hsub_s_w
+#define __msa_hsub_s_d __builtin_msa_hsub_s_d
+#define __msa_hsub_u_h __builtin_msa_hsub_u_h
+#define __msa_hsub_u_w __builtin_msa_hsub_u_w
+#define __msa_hsub_u_d __builtin_msa_hsub_u_d
+#define __msa_mod_s_b __builtin_msa_mod_s_b
+#define __msa_mod_s_h __builtin_msa_mod_s_h
+#define __msa_mod_s_w __builtin_msa_mod_s_w
+#define __msa_mod_s_d __builtin_msa_mod_s_d
+#define __msa_mod_u_b __builtin_msa_mod_u_b
+#define __msa_mod_u_h __builtin_msa_mod_u_h
+#define __msa_mod_u_w __builtin_msa_mod_u_w
+#define __msa_mod_u_d __builtin_msa_mod_u_d
+#define __msa_dotp_s_h __builtin_msa_dotp_s_h
+#define __msa_dotp_s_w __builtin_msa_dotp_s_w
+#define __msa_dotp_s_d __builtin_msa_dotp_s_d
+#define __msa_dotp_u_h __builtin_msa_dotp_u_h
+#define __msa_dotp_u_w __builtin_msa_dotp_u_w
+#define __msa_dotp_u_d __builtin_msa_dotp_u_d
+#define __msa_dpadd_s_h __builtin_msa_dpadd_s_h
+#define __msa_dpadd_s_w __builtin_msa_dpadd_s_w
+#define __msa_dpadd_s_d __builtin_msa_dpadd_s_d
+#define __msa_dpadd_u_h __builtin_msa_dpadd_u_h
+#define __msa_dpadd_u_w __builtin_msa_dpadd_u_w
+#define __msa_dpadd_u_d __builtin_msa_dpadd_u_d
+#define __msa_dpsub_s_h __builtin_msa_dpsub_s_h
+#define __msa_dpsub_s_w __builtin_msa_dpsub_s_w
+#define __msa_dpsub_s_d __builtin_msa_dpsub_s_d
+#define __msa_dpsub_u_h __builtin_msa_dpsub_u_h
+#define __msa_dpsub_u_w __builtin_msa_dpsub_u_w
+#define __msa_dpsub_u_d __builtin_msa_dpsub_u_d
+#define __msa_sld_b __builtin_msa_sld_b
+#define __msa_sld_h __builtin_msa_sld_h
+#define __msa_sld_w __builtin_msa_sld_w
+#define __msa_sld_d __builtin_msa_sld_d
+#define __msa_sldi_b __builtin_msa_sldi_b
+#define __msa_sldi_h __builtin_msa_sldi_h
+#define __msa_sldi_w __builtin_msa_sldi_w
+#define __msa_sldi_d __builtin_msa_sldi_d
+#define __msa_splat_b __builtin_msa_splat_b
+#define __msa_splat_h __builtin_msa_splat_h
+#define __msa_splat_w __builtin_msa_splat_w
+#define __msa_splat_d __builtin_msa_splat_d
+#define __msa_splati_b __builtin_msa_splati_b
+#define __msa_splati_h __builtin_msa_splati_h
+#define __msa_splati_w __builtin_msa_splati_w
+#define __msa_splati_d __builtin_msa_splati_d
+#define __msa_pckev_b __builtin_msa_pckev_b
+#define __msa_pckev_h __builtin_msa_pckev_h
+#define __msa_pckev_w __builtin_msa_pckev_w
+#define __msa_pckev_d __builtin_msa_pckev_d
+#define __msa_pckod_b __builtin_msa_pckod_b
+#define __msa_pckod_h __builtin_msa_pckod_h
+#define __msa_pckod_w __builtin_msa_pckod_w
+#define __msa_pckod_d __builtin_msa_pckod_d
+#define __msa_ilvl_b __builtin_msa_ilvl_b
+#define __msa_ilvl_h __builtin_msa_ilvl_h
+#define __msa_ilvl_w __builtin_msa_ilvl_w
+#define __msa_ilvl_d __builtin_msa_ilvl_d
+#define __msa_ilvr_b __builtin_msa_ilvr_b
+#define __msa_ilvr_h __builtin_msa_ilvr_h
+#define __msa_ilvr_w __builtin_msa_ilvr_w
+#define __msa_ilvr_d __builtin_msa_ilvr_d
+#define __msa_ilvev_b __builtin_msa_ilvev_b
+#define __msa_ilvev_h __builtin_msa_ilvev_h
+#define __msa_ilvev_w __builtin_msa_ilvev_w
+#define __msa_ilvev_d __builtin_msa_ilvev_d
+#define __msa_ilvod_b __builtin_msa_ilvod_b
+#define __msa_ilvod_h __builtin_msa_ilvod_h
+#define __msa_ilvod_w __builtin_msa_ilvod_w
+#define __msa_ilvod_d __builtin_msa_ilvod_d
+#define __msa_vshf_b __builtin_msa_vshf_b
+#define __msa_vshf_h __builtin_msa_vshf_h
+#define __msa_vshf_w __builtin_msa_vshf_w
+#define __msa_vshf_d __builtin_msa_vshf_d
+#define __msa_and_v __builtin_msa_and_v
+#define __msa_andi_b __builtin_msa_andi_b
+#define __msa_or_v __builtin_msa_or_v
+#define __msa_ori_b __builtin_msa_ori_b
+#define __msa_nor_v __builtin_msa_nor_v
+#define __msa_nori_b __builtin_msa_nori_b
+#define __msa_xor_v __builtin_msa_xor_v
+#define __msa_xori_b __builtin_msa_xori_b
+#define __msa_bmnz_v __builtin_msa_bmnz_v
+#define __msa_bmnzi_b __builtin_msa_bmnzi_b
+#define __msa_bmz_v __builtin_msa_bmz_v
+#define __msa_bmzi_b __builtin_msa_bmzi_b
+#define __msa_bsel_v __builtin_msa_bsel_v
+#define __msa_bseli_b __builtin_msa_bseli_b
+#define __msa_shf_b __builtin_msa_shf_b
+#define __msa_shf_h __builtin_msa_shf_h
+#define __msa_shf_w __builtin_msa_shf_w
+#define __msa_test_bnz_v __builtin_msa_bnz_v
+#define __msa_test_bz_v __builtin_msa_bz_v
+#define __msa_fill_b __builtin_msa_fill_b
+#define __msa_fill_h __builtin_msa_fill_h
+#define __msa_fill_w __builtin_msa_fill_w
+#define __msa_fill_d __builtin_msa_fill_d
+#define __msa_pcnt_b __builtin_msa_pcnt_b
+#define __msa_pcnt_h __builtin_msa_pcnt_h
+#define __msa_pcnt_w __builtin_msa_pcnt_w
+#define __msa_pcnt_d __builtin_msa_pcnt_d
+#define __msa_nloc_b __builtin_msa_nloc_b
+#define __msa_nloc_h __builtin_msa_nloc_h
+#define __msa_nloc_w __builtin_msa_nloc_w
+#define __msa_nloc_d __builtin_msa_nloc_d
+#define __msa_nlzc_b __builtin_msa_nlzc_b
+#define __msa_nlzc_h __builtin_msa_nlzc_h
+#define __msa_nlzc_w __builtin_msa_nlzc_w
+#define __msa_nlzc_d __builtin_msa_nlzc_d
+#define __msa_copy_s_b __builtin_msa_copy_s_b
+#define __msa_copy_s_h __builtin_msa_copy_s_h
+#define __msa_copy_s_w __builtin_msa_copy_s_w
+#define __msa_copy_s_d __builtin_msa_copy_s_d
+#define __msa_copy_u_b __builtin_msa_copy_u_b
+#define __msa_copy_u_h __builtin_msa_copy_u_h
+#define __msa_copy_u_w __builtin_msa_copy_u_w
+#define __msa_copy_u_d __builtin_msa_copy_u_d
+#define __msa_insert_b __builtin_msa_insert_b
+#define __msa_insert_h __builtin_msa_insert_h
+#define __msa_insert_w __builtin_msa_insert_w
+#define __msa_insert_d __builtin_msa_insert_d
+#define __msa_insve_b __builtin_msa_insve_b
+#define __msa_insve_h __builtin_msa_insve_h
+#define __msa_insve_w __builtin_msa_insve_w
+#define __msa_insve_d __builtin_msa_insve_d
+#define __msa_test_bnz_b __builtin_msa_bnz_b
+#define __msa_test_bnz_h __builtin_msa_bnz_h
+#define __msa_test_bnz_w __builtin_msa_bnz_w
+#define __msa_test_bnz_d __builtin_msa_bnz_d
+#define __msa_test_bz_b __builtin_msa_bz_b
+#define __msa_test_bz_h __builtin_msa_bz_h
+#define __msa_test_bz_w __builtin_msa_bz_w
+#define __msa_test_bz_d __builtin_msa_bz_d
+#define __msa_ldi_b __builtin_msa_ldi_b
+#define __msa_ldi_h __builtin_msa_ldi_h
+#define __msa_ldi_w __builtin_msa_ldi_w
+#define __msa_ldi_d __builtin_msa_ldi_d
+#define __msa_fcaf_w __builtin_msa_fcaf_w
+#define __msa_fcaf_d __builtin_msa_fcaf_d
+#define __msa_fcor_w __builtin_msa_fcor_w
+#define __msa_fcor_d __builtin_msa_fcor_d
+#define __msa_fcun_w __builtin_msa_fcun_w
+#define __msa_fcun_d __builtin_msa_fcun_d
+#define __msa_fcune_w __builtin_msa_fcune_w
+#define __msa_fcune_d __builtin_msa_fcune_d
+#define __msa_fcueq_w __builtin_msa_fcueq_w
+#define __msa_fcueq_d __builtin_msa_fcueq_d
+#define __msa_fceq_w __builtin_msa_fceq_w
+#define __msa_fceq_d __builtin_msa_fceq_d
+#define __msa_fcne_w __builtin_msa_fcne_w
+#define __msa_fcne_d __builtin_msa_fcne_d
+#define __msa_fclt_w __builtin_msa_fclt_w
+#define __msa_fclt_d __builtin_msa_fclt_d
+#define __msa_fcult_w __builtin_msa_fcult_w
+#define __msa_fcult_d __builtin_msa_fcult_d
+#define __msa_fcle_w __builtin_msa_fcle_w
+#define __msa_fcle_d __builtin_msa_fcle_d
+#define __msa_fcule_w __builtin_msa_fcule_w
+#define __msa_fcule_d __builtin_msa_fcule_d
+#define __msa_fsaf_w __builtin_msa_fsaf_w
+#define __msa_fsaf_d __builtin_msa_fsaf_d
+#define __msa_fsor_w __builtin_msa_fsor_w
+#define __msa_fsor_d __builtin_msa_fsor_d
+#define __msa_fsun_w __builtin_msa_fsun_w
+#define __msa_fsun_d __builtin_msa_fsun_d
+#define __msa_fsune_w __builtin_msa_fsune_w
+#define __msa_fsune_d __builtin_msa_fsune_d
+#define __msa_fsueq_w __builtin_msa_fsueq_w
+#define __msa_fsueq_d __builtin_msa_fsueq_d
+#define __msa_fseq_w __builtin_msa_fseq_w
+#define __msa_fseq_d __builtin_msa_fseq_d
+#define __msa_fsne_w __builtin_msa_fsne_w
+#define __msa_fsne_d __builtin_msa_fsne_d
+#define __msa_fslt_w __builtin_msa_fslt_w
+#define __msa_fslt_d __builtin_msa_fslt_d
+#define __msa_fsult_w __builtin_msa_fsult_w
+#define __msa_fsult_d __builtin_msa_fsult_d
+#define __msa_fsle_w __builtin_msa_fsle_w
+#define __msa_fsle_d __builtin_msa_fsle_d
+#define __msa_fsule_w __builtin_msa_fsule_w
+#define __msa_fsule_d __builtin_msa_fsule_d
+#define __msa_fadd_w __builtin_msa_fadd_w
+#define __msa_fadd_d __builtin_msa_fadd_d
+#define __msa_fsub_w __builtin_msa_fsub_w
+#define __msa_fsub_d __builtin_msa_fsub_d
+#define __msa_fmul_w __builtin_msa_fmul_w
+#define __msa_fmul_d __builtin_msa_fmul_d
+#define __msa_fdiv_w __builtin_msa_fdiv_w
+#define __msa_fdiv_d __builtin_msa_fdiv_d
+#define __msa_fmadd_w __builtin_msa_fmadd_w
+#define __msa_fmadd_d __builtin_msa_fmadd_d
+#define __msa_fmsub_w __builtin_msa_fmsub_w
+#define __msa_fmsub_d __builtin_msa_fmsub_d
+#define __msa_fexp2_w __builtin_msa_fexp2_w
+#define __msa_fexp2_d __builtin_msa_fexp2_d
+#define __msa_fexdo_h __builtin_msa_fexdo_h
+#define __msa_fexdo_w __builtin_msa_fexdo_w
+#define __msa_ftq_h __builtin_msa_ftq_h
+#define __msa_ftq_w __builtin_msa_ftq_w
+#define __msa_fmin_w __builtin_msa_fmin_w
+#define __msa_fmin_d __builtin_msa_fmin_d
+#define __msa_fmin_a_w __builtin_msa_fmin_a_w
+#define __msa_fmin_a_d __builtin_msa_fmin_a_d
+#define __msa_fmax_w __builtin_msa_fmax_w
+#define __msa_fmax_d __builtin_msa_fmax_d
+#define __msa_fmax_a_w __builtin_msa_fmax_a_w
+#define __msa_fmax_a_d __builtin_msa_fmax_a_d
+#define __msa_mul_q_h __builtin_msa_mul_q_h
+#define __msa_mul_q_w __builtin_msa_mul_q_w
+#define __msa_mulr_q_h __builtin_msa_mulr_q_h
+#define __msa_mulr_q_w __builtin_msa_mulr_q_w
+#define __msa_madd_q_h __builtin_msa_madd_q_h
+#define __msa_madd_q_w __builtin_msa_madd_q_w
+#define __msa_maddr_q_h __builtin_msa_maddr_q_h
+#define __msa_maddr_q_w __builtin_msa_maddr_q_w
+#define __msa_msub_q_h __builtin_msa_msub_q_h
+#define __msa_msub_q_w __builtin_msa_msub_q_w
+#define __msa_msubr_q_h __builtin_msa_msubr_q_h
+#define __msa_msubr_q_w __builtin_msa_msubr_q_w
+#define __msa_fclass_w __builtin_msa_fclass_w
+#define __msa_fclass_d __builtin_msa_fclass_d
+#define __msa_fsqrt_w __builtin_msa_fsqrt_w
+#define __msa_fsqrt_d __builtin_msa_fsqrt_d
+#define __msa_frcp_w __builtin_msa_frcp_w
+#define __msa_frcp_d __builtin_msa_frcp_d
+#define __msa_frint_w __builtin_msa_frint_w
+#define __msa_frint_d __builtin_msa_frint_d
+#define __msa_frsqrt_w __builtin_msa_frsqrt_w
+#define __msa_frsqrt_d __builtin_msa_frsqrt_d
+#define __msa_flog2_w __builtin_msa_flog2_w
+#define __msa_flog2_d __builtin_msa_flog2_d
+#define __msa_fexupl_w __builtin_msa_fexupl_w
+#define __msa_fexupl_d __builtin_msa_fexupl_d
+#define __msa_fexupr_w __builtin_msa_fexupr_w
+#define __msa_fexupr_d __builtin_msa_fexupr_d
+#define __msa_ffql_w __builtin_msa_ffql_w
+#define __msa_ffql_d __builtin_msa_ffql_d
+#define __msa_ffqr_w __builtin_msa_ffqr_w
+#define __msa_ffqr_d __builtin_msa_ffqr_d
+#define __msa_ftint_s_w __builtin_msa_ftint_s_w
+#define __msa_ftint_s_d __builtin_msa_ftint_s_d
+#define __msa_ftint_u_w __builtin_msa_ftint_u_w
+#define __msa_ftint_u_d __builtin_msa_ftint_u_d
+#define __msa_ftrunc_s_w __builtin_msa_ftrunc_s_w
+#define __msa_ftrunc_s_d __builtin_msa_ftrunc_s_d
+#define __msa_ftrunc_u_w __builtin_msa_ftrunc_u_w
+#define __msa_ftrunc_u_d __builtin_msa_ftrunc_u_d
+#define __msa_ffint_s_w __builtin_msa_ffint_s_w
+#define __msa_ffint_s_d __builtin_msa_ffint_s_d
+#define __msa_ffint_u_w __builtin_msa_ffint_u_w
+#define __msa_ffint_u_d __builtin_msa_ffint_u_d
+#define __msa_cfcmsa __builtin_msa_cfcmsa
+#define __msa_move_v __builtin_msa_move_v
+#define __msa_cast_to_vector_float __builtin_msa_cast_to_vector_float
+#define __msa_cast_to_vector_double __builtin_msa_cast_to_vector_double
+#define __msa_cast_to_scalar_float __builtin_msa_cast_to_scalar_float
+#define __msa_cast_to_scalar_double __builtin_msa_cast_to_scalar_double
+#endif /* defined(__mips_msa) */
+#endif /* _MSA_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mwaitxintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mwaitxintrin.h
new file mode 100644
index 000000000..635f2ac6c
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/mwaitxintrin.h
@@ -0,0 +1,47 @@
+/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _MWAITXINTRIN_H
+#define _MWAITXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitorx(void const * __p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitorx((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
+{
+  __builtin_ia32_mwaitx(__extensions, __hints, __clock);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _MWAITXINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/nmmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/nmmintrin.h
new file mode 100644
index 000000000..57fec1596
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/nmmintrin.h
@@ -0,0 +1,30 @@
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _NMMINTRIN_H
+#define _NMMINTRIN_H
+
+/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
+   just include it now then.  */
+#include <smmintrin.h>
+#endif /* _NMMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/opencl-c.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/opencl-c.h
new file mode 100644
index 000000000..0c25d3127
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/opencl-c.h
@@ -0,0 +1,17055 @@
+//===--- opencl-c.h - OpenCL C language builtin function header -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OPENCL_H_
+#define _OPENCL_H_
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#ifndef cl_khr_depth_images
+#define cl_khr_depth_images
+#endif //cl_khr_depth_images
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#define __ovld __attribute__((overloadable))
+#define __conv __attribute__((convergent))
+
+// Optimizations
+#define __purefn __attribute__((pure))
+#define __cnfn __attribute__((const))
+
+// built-in scalar data types:
+
+/**
+ * An unsigned 8-bit integer.
+ */
+typedef unsigned char uchar;
+
+/**
+ * An unsigned 16-bit integer.
+ */
+typedef unsigned short ushort;
+
+/**
+ * An unsigned 32-bit integer.
+ */
+typedef unsigned int uint;
+
+/**
+ * An unsigned 64-bit integer.
+ */
+typedef unsigned long ulong;
+
+/**
+ * The unsigned integer type of the result of the sizeof operator. This
+ * is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS
+ * defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if
+ * CL_DEVICE_ADDRESS_BITS is 64-bits.
+ */
+typedef __SIZE_TYPE__ size_t;
+
+/**
+ * A signed integer type that is the result of subtracting two pointers.
+ * This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS
+ * defined in table 4.3 is 32-bits and is a 64-bit signed integer if
+ * CL_DEVICE_ADDRESS_BITS is 64-bits.
+ */
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+
+/**
+* A signed integer type with the property that any valid pointer to
+* void can be converted to this type, then converted back to pointer
+* to void, and the result will compare equal to the original pointer.
+*/
+typedef __INTPTR_TYPE__ intptr_t;
+
+/**
+* An unsigned integer type with the property that any valid pointer to
+* void can be converted to this type, then converted back to pointer
+* to void, and the result will compare equal to the original pointer.
+*/
+typedef __UINTPTR_TYPE__ uintptr_t;
+
+// built-in vector data types:
+typedef char char2 __attribute__((ext_vector_type(2)));
+typedef char char3 __attribute__((ext_vector_type(3)));
+typedef char char4 __attribute__((ext_vector_type(4)));
+typedef char char8 __attribute__((ext_vector_type(8)));
+typedef char char16 __attribute__((ext_vector_type(16)));
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+typedef uchar uchar8 __attribute__((ext_vector_type(8)));
+typedef uchar uchar16 __attribute__((ext_vector_type(16)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef short short8 __attribute__((ext_vector_type(8)));
+typedef short short16 __attribute__((ext_vector_type(16)));
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+typedef ushort ushort8 __attribute__((ext_vector_type(8)));
+typedef ushort ushort16 __attribute__((ext_vector_type(16)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int int8 __attribute__((ext_vector_type(8)));
+typedef int int16 __attribute__((ext_vector_type(16)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef uint uint8 __attribute__((ext_vector_type(8)));
+typedef uint uint16 __attribute__((ext_vector_type(16)));
+typedef long long2 __attribute__((ext_vector_type(2)));
+typedef long long3 __attribute__((ext_vector_type(3)));
+typedef long long4 __attribute__((ext_vector_type(4)));
+typedef long long8 __attribute__((ext_vector_type(8)));
+typedef long long16 __attribute__((ext_vector_type(16)));
+typedef ulong ulong2 __attribute__((ext_vector_type(2)));
+typedef ulong ulong3 __attribute__((ext_vector_type(3)));
+typedef ulong ulong4 __attribute__((ext_vector_type(4)));
+typedef ulong ulong8 __attribute__((ext_vector_type(8)));
+typedef ulong ulong16 __attribute__((ext_vector_type(16)));
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef float float8 __attribute__((ext_vector_type(8)));
+typedef float float16 __attribute__((ext_vector_type(16)));
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+typedef half half2 __attribute__((ext_vector_type(2)));
+typedef half half3 __attribute__((ext_vector_type(3)));
+typedef half half4 __attribute__((ext_vector_type(4)));
+typedef half half8 __attribute__((ext_vector_type(8)));
+typedef half half16 __attribute__((ext_vector_type(16)));
+#endif
+#ifdef cl_khr_fp64
+#if __OPENCL_C_VERSION__ < CL_VERSION_1_2
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+typedef double double2 __attribute__((ext_vector_type(2)));
+typedef double double3 __attribute__((ext_vector_type(3)));
+typedef double double4 __attribute__((ext_vector_type(4)));
+typedef double double8 __attribute__((ext_vector_type(8)));
+typedef double double16 __attribute__((ext_vector_type(16)));
+#endif
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#define NULL ((void*)0)
+#endif
+
+/**
+ * Value of maximum non-infinite single-precision floating-point
+ * number.
+ */
+#define MAXFLOAT 0x1.fffffep127f
+
+/**
+ * A positive float constant expression. HUGE_VALF evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VALF (__builtin_huge_valf())
+
+/**
+ * A positive double constant expression. HUGE_VAL evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VAL (__builtin_huge_val())
+
+/**
+ * A constant expression of type float representing positive or
+ * unsigned infinity.
+ */
+#define INFINITY (__builtin_inff())
+
+/**
+ * A constant expression of type float representing a quiet NaN.
+ */
+#define NAN as_float(INT_MAX)
+
+#define FP_ILOGB0    INT_MIN
+#define FP_ILOGBNAN    INT_MAX
+
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+
+#define M_E_F         2.71828182845904523536028747135266250f
+#define M_LOG2E_F     1.44269504088896340735992468100189214f
+#define M_LOG10E_F    0.434294481903251827651128918916605082f
+#define M_LN2_F       0.693147180559945309417232121458176568f
+#define M_LN10_F      2.30258509299404568401799145468436421f
+#define M_PI_F        3.14159265358979323846264338327950288f
+#define M_PI_2_F      1.57079632679489661923132169163975144f
+#define M_PI_4_F      0.785398163397448309615660845819875721f
+#define M_1_PI_F      0.318309886183790671537767526745028724f
+#define M_2_PI_F      0.636619772367581343075535053490057448f
+#define M_2_SQRTPI_F  1.12837916709551257389615890312154517f
+#define M_SQRT2_F     1.41421356237309504880168872420969808f
+#define M_SQRT1_2_F   0.707106781186547524400844362104849039f
+
+#define DBL_DIG 15
+#define DBL_MANT_DIG 53
+#define DBL_MAX_10_EXP +308
+#define DBL_MAX_EXP +1024
+#define DBL_MIN_10_EXP -307
+#define DBL_MIN_EXP -1021
+#define DBL_RADIX 2
+#define DBL_MAX 0x1.fffffffffffffp1023
+#define DBL_MIN 0x1.0p-1022
+#define DBL_EPSILON 0x1.0p-52
+
+#define M_E           0x1.5bf0a8b145769p+1
+#define M_LOG2E       0x1.71547652b82fep+0
+#define M_LOG10E      0x1.bcb7b1526e50ep-2
+#define M_LN2         0x1.62e42fefa39efp-1
+#define M_LN10        0x1.26bb1bbb55516p+1
+#define M_PI          0x1.921fb54442d18p+1
+#define M_PI_2        0x1.921fb54442d18p+0
+#define M_PI_4        0x1.921fb54442d18p-1
+#define M_1_PI        0x1.45f306dc9c883p-2
+#define M_2_PI        0x1.45f306dc9c883p-1
+#define M_2_SQRTPI    0x1.20dd750429b6dp+0
+#define M_SQRT2       0x1.6a09e667f3bcdp+0
+#define M_SQRT1_2     0x1.6a09e667f3bcdp-1
+
+#ifdef cl_khr_fp16
+
+#define HALF_DIG 3
+#define HALF_MANT_DIG 11
+#define HALF_MAX_10_EXP +4
+#define HALF_MAX_EXP +16
+#define HALF_MIN_10_EXP -4
+#define HALF_MIN_EXP -13
+#define HALF_RADIX 2
+#define HALF_MAX ((0x1.ffcp15h))
+#define HALF_MIN ((0x1.0p-14h))
+#define HALF_EPSILON ((0x1.0p-10h))
+
+#define M_E_H         2.71828182845904523536028747135266250h
+#define M_LOG2E_H     1.44269504088896340735992468100189214h
+#define M_LOG10E_H    0.434294481903251827651128918916605082h
+#define M_LN2_H       0.693147180559945309417232121458176568h
+#define M_LN10_H      2.30258509299404568401799145468436421h
+#define M_PI_H        3.14159265358979323846264338327950288h
+#define M_PI_2_H      1.57079632679489661923132169163975144h
+#define M_PI_4_H      0.785398163397448309615660845819875721h
+#define M_1_PI_H      0.318309886183790671537767526745028724h
+#define M_2_PI_H      0.636619772367581343075535053490057448h
+#define M_2_SQRTPI_H  1.12837916709551257389615890312154517h
+#define M_SQRT2_H     1.41421356237309504880168872420969808h
+#define M_SQRT1_2_H   0.707106781186547524400844362104849039h
+
+#endif //cl_khr_fp16
+
+#define CHAR_BIT    8
+#define SCHAR_MAX  127
+#define SCHAR_MIN  (-128)
+#define UCHAR_MAX  255
+#define CHAR_MAX  SCHAR_MAX
+#define CHAR_MIN  SCHAR_MIN
+#define USHRT_MAX  65535
+#define SHRT_MAX  32767
+#define SHRT_MIN  (-32768)
+#define UINT_MAX  0xffffffff
+#define INT_MAX    2147483647
+#define INT_MIN    (-2147483647-1)
+#define ULONG_MAX  0xffffffffffffffffUL
+#define LONG_MAX  0x7fffffffffffffffL
+#define LONG_MIN  (-0x7fffffffffffffffL-1)
+
+// OpenCL v1.1/1.2/2.0 s6.2.3 - Explicit conversions
+
+char __ovld __cnfn convert_char_rte(char);
+char __ovld __cnfn convert_char_sat_rte(char);
+char __ovld __cnfn convert_char_rtz(char);
+char __ovld __cnfn convert_char_sat_rtz(char);
+char __ovld __cnfn convert_char_rtp(char);
+char __ovld __cnfn convert_char_sat_rtp(char);
+char __ovld __cnfn convert_char_rtn(char);
+char __ovld __cnfn convert_char_sat_rtn(char);
+char __ovld __cnfn convert_char(char);
+char __ovld __cnfn convert_char_sat(char);
+char __ovld __cnfn convert_char_rte(uchar);
+char __ovld __cnfn convert_char_sat_rte(uchar);
+char __ovld __cnfn convert_char_rtz(uchar);
+char __ovld __cnfn convert_char_sat_rtz(uchar);
+char __ovld __cnfn convert_char_rtp(uchar);
+char __ovld __cnfn convert_char_sat_rtp(uchar);
+char __ovld __cnfn convert_char_rtn(uchar);
+char __ovld __cnfn convert_char_sat_rtn(uchar);
+char __ovld __cnfn convert_char(uchar);
+char __ovld __cnfn convert_char_sat(uchar);
+char __ovld __cnfn convert_char_rte(short);
+char __ovld __cnfn convert_char_sat_rte(short);
+char __ovld __cnfn convert_char_rtz(short);
+char __ovld __cnfn convert_char_sat_rtz(short);
+char __ovld __cnfn convert_char_rtp(short);
+char __ovld __cnfn convert_char_sat_rtp(short);
+char __ovld __cnfn convert_char_rtn(short);
+char __ovld __cnfn convert_char_sat_rtn(short);
+char __ovld __cnfn convert_char(short);
+char __ovld __cnfn convert_char_sat(short);
+char __ovld __cnfn convert_char_rte(ushort);
+char __ovld __cnfn convert_char_sat_rte(ushort);
+char __ovld __cnfn convert_char_rtz(ushort);
+char __ovld __cnfn convert_char_sat_rtz(ushort);
+char __ovld __cnfn convert_char_rtp(ushort);
+char __ovld __cnfn convert_char_sat_rtp(ushort);
+char __ovld __cnfn convert_char_rtn(ushort);
+char __ovld __cnfn convert_char_sat_rtn(ushort);
+char __ovld __cnfn convert_char(ushort);
+char __ovld __cnfn convert_char_sat(ushort);
+char __ovld __cnfn convert_char_rte(int);
+char __ovld __cnfn convert_char_sat_rte(int);
+char __ovld __cnfn convert_char_rtz(int);
+char __ovld __cnfn convert_char_sat_rtz(int);
+char __ovld __cnfn convert_char_rtp(int);
+char __ovld __cnfn convert_char_sat_rtp(int);
+char __ovld __cnfn convert_char_rtn(int);
+char __ovld __cnfn convert_char_sat_rtn(int);
+char __ovld __cnfn convert_char(int);
+char __ovld __cnfn convert_char_sat(int);
+char __ovld __cnfn convert_char_rte(uint);
+char __ovld __cnfn convert_char_sat_rte(uint);
+char __ovld __cnfn convert_char_rtz(uint);
+char __ovld __cnfn convert_char_sat_rtz(uint);
+char __ovld __cnfn convert_char_rtp(uint);
+char __ovld __cnfn convert_char_sat_rtp(uint);
+char __ovld __cnfn convert_char_rtn(uint);
+char __ovld __cnfn convert_char_sat_rtn(uint);
+char __ovld __cnfn convert_char(uint);
+char __ovld __cnfn convert_char_sat(uint);
+char __ovld __cnfn convert_char_rte(long);
+char __ovld __cnfn convert_char_sat_rte(long);
+char __ovld __cnfn convert_char_rtz(long);
+char __ovld __cnfn convert_char_sat_rtz(long);
+char __ovld __cnfn convert_char_rtp(long);
+char __ovld __cnfn convert_char_sat_rtp(long);
+char __ovld __cnfn convert_char_rtn(long);
+char __ovld __cnfn convert_char_sat_rtn(long);
+char __ovld __cnfn convert_char(long);
+char __ovld __cnfn convert_char_sat(long);
+char __ovld __cnfn convert_char_rte(ulong);
+char __ovld __cnfn convert_char_sat_rte(ulong);
+char __ovld __cnfn convert_char_rtz(ulong);
+char __ovld __cnfn convert_char_sat_rtz(ulong);
+char __ovld __cnfn convert_char_rtp(ulong);
+char __ovld __cnfn convert_char_sat_rtp(ulong);
+char __ovld __cnfn convert_char_rtn(ulong);
+char __ovld __cnfn convert_char_sat_rtn(ulong);
+char __ovld __cnfn convert_char(ulong);
+char __ovld __cnfn convert_char_sat(ulong);
+char __ovld __cnfn convert_char_rte(float);
+char __ovld __cnfn convert_char_sat_rte(float);
+char __ovld __cnfn convert_char_rtz(float);
+char __ovld __cnfn convert_char_sat_rtz(float);
+char __ovld __cnfn convert_char_rtp(float);
+char __ovld __cnfn convert_char_sat_rtp(float);
+char __ovld __cnfn convert_char_rtn(float);
+char __ovld __cnfn convert_char_sat_rtn(float);
+char __ovld __cnfn convert_char(float);
+char __ovld __cnfn convert_char_sat(float);
+uchar __ovld __cnfn convert_uchar_rte(char);
+uchar __ovld __cnfn convert_uchar_sat_rte(char);
+uchar __ovld __cnfn convert_uchar_rtz(char);
+uchar __ovld __cnfn convert_uchar_sat_rtz(char);
+uchar __ovld __cnfn convert_uchar_rtp(char);
+uchar __ovld __cnfn convert_uchar_sat_rtp(char);
+uchar __ovld __cnfn convert_uchar_rtn(char);
+uchar __ovld __cnfn convert_uchar_sat_rtn(char);
+uchar __ovld __cnfn convert_uchar(char);
+uchar __ovld __cnfn convert_uchar_sat(char);
+uchar __ovld __cnfn convert_uchar_rte(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rte(uchar);
+uchar __ovld __cnfn convert_uchar_rtz(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rtz(uchar);
+uchar __ovld __cnfn convert_uchar_rtp(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rtp(uchar);
+uchar __ovld __cnfn convert_uchar_rtn(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rtn(uchar);
+uchar __ovld __cnfn convert_uchar(uchar);
+uchar __ovld __cnfn convert_uchar_sat(uchar);
+uchar __ovld __cnfn convert_uchar_rte(short);
+uchar __ovld __cnfn convert_uchar_sat_rte(short);
+uchar __ovld __cnfn convert_uchar_rtz(short);
+uchar __ovld __cnfn convert_uchar_sat_rtz(short);
+uchar __ovld __cnfn convert_uchar_rtp(short);
+uchar __ovld __cnfn convert_uchar_sat_rtp(short);
+uchar __ovld __cnfn convert_uchar_rtn(short);
+uchar __ovld __cnfn convert_uchar_sat_rtn(short);
+uchar __ovld __cnfn convert_uchar(short);
+uchar __ovld __cnfn convert_uchar_sat(short);
+uchar __ovld __cnfn convert_uchar_rte(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rte(ushort);
+uchar __ovld __cnfn convert_uchar_rtz(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rtz(ushort);
+uchar __ovld __cnfn convert_uchar_rtp(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rtp(ushort);
+uchar __ovld __cnfn convert_uchar_rtn(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rtn(ushort);
+uchar __ovld __cnfn convert_uchar(ushort);
+uchar __ovld __cnfn convert_uchar_sat(ushort);
+uchar __ovld __cnfn convert_uchar_rte(int);
+uchar __ovld __cnfn convert_uchar_sat_rte(int);
+uchar __ovld __cnfn convert_uchar_rtz(int);
+uchar __ovld __cnfn convert_uchar_sat_rtz(int);
+uchar __ovld __cnfn convert_uchar_rtp(int);
+uchar __ovld __cnfn convert_uchar_sat_rtp(int);
+uchar __ovld __cnfn convert_uchar_rtn(int);
+uchar __ovld __cnfn convert_uchar_sat_rtn(int);
+uchar __ovld __cnfn convert_uchar(int);
+uchar __ovld __cnfn convert_uchar_sat(int);
+uchar __ovld __cnfn convert_uchar_rte(uint);
+uchar __ovld __cnfn convert_uchar_sat_rte(uint);
+uchar __ovld __cnfn convert_uchar_rtz(uint);
+uchar __ovld __cnfn convert_uchar_sat_rtz(uint);
+uchar __ovld __cnfn convert_uchar_rtp(uint);
+uchar __ovld __cnfn convert_uchar_sat_rtp(uint);
+uchar __ovld __cnfn convert_uchar_rtn(uint);
+uchar __ovld __cnfn convert_uchar_sat_rtn(uint);
+uchar __ovld __cnfn convert_uchar(uint);
+uchar __ovld __cnfn convert_uchar_sat(uint);
+uchar __ovld __cnfn convert_uchar_rte(long);
+uchar __ovld __cnfn convert_uchar_sat_rte(long);
+uchar __ovld __cnfn convert_uchar_rtz(long);
+uchar __ovld __cnfn convert_uchar_sat_rtz(long);
+uchar __ovld __cnfn convert_uchar_rtp(long);
+uchar __ovld __cnfn convert_uchar_sat_rtp(long);
+uchar __ovld __cnfn convert_uchar_rtn(long);
+uchar __ovld __cnfn convert_uchar_sat_rtn(long);
+uchar __ovld __cnfn convert_uchar(long);
+uchar __ovld __cnfn convert_uchar_sat(long);
+uchar __ovld __cnfn convert_uchar_rte(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rte(ulong);
+uchar __ovld __cnfn convert_uchar_rtz(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rtz(ulong);
+uchar __ovld __cnfn convert_uchar_rtp(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rtp(ulong);
+uchar __ovld __cnfn convert_uchar_rtn(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rtn(ulong);
+uchar __ovld __cnfn convert_uchar(ulong);
+uchar __ovld __cnfn convert_uchar_sat(ulong);
+uchar __ovld __cnfn convert_uchar_rte(float);
+uchar __ovld __cnfn convert_uchar_sat_rte(float);
+uchar __ovld __cnfn convert_uchar_rtz(float);
+uchar __ovld __cnfn convert_uchar_sat_rtz(float);
+uchar __ovld __cnfn convert_uchar_rtp(float);
+uchar __ovld __cnfn convert_uchar_sat_rtp(float);
+uchar __ovld __cnfn convert_uchar_rtn(float);
+uchar __ovld __cnfn convert_uchar_sat_rtn(float);
+uchar __ovld __cnfn convert_uchar(float);
+uchar __ovld __cnfn convert_uchar_sat(float);
+
+short __ovld __cnfn convert_short_rte(char);
+short __ovld __cnfn convert_short_sat_rte(char);
+short __ovld __cnfn convert_short_rtz(char);
+short __ovld __cnfn convert_short_sat_rtz(char);
+short __ovld __cnfn convert_short_rtp(char);
+short __ovld __cnfn convert_short_sat_rtp(char);
+short __ovld __cnfn convert_short_rtn(char);
+short __ovld __cnfn convert_short_sat_rtn(char);
+short __ovld __cnfn convert_short(char);
+short __ovld __cnfn convert_short_sat(char);
+short __ovld __cnfn convert_short_rte(uchar);
+short __ovld __cnfn convert_short_sat_rte(uchar);
+short __ovld __cnfn convert_short_rtz(uchar);
+short __ovld __cnfn convert_short_sat_rtz(uchar);
+short __ovld __cnfn convert_short_rtp(uchar);
+short __ovld __cnfn convert_short_sat_rtp(uchar);
+short __ovld __cnfn convert_short_rtn(uchar);
+short __ovld __cnfn convert_short_sat_rtn(uchar);
+short __ovld __cnfn convert_short(uchar);
+short __ovld __cnfn convert_short_sat(uchar);
+short __ovld __cnfn convert_short_rte(short);
+short __ovld __cnfn convert_short_sat_rte(short);
+short __ovld __cnfn convert_short_rtz(short);
+short __ovld __cnfn convert_short_sat_rtz(short);
+short __ovld __cnfn convert_short_rtp(short);
+short __ovld __cnfn convert_short_sat_rtp(short);
+short __ovld __cnfn convert_short_rtn(short);
+short __ovld __cnfn convert_short_sat_rtn(short);
+short __ovld __cnfn convert_short(short);
+short __ovld __cnfn convert_short_sat(short);
+short __ovld __cnfn convert_short_rte(ushort);
+short __ovld __cnfn convert_short_sat_rte(ushort);
+short __ovld __cnfn convert_short_rtz(ushort);
+short __ovld __cnfn convert_short_sat_rtz(ushort);
+short __ovld __cnfn convert_short_rtp(ushort);
+short __ovld __cnfn convert_short_sat_rtp(ushort);
+short __ovld __cnfn convert_short_rtn(ushort);
+short __ovld __cnfn convert_short_sat_rtn(ushort);
+short __ovld __cnfn convert_short(ushort);
+short __ovld __cnfn convert_short_sat(ushort);
+short __ovld __cnfn convert_short_rte(int);
+short __ovld __cnfn convert_short_sat_rte(int);
+short __ovld __cnfn convert_short_rtz(int);
+short __ovld __cnfn convert_short_sat_rtz(int);
+short __ovld __cnfn convert_short_rtp(int);
+short __ovld __cnfn convert_short_sat_rtp(int);
+short __ovld __cnfn convert_short_rtn(int);
+short __ovld __cnfn convert_short_sat_rtn(int);
+short __ovld __cnfn convert_short(int);
+short __ovld __cnfn convert_short_sat(int);
+short __ovld __cnfn convert_short_rte(uint);
+short __ovld __cnfn convert_short_sat_rte(uint);
+short __ovld __cnfn convert_short_rtz(uint);
+short __ovld __cnfn convert_short_sat_rtz(uint);
+short __ovld __cnfn convert_short_rtp(uint);
+short __ovld __cnfn convert_short_sat_rtp(uint);
+short __ovld __cnfn convert_short_rtn(uint);
+short __ovld __cnfn convert_short_sat_rtn(uint);
+short __ovld __cnfn convert_short(uint);
+short __ovld __cnfn convert_short_sat(uint);
+short __ovld __cnfn convert_short_rte(long);
+short __ovld __cnfn convert_short_sat_rte(long);
+short __ovld __cnfn convert_short_rtz(long);
+short __ovld __cnfn convert_short_sat_rtz(long);
+short __ovld __cnfn convert_short_rtp(long);
+short __ovld __cnfn convert_short_sat_rtp(long);
+short __ovld __cnfn convert_short_rtn(long);
+short __ovld __cnfn convert_short_sat_rtn(long);
+short __ovld __cnfn convert_short(long);
+short __ovld __cnfn convert_short_sat(long);
+short __ovld __cnfn convert_short_rte(ulong);
+short __ovld __cnfn convert_short_sat_rte(ulong);
+short __ovld __cnfn convert_short_rtz(ulong);
+short __ovld __cnfn convert_short_sat_rtz(ulong);
+short __ovld __cnfn convert_short_rtp(ulong);
+short __ovld __cnfn convert_short_sat_rtp(ulong);
+short __ovld __cnfn convert_short_rtn(ulong);
+short __ovld __cnfn convert_short_sat_rtn(ulong);
+short __ovld __cnfn convert_short(ulong);
+short __ovld __cnfn convert_short_sat(ulong);
+short __ovld __cnfn convert_short_rte(float);
+short __ovld __cnfn convert_short_sat_rte(float);
+short __ovld __cnfn convert_short_rtz(float);
+short __ovld __cnfn convert_short_sat_rtz(float);
+short __ovld __cnfn convert_short_rtp(float);
+short __ovld __cnfn convert_short_sat_rtp(float);
+short __ovld __cnfn convert_short_rtn(float);
+short __ovld __cnfn convert_short_sat_rtn(float);
+short __ovld __cnfn convert_short(float);
+short __ovld __cnfn convert_short_sat(float);
+ushort __ovld __cnfn convert_ushort_rte(char);
+ushort __ovld __cnfn convert_ushort_sat_rte(char);
+ushort __ovld __cnfn convert_ushort_rtz(char);
+ushort __ovld __cnfn convert_ushort_sat_rtz(char);
+ushort __ovld __cnfn convert_ushort_rtp(char);
+ushort __ovld __cnfn convert_ushort_sat_rtp(char);
+ushort __ovld __cnfn convert_ushort_rtn(char);
+ushort __ovld __cnfn convert_ushort_sat_rtn(char);
+ushort __ovld __cnfn convert_ushort(char);
+ushort __ovld __cnfn convert_ushort_sat(char);
+ushort __ovld __cnfn convert_ushort_rte(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rte(uchar);
+ushort __ovld __cnfn convert_ushort_rtz(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rtz(uchar);
+ushort __ovld __cnfn convert_ushort_rtp(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rtp(uchar);
+ushort __ovld __cnfn convert_ushort_rtn(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rtn(uchar);
+ushort __ovld __cnfn convert_ushort(uchar);
+ushort __ovld __cnfn convert_ushort_sat(uchar);
+ushort __ovld __cnfn convert_ushort_rte(short);
+ushort __ovld __cnfn convert_ushort_sat_rte(short);
+ushort __ovld __cnfn convert_ushort_rtz(short);
+ushort __ovld __cnfn convert_ushort_sat_rtz(short);
+ushort __ovld __cnfn convert_ushort_rtp(short);
+ushort __ovld __cnfn convert_ushort_sat_rtp(short);
+ushort __ovld __cnfn convert_ushort_rtn(short);
+ushort __ovld __cnfn convert_ushort_sat_rtn(short);
+ushort __ovld __cnfn convert_ushort(short);
+ushort __ovld __cnfn convert_ushort_sat(short);
+ushort __ovld __cnfn convert_ushort_rte(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rte(ushort);
+ushort __ovld __cnfn convert_ushort_rtz(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rtz(ushort);
+ushort __ovld __cnfn convert_ushort_rtp(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rtp(ushort);
+ushort __ovld __cnfn convert_ushort_rtn(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rtn(ushort);
+ushort __ovld __cnfn convert_ushort(ushort);
+ushort __ovld __cnfn convert_ushort_sat(ushort);
+ushort __ovld __cnfn convert_ushort_rte(int);
+ushort __ovld __cnfn convert_ushort_sat_rte(int);
+ushort __ovld __cnfn convert_ushort_rtz(int);
+ushort __ovld __cnfn convert_ushort_sat_rtz(int);
+ushort __ovld __cnfn convert_ushort_rtp(int);
+ushort __ovld __cnfn convert_ushort_sat_rtp(int);
+ushort __ovld __cnfn convert_ushort_rtn(int);
+ushort __ovld __cnfn convert_ushort_sat_rtn(int);
+ushort __ovld __cnfn convert_ushort(int);
+ushort __ovld __cnfn convert_ushort_sat(int);
+ushort __ovld __cnfn convert_ushort_rte(uint);
+ushort __ovld __cnfn convert_ushort_sat_rte(uint);
+ushort __ovld __cnfn convert_ushort_rtz(uint);
+ushort __ovld __cnfn convert_ushort_sat_rtz(uint);
+ushort __ovld __cnfn convert_ushort_rtp(uint);
+ushort __ovld __cnfn convert_ushort_sat_rtp(uint);
+ushort __ovld __cnfn convert_ushort_rtn(uint);
+ushort __ovld __cnfn convert_ushort_sat_rtn(uint);
+ushort __ovld __cnfn convert_ushort(uint);
+ushort __ovld __cnfn convert_ushort_sat(uint);
+ushort __ovld __cnfn convert_ushort_rte(long);
+ushort __ovld __cnfn convert_ushort_sat_rte(long);
+ushort __ovld __cnfn convert_ushort_rtz(long);
+ushort __ovld __cnfn convert_ushort_sat_rtz(long);
+ushort __ovld __cnfn convert_ushort_rtp(long);
+ushort __ovld __cnfn convert_ushort_sat_rtp(long);
+ushort __ovld __cnfn convert_ushort_rtn(long);
+ushort __ovld __cnfn convert_ushort_sat_rtn(long);
+ushort __ovld __cnfn convert_ushort(long);
+ushort __ovld __cnfn convert_ushort_sat(long);
+ushort __ovld __cnfn convert_ushort_rte(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rte(ulong);
+ushort __ovld __cnfn convert_ushort_rtz(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rtz(ulong);
+ushort __ovld __cnfn convert_ushort_rtp(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rtp(ulong);
+ushort __ovld __cnfn convert_ushort_rtn(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rtn(ulong);
+ushort __ovld __cnfn convert_ushort(ulong);
+ushort __ovld __cnfn convert_ushort_sat(ulong);
+ushort __ovld __cnfn convert_ushort_rte(float);
+ushort __ovld __cnfn convert_ushort_sat_rte(float);
+ushort __ovld __cnfn convert_ushort_rtz(float);
+ushort __ovld __cnfn convert_ushort_sat_rtz(float);
+ushort __ovld __cnfn convert_ushort_rtp(float);
+ushort __ovld __cnfn convert_ushort_sat_rtp(float);
+ushort __ovld __cnfn convert_ushort_rtn(float);
+ushort __ovld __cnfn convert_ushort_sat_rtn(float);
+ushort __ovld __cnfn convert_ushort(float);
+ushort __ovld __cnfn convert_ushort_sat(float);
+int __ovld __cnfn convert_int_rte(char);
+int __ovld __cnfn convert_int_sat_rte(char);
+int __ovld __cnfn convert_int_rtz(char);
+int __ovld __cnfn convert_int_sat_rtz(char);
+int __ovld __cnfn convert_int_rtp(char);
+int __ovld __cnfn convert_int_sat_rtp(char);
+int __ovld __cnfn convert_int_rtn(char);
+int __ovld __cnfn convert_int_sat_rtn(char);
+int __ovld __cnfn convert_int(char);
+int __ovld __cnfn convert_int_sat(char);
+int __ovld __cnfn convert_int_rte(uchar);
+int __ovld __cnfn convert_int_sat_rte(uchar);
+int __ovld __cnfn convert_int_rtz(uchar);
+int __ovld __cnfn convert_int_sat_rtz(uchar);
+int __ovld __cnfn convert_int_rtp(uchar);
+int __ovld __cnfn convert_int_sat_rtp(uchar);
+int __ovld __cnfn convert_int_rtn(uchar);
+int __ovld __cnfn convert_int_sat_rtn(uchar);
+int __ovld __cnfn convert_int(uchar);
+int __ovld __cnfn convert_int_sat(uchar);
+int __ovld __cnfn convert_int_rte(short);
+int __ovld __cnfn convert_int_sat_rte(short);
+int __ovld __cnfn convert_int_rtz(short);
+int __ovld __cnfn convert_int_sat_rtz(short);
+int __ovld __cnfn convert_int_rtp(short);
+int __ovld __cnfn convert_int_sat_rtp(short);
+int __ovld __cnfn convert_int_rtn(short);
+int __ovld __cnfn convert_int_sat_rtn(short);
+int __ovld __cnfn convert_int(short);
+int __ovld __cnfn convert_int_sat(short);
+int __ovld __cnfn convert_int_rte(ushort);
+int __ovld __cnfn convert_int_sat_rte(ushort);
+int __ovld __cnfn convert_int_rtz(ushort);
+int __ovld __cnfn convert_int_sat_rtz(ushort);
+int __ovld __cnfn convert_int_rtp(ushort);
+int __ovld __cnfn convert_int_sat_rtp(ushort);
+int __ovld __cnfn convert_int_rtn(ushort);
+int __ovld __cnfn convert_int_sat_rtn(ushort);
+int __ovld __cnfn convert_int(ushort);
+int __ovld __cnfn convert_int_sat(ushort);
+int __ovld __cnfn convert_int_rte(int);
+int __ovld __cnfn convert_int_sat_rte(int);
+int __ovld __cnfn convert_int_rtz(int);
+int __ovld __cnfn convert_int_sat_rtz(int);
+int __ovld __cnfn convert_int_rtp(int);
+int __ovld __cnfn convert_int_sat_rtp(int);
+int __ovld __cnfn convert_int_rtn(int);
+int __ovld __cnfn convert_int_sat_rtn(int);
+int __ovld __cnfn convert_int(int);
+int __ovld __cnfn convert_int_sat(int);
+int __ovld __cnfn convert_int_rte(uint);
+int __ovld __cnfn convert_int_sat_rte(uint);
+int __ovld __cnfn convert_int_rtz(uint);
+int __ovld __cnfn convert_int_sat_rtz(uint);
+int __ovld __cnfn convert_int_rtp(uint);
+int __ovld __cnfn convert_int_sat_rtp(uint);
+int __ovld __cnfn convert_int_rtn(uint);
+int __ovld __cnfn convert_int_sat_rtn(uint);
+int __ovld __cnfn convert_int(uint);
+int __ovld __cnfn convert_int_sat(uint);
+int __ovld __cnfn convert_int_rte(long);
+int __ovld __cnfn convert_int_sat_rte(long);
+int __ovld __cnfn convert_int_rtz(long);
+int __ovld __cnfn convert_int_sat_rtz(long);
+int __ovld __cnfn convert_int_rtp(long);
+int __ovld __cnfn convert_int_sat_rtp(long);
+int __ovld __cnfn convert_int_rtn(long);
+int __ovld __cnfn convert_int_sat_rtn(long);
+int __ovld __cnfn convert_int(long);
+int __ovld __cnfn convert_int_sat(long);
+int __ovld __cnfn convert_int_rte(ulong);
+int __ovld __cnfn convert_int_sat_rte(ulong);
+int __ovld __cnfn convert_int_rtz(ulong);
+int __ovld __cnfn convert_int_sat_rtz(ulong);
+int __ovld __cnfn convert_int_rtp(ulong);
+int __ovld __cnfn convert_int_sat_rtp(ulong);
+int __ovld __cnfn convert_int_rtn(ulong);
+int __ovld __cnfn convert_int_sat_rtn(ulong);
+int __ovld __cnfn convert_int(ulong);
+int __ovld __cnfn convert_int_sat(ulong);
+int __ovld __cnfn convert_int_rte(float);
+int __ovld __cnfn convert_int_sat_rte(float);
+int __ovld __cnfn convert_int_rtz(float);
+int __ovld __cnfn convert_int_sat_rtz(float);
+int __ovld __cnfn convert_int_rtp(float);
+int __ovld __cnfn convert_int_sat_rtp(float);
+int __ovld __cnfn convert_int_rtn(float);
+int __ovld __cnfn convert_int_sat_rtn(float);
+int __ovld __cnfn convert_int(float);
+int __ovld __cnfn convert_int_sat(float);
+uint __ovld __cnfn convert_uint_rte(char);
+uint __ovld __cnfn convert_uint_sat_rte(char);
+uint __ovld __cnfn convert_uint_rtz(char);
+uint __ovld __cnfn convert_uint_sat_rtz(char);
+uint __ovld __cnfn convert_uint_rtp(char);
+uint __ovld __cnfn convert_uint_sat_rtp(char);
+uint __ovld __cnfn convert_uint_rtn(char);
+uint __ovld __cnfn convert_uint_sat_rtn(char);
+uint __ovld __cnfn convert_uint(char);
+uint __ovld __cnfn convert_uint_sat(char);
+uint __ovld __cnfn convert_uint_rte(uchar);
+uint __ovld __cnfn convert_uint_sat_rte(uchar);
+uint __ovld __cnfn convert_uint_rtz(uchar);
+uint __ovld __cnfn convert_uint_sat_rtz(uchar);
+uint __ovld __cnfn convert_uint_rtp(uchar);
+uint __ovld __cnfn convert_uint_sat_rtp(uchar);
+uint __ovld __cnfn convert_uint_rtn(uchar);
+uint __ovld __cnfn convert_uint_sat_rtn(uchar);
+uint __ovld __cnfn convert_uint(uchar);
+uint __ovld __cnfn convert_uint_sat(uchar);
+uint __ovld __cnfn convert_uint_rte(short);
+uint __ovld __cnfn convert_uint_sat_rte(short);
+uint __ovld __cnfn convert_uint_rtz(short);
+uint __ovld __cnfn convert_uint_sat_rtz(short);
+uint __ovld __cnfn convert_uint_rtp(short);
+uint __ovld __cnfn convert_uint_sat_rtp(short);
+uint __ovld __cnfn convert_uint_rtn(short);
+uint __ovld __cnfn convert_uint_sat_rtn(short);
+uint __ovld __cnfn convert_uint(short);
+uint __ovld __cnfn convert_uint_sat(short);
+uint __ovld __cnfn convert_uint_rte(ushort);
+uint __ovld __cnfn convert_uint_sat_rte(ushort);
+uint __ovld __cnfn convert_uint_rtz(ushort);
+uint __ovld __cnfn convert_uint_sat_rtz(ushort);
+uint __ovld __cnfn convert_uint_rtp(ushort);
+uint __ovld __cnfn convert_uint_sat_rtp(ushort);
+uint __ovld __cnfn convert_uint_rtn(ushort);
+uint __ovld __cnfn convert_uint_sat_rtn(ushort);
+uint __ovld __cnfn convert_uint(ushort);
+uint __ovld __cnfn convert_uint_sat(ushort);
+uint __ovld __cnfn convert_uint_rte(int);
+uint __ovld __cnfn convert_uint_sat_rte(int);
+uint __ovld __cnfn convert_uint_rtz(int);
+uint __ovld __cnfn convert_uint_sat_rtz(int);
+uint __ovld __cnfn convert_uint_rtp(int);
+uint __ovld __cnfn convert_uint_sat_rtp(int);
+uint __ovld __cnfn convert_uint_rtn(int);
+uint __ovld __cnfn convert_uint_sat_rtn(int);
+uint __ovld __cnfn convert_uint(int);
+uint __ovld __cnfn convert_uint_sat(int);
+uint __ovld __cnfn convert_uint_rte(uint);
+uint __ovld __cnfn convert_uint_sat_rte(uint);
+uint __ovld __cnfn convert_uint_rtz(uint);
+uint __ovld __cnfn convert_uint_sat_rtz(uint);
+uint __ovld __cnfn convert_uint_rtp(uint);
+uint __ovld __cnfn convert_uint_sat_rtp(uint);
+uint __ovld __cnfn convert_uint_rtn(uint);
+uint __ovld __cnfn convert_uint_sat_rtn(uint);
+uint __ovld __cnfn convert_uint(uint);
+uint __ovld __cnfn convert_uint_sat(uint);
+uint __ovld __cnfn convert_uint_rte(long);
+uint __ovld __cnfn convert_uint_sat_rte(long);
+uint __ovld __cnfn convert_uint_rtz(long);
+uint __ovld __cnfn convert_uint_sat_rtz(long);
+uint __ovld __cnfn convert_uint_rtp(long);
+uint __ovld __cnfn convert_uint_sat_rtp(long);
+uint __ovld __cnfn convert_uint_rtn(long);
+uint __ovld __cnfn convert_uint_sat_rtn(long);
+uint __ovld __cnfn convert_uint(long);
+uint __ovld __cnfn convert_uint_sat(long);
+uint __ovld __cnfn convert_uint_rte(ulong);
+uint __ovld __cnfn convert_uint_sat_rte(ulong);
+uint __ovld __cnfn convert_uint_rtz(ulong);
+uint __ovld __cnfn convert_uint_sat_rtz(ulong);
+uint __ovld __cnfn convert_uint_rtp(ulong);
+uint __ovld __cnfn convert_uint_sat_rtp(ulong);
+uint __ovld __cnfn convert_uint_rtn(ulong);
+uint __ovld __cnfn convert_uint_sat_rtn(ulong);
+uint __ovld __cnfn convert_uint(ulong);
+uint __ovld __cnfn convert_uint_sat(ulong);
+uint __ovld __cnfn convert_uint_rte(float);
+uint __ovld __cnfn convert_uint_sat_rte(float);
+uint __ovld __cnfn convert_uint_rtz(float);
+uint __ovld __cnfn convert_uint_sat_rtz(float);
+uint __ovld __cnfn convert_uint_rtp(float);
+uint __ovld __cnfn convert_uint_sat_rtp(float);
+uint __ovld __cnfn convert_uint_rtn(float);
+uint __ovld __cnfn convert_uint_sat_rtn(float);
+uint __ovld __cnfn convert_uint(float);
+uint __ovld __cnfn convert_uint_sat(float);
+long __ovld __cnfn convert_long_rte(char);
+long __ovld __cnfn convert_long_sat_rte(char);
+long __ovld __cnfn convert_long_rtz(char);
+long __ovld __cnfn convert_long_sat_rtz(char);
+long __ovld __cnfn convert_long_rtp(char);
+long __ovld __cnfn convert_long_sat_rtp(char);
+long __ovld __cnfn convert_long_rtn(char);
+long __ovld __cnfn convert_long_sat_rtn(char);
+long __ovld __cnfn convert_long(char);
+long __ovld __cnfn convert_long_sat(char);
+long __ovld __cnfn convert_long_rte(uchar);
+long __ovld __cnfn convert_long_sat_rte(uchar);
+long __ovld __cnfn convert_long_rtz(uchar);
+long __ovld __cnfn convert_long_sat_rtz(uchar);
+long __ovld __cnfn convert_long_rtp(uchar);
+long __ovld __cnfn convert_long_sat_rtp(uchar);
+long __ovld __cnfn convert_long_rtn(uchar);
+long __ovld __cnfn convert_long_sat_rtn(uchar);
+long __ovld __cnfn convert_long(uchar);
+long __ovld __cnfn convert_long_sat(uchar);
+long __ovld __cnfn convert_long_rte(short);
+long __ovld __cnfn convert_long_sat_rte(short);
+long __ovld __cnfn convert_long_rtz(short);
+long __ovld __cnfn convert_long_sat_rtz(short);
+long __ovld __cnfn convert_long_rtp(short);
+long __ovld __cnfn convert_long_sat_rtp(short);
+long __ovld __cnfn convert_long_rtn(short);
+long __ovld __cnfn convert_long_sat_rtn(short);
+long __ovld __cnfn convert_long(short);
+long __ovld __cnfn convert_long_sat(short);
+long __ovld __cnfn convert_long_rte(ushort);
+long __ovld __cnfn convert_long_sat_rte(ushort);
+long __ovld __cnfn convert_long_rtz(ushort);
+long __ovld __cnfn convert_long_sat_rtz(ushort);
+long __ovld __cnfn convert_long_rtp(ushort);
+long __ovld __cnfn convert_long_sat_rtp(ushort);
+long __ovld __cnfn convert_long_rtn(ushort);
+long __ovld __cnfn convert_long_sat_rtn(ushort);
+long __ovld __cnfn convert_long(ushort);
+long __ovld __cnfn convert_long_sat(ushort);
+long __ovld __cnfn convert_long_rte(int);
+long __ovld __cnfn convert_long_sat_rte(int);
+long __ovld __cnfn convert_long_rtz(int);
+long __ovld __cnfn convert_long_sat_rtz(int);
+long __ovld __cnfn convert_long_rtp(int);
+long __ovld __cnfn convert_long_sat_rtp(int);
+long __ovld __cnfn convert_long_rtn(int);
+long __ovld __cnfn convert_long_sat_rtn(int);
+long __ovld __cnfn convert_long(int);
+long __ovld __cnfn convert_long_sat(int);
+long __ovld __cnfn convert_long_rte(uint);
+long __ovld __cnfn convert_long_sat_rte(uint);
+long __ovld __cnfn convert_long_rtz(uint);
+long __ovld __cnfn convert_long_sat_rtz(uint);
+long __ovld __cnfn convert_long_rtp(uint);
+long __ovld __cnfn convert_long_sat_rtp(uint);
+long __ovld __cnfn convert_long_rtn(uint);
+long __ovld __cnfn convert_long_sat_rtn(uint);
+long __ovld __cnfn convert_long(uint);
+long __ovld __cnfn convert_long_sat(uint);
+long __ovld __cnfn convert_long_rte(long);
+long __ovld __cnfn convert_long_sat_rte(long);
+long __ovld __cnfn convert_long_rtz(long);
+long __ovld __cnfn convert_long_sat_rtz(long);
+long __ovld __cnfn convert_long_rtp(long);
+long __ovld __cnfn convert_long_sat_rtp(long);
+long __ovld __cnfn convert_long_rtn(long);
+long __ovld __cnfn convert_long_sat_rtn(long);
+long __ovld __cnfn convert_long(long);
+long __ovld __cnfn convert_long_sat(long);
+long __ovld __cnfn convert_long_rte(ulong);
+long __ovld __cnfn convert_long_sat_rte(ulong);
+long __ovld __cnfn convert_long_rtz(ulong);
+long __ovld __cnfn convert_long_sat_rtz(ulong);
+long __ovld __cnfn convert_long_rtp(ulong);
+long __ovld __cnfn convert_long_sat_rtp(ulong);
+long __ovld __cnfn convert_long_rtn(ulong);
+long __ovld __cnfn convert_long_sat_rtn(ulong);
+long __ovld __cnfn convert_long(ulong);
+long __ovld __cnfn convert_long_sat(ulong);
+long __ovld __cnfn convert_long_rte(float);
+long __ovld __cnfn convert_long_sat_rte(float);
+long __ovld __cnfn convert_long_rtz(float);
+long __ovld __cnfn convert_long_sat_rtz(float);
+long __ovld __cnfn convert_long_rtp(float);
+long __ovld __cnfn convert_long_sat_rtp(float);
+long __ovld __cnfn convert_long_rtn(float);
+long __ovld __cnfn convert_long_sat_rtn(float);
+long __ovld __cnfn convert_long(float);
+long __ovld __cnfn convert_long_sat(float);
+ulong __ovld __cnfn convert_ulong_rte(char);
+ulong __ovld __cnfn convert_ulong_sat_rte(char);
+ulong __ovld __cnfn convert_ulong_rtz(char);
+ulong __ovld __cnfn convert_ulong_sat_rtz(char);
+ulong __ovld __cnfn convert_ulong_rtp(char);
+ulong __ovld __cnfn convert_ulong_sat_rtp(char);
+ulong __ovld __cnfn convert_ulong_rtn(char);
+ulong __ovld __cnfn convert_ulong_sat_rtn(char);
+ulong __ovld __cnfn convert_ulong(char);
+ulong __ovld __cnfn convert_ulong_sat(char);
+ulong __ovld __cnfn convert_ulong_rte(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rte(uchar);
+ulong __ovld __cnfn convert_ulong_rtz(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rtz(uchar);
+ulong __ovld __cnfn convert_ulong_rtp(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rtp(uchar);
+ulong __ovld __cnfn convert_ulong_rtn(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rtn(uchar);
+ulong __ovld __cnfn convert_ulong(uchar);
+ulong __ovld __cnfn convert_ulong_sat(uchar);
+ulong __ovld __cnfn convert_ulong_rte(short);
+ulong __ovld __cnfn convert_ulong_sat_rte(short);
+ulong __ovld __cnfn convert_ulong_rtz(short);
+ulong __ovld __cnfn convert_ulong_sat_rtz(short);
+ulong __ovld __cnfn convert_ulong_rtp(short);
+ulong __ovld __cnfn convert_ulong_sat_rtp(short);
+ulong __ovld __cnfn convert_ulong_rtn(short);
+ulong __ovld __cnfn convert_ulong_sat_rtn(short);
+ulong __ovld __cnfn convert_ulong(short);
+ulong __ovld __cnfn convert_ulong_sat(short);
+ulong __ovld __cnfn convert_ulong_rte(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rte(ushort);
+ulong __ovld __cnfn convert_ulong_rtz(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rtz(ushort);
+ulong __ovld __cnfn convert_ulong_rtp(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rtp(ushort);
+ulong __ovld __cnfn convert_ulong_rtn(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rtn(ushort);
+ulong __ovld __cnfn convert_ulong(ushort);
+ulong __ovld __cnfn convert_ulong_sat(ushort);
+ulong __ovld __cnfn convert_ulong_rte(int);
+ulong __ovld __cnfn convert_ulong_sat_rte(int);
+ulong __ovld __cnfn convert_ulong_rtz(int);
+ulong __ovld __cnfn convert_ulong_sat_rtz(int);
+ulong __ovld __cnfn convert_ulong_rtp(int);
+ulong __ovld __cnfn convert_ulong_sat_rtp(int);
+ulong __ovld __cnfn convert_ulong_rtn(int);
+ulong __ovld __cnfn convert_ulong_sat_rtn(int);
+ulong __ovld __cnfn convert_ulong(int);
+ulong __ovld __cnfn convert_ulong_sat(int);
+ulong __ovld __cnfn convert_ulong_rte(uint);
+ulong __ovld __cnfn convert_ulong_sat_rte(uint);
+ulong __ovld __cnfn convert_ulong_rtz(uint);
+ulong __ovld __cnfn convert_ulong_sat_rtz(uint);
+ulong __ovld __cnfn convert_ulong_rtp(uint);
+ulong __ovld __cnfn convert_ulong_sat_rtp(uint);
+ulong __ovld __cnfn convert_ulong_rtn(uint);
+ulong __ovld __cnfn convert_ulong_sat_rtn(uint);
+ulong __ovld __cnfn convert_ulong(uint);
+ulong __ovld __cnfn convert_ulong_sat(uint);
+ulong __ovld __cnfn convert_ulong_rte(long);
+ulong __ovld __cnfn convert_ulong_sat_rte(long);
+ulong __ovld __cnfn convert_ulong_rtz(long);
+ulong __ovld __cnfn convert_ulong_sat_rtz(long);
+ulong __ovld __cnfn convert_ulong_rtp(long);
+ulong __ovld __cnfn convert_ulong_sat_rtp(long);
+ulong __ovld __cnfn convert_ulong_rtn(long);
+ulong __ovld __cnfn convert_ulong_sat_rtn(long);
+ulong __ovld __cnfn convert_ulong(long);
+ulong __ovld __cnfn convert_ulong_sat(long);
+ulong __ovld __cnfn convert_ulong_rte(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rte(ulong);
+ulong __ovld __cnfn convert_ulong_rtz(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rtz(ulong);
+ulong __ovld __cnfn convert_ulong_rtp(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rtp(ulong);
+ulong __ovld __cnfn convert_ulong_rtn(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rtn(ulong);
+ulong __ovld __cnfn convert_ulong(ulong);
+ulong __ovld __cnfn convert_ulong_sat(ulong);
+ulong __ovld __cnfn convert_ulong_rte(float);
+ulong __ovld __cnfn convert_ulong_sat_rte(float);
+ulong __ovld __cnfn convert_ulong_rtz(float);
+ulong __ovld __cnfn convert_ulong_sat_rtz(float);
+ulong __ovld __cnfn convert_ulong_rtp(float);
+ulong __ovld __cnfn convert_ulong_sat_rtp(float);
+ulong __ovld __cnfn convert_ulong_rtn(float);
+ulong __ovld __cnfn convert_ulong_sat_rtn(float);
+ulong __ovld __cnfn convert_ulong(float);
+ulong __ovld __cnfn convert_ulong_sat(float);
+float __ovld __cnfn convert_float_rte(char);
+float __ovld __cnfn convert_float_rtz(char);
+float __ovld __cnfn convert_float_rtp(char);
+float __ovld __cnfn convert_float_rtn(char);
+float __ovld __cnfn convert_float(char);
+float __ovld __cnfn convert_float_rte(uchar);
+float __ovld __cnfn convert_float_rtz(uchar);
+float __ovld __cnfn convert_float_rtp(uchar);
+float __ovld __cnfn convert_float_rtn(uchar);
+float __ovld __cnfn convert_float(uchar);
+float __ovld __cnfn convert_float_rte(short);
+float __ovld __cnfn convert_float_rtz(short);
+float __ovld __cnfn convert_float_rtp(short);
+float __ovld __cnfn convert_float_rtn(short);
+float __ovld __cnfn convert_float(short);
+float __ovld __cnfn convert_float_rte(ushort);
+float __ovld __cnfn convert_float_rtz(ushort);
+float __ovld __cnfn convert_float_rtp(ushort);
+float __ovld __cnfn convert_float_rtn(ushort);
+float __ovld __cnfn convert_float(ushort);
+float __ovld __cnfn convert_float_rte(int);
+float __ovld __cnfn convert_float_rtz(int);
+float __ovld __cnfn convert_float_rtp(int);
+float __ovld __cnfn convert_float_rtn(int);
+float __ovld __cnfn convert_float(int);
+float __ovld __cnfn convert_float_rte(uint);
+float __ovld __cnfn convert_float_rtz(uint);
+float __ovld __cnfn convert_float_rtp(uint);
+float __ovld __cnfn convert_float_rtn(uint);
+float __ovld __cnfn convert_float(uint);
+float __ovld __cnfn convert_float_rte(long);
+float __ovld __cnfn convert_float_rtz(long);
+float __ovld __cnfn convert_float_rtp(long);
+float __ovld __cnfn convert_float_rtn(long);
+float __ovld __cnfn convert_float(long);
+float __ovld __cnfn convert_float_rte(ulong);
+float __ovld __cnfn convert_float_rtz(ulong);
+float __ovld __cnfn convert_float_rtp(ulong);
+float __ovld __cnfn convert_float_rtn(ulong);
+float __ovld __cnfn convert_float(ulong);
+float __ovld __cnfn convert_float_rte(float);
+float __ovld __cnfn convert_float_rtz(float);
+float __ovld __cnfn convert_float_rtp(float);
+float __ovld __cnfn convert_float_rtn(float);
+float __ovld __cnfn convert_float(float);
+char2 __ovld __cnfn convert_char2_rte(char2);
+char2 __ovld __cnfn convert_char2_sat_rte(char2);
+char2 __ovld __cnfn convert_char2_rtz(char2);
+char2 __ovld __cnfn convert_char2_sat_rtz(char2);
+char2 __ovld __cnfn convert_char2_rtp(char2);
+char2 __ovld __cnfn convert_char2_sat_rtp(char2);
+char2 __ovld __cnfn convert_char2_rtn(char2);
+char2 __ovld __cnfn convert_char2_sat_rtn(char2);
+char2 __ovld __cnfn convert_char2(char2);
+char2 __ovld __cnfn convert_char2_sat(char2);
+char2 __ovld __cnfn convert_char2_rte(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rte(uchar2);
+char2 __ovld __cnfn convert_char2_rtz(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rtz(uchar2);
+char2 __ovld __cnfn convert_char2_rtp(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rtp(uchar2);
+char2 __ovld __cnfn convert_char2_rtn(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rtn(uchar2);
+char2 __ovld __cnfn convert_char2(uchar2);
+char2 __ovld __cnfn convert_char2_sat(uchar2);
+char2 __ovld __cnfn convert_char2_rte(short2);
+char2 __ovld __cnfn convert_char2_sat_rte(short2);
+char2 __ovld __cnfn convert_char2_rtz(short2);
+char2 __ovld __cnfn convert_char2_sat_rtz(short2);
+char2 __ovld __cnfn convert_char2_rtp(short2);
+char2 __ovld __cnfn convert_char2_sat_rtp(short2);
+char2 __ovld __cnfn convert_char2_rtn(short2);
+char2 __ovld __cnfn convert_char2_sat_rtn(short2);
+char2 __ovld __cnfn convert_char2(short2);
+char2 __ovld __cnfn convert_char2_sat(short2);
+char2 __ovld __cnfn convert_char2_rte(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rte(ushort2);
+char2 __ovld __cnfn convert_char2_rtz(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rtz(ushort2);
+char2 __ovld __cnfn convert_char2_rtp(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rtp(ushort2);
+char2 __ovld __cnfn convert_char2_rtn(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rtn(ushort2);
+char2 __ovld __cnfn convert_char2(ushort2);
+char2 __ovld __cnfn convert_char2_sat(ushort2);
+char2 __ovld __cnfn convert_char2_rte(int2);
+char2 __ovld __cnfn convert_char2_sat_rte(int2);
+char2 __ovld __cnfn convert_char2_rtz(int2);
+char2 __ovld __cnfn convert_char2_sat_rtz(int2);
+char2 __ovld __cnfn convert_char2_rtp(int2);
+char2 __ovld __cnfn convert_char2_sat_rtp(int2);
+char2 __ovld __cnfn convert_char2_rtn(int2);
+char2 __ovld __cnfn convert_char2_sat_rtn(int2);
+char2 __ovld __cnfn convert_char2(int2);
+char2 __ovld __cnfn convert_char2_sat(int2);
+char2 __ovld __cnfn convert_char2_rte(uint2);
+char2 __ovld __cnfn convert_char2_sat_rte(uint2);
+char2 __ovld __cnfn convert_char2_rtz(uint2);
+char2 __ovld __cnfn convert_char2_sat_rtz(uint2);
+char2 __ovld __cnfn convert_char2_rtp(uint2);
+char2 __ovld __cnfn convert_char2_sat_rtp(uint2);
+char2 __ovld __cnfn convert_char2_rtn(uint2);
+char2 __ovld __cnfn convert_char2_sat_rtn(uint2);
+char2 __ovld __cnfn convert_char2(uint2);
+char2 __ovld __cnfn convert_char2_sat(uint2);
+char2 __ovld __cnfn convert_char2_rte(long2);
+char2 __ovld __cnfn convert_char2_sat_rte(long2);
+char2 __ovld __cnfn convert_char2_rtz(long2);
+char2 __ovld __cnfn convert_char2_sat_rtz(long2);
+char2 __ovld __cnfn convert_char2_rtp(long2);
+char2 __ovld __cnfn convert_char2_sat_rtp(long2);
+char2 __ovld __cnfn convert_char2_rtn(long2);
+char2 __ovld __cnfn convert_char2_sat_rtn(long2);
+char2 __ovld __cnfn convert_char2(long2);
+char2 __ovld __cnfn convert_char2_sat(long2);
+char2 __ovld __cnfn convert_char2_rte(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rte(ulong2);
+char2 __ovld __cnfn convert_char2_rtz(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rtz(ulong2);
+char2 __ovld __cnfn convert_char2_rtp(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rtp(ulong2);
+char2 __ovld __cnfn convert_char2_rtn(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rtn(ulong2);
+char2 __ovld __cnfn convert_char2(ulong2);
+char2 __ovld __cnfn convert_char2_sat(ulong2);
+char2 __ovld __cnfn convert_char2_rte(float2);
+char2 __ovld __cnfn convert_char2_sat_rte(float2);
+char2 __ovld __cnfn convert_char2_rtz(float2);
+char2 __ovld __cnfn convert_char2_sat_rtz(float2);
+char2 __ovld __cnfn convert_char2_rtp(float2);
+char2 __ovld __cnfn convert_char2_sat_rtp(float2);
+char2 __ovld __cnfn convert_char2_rtn(float2);
+char2 __ovld __cnfn convert_char2_sat_rtn(float2);
+char2 __ovld __cnfn convert_char2(float2);
+char2 __ovld __cnfn convert_char2_sat(float2);
+uchar2 __ovld __cnfn convert_uchar2_rte(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(char2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(char2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(char2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(char2);
+uchar2 __ovld __cnfn convert_uchar2(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat(char2);
+uchar2 __ovld __cnfn convert_uchar2_rte(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(uchar2);
+uchar2 __ovld __cnfn convert_uchar2(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rte(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(short2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(short2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(short2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(short2);
+uchar2 __ovld __cnfn convert_uchar2(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat(short2);
+uchar2 __ovld __cnfn convert_uchar2_rte(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(ushort2);
+uchar2 __ovld __cnfn convert_uchar2(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rte(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(int2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(int2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(int2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(int2);
+uchar2 __ovld __cnfn convert_uchar2(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat(int2);
+uchar2 __ovld __cnfn convert_uchar2_rte(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(uint2);
+uchar2 __ovld __cnfn convert_uchar2(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rte(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(long2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(long2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(long2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(long2);
+uchar2 __ovld __cnfn convert_uchar2(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat(long2);
+uchar2 __ovld __cnfn convert_uchar2_rte(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(ulong2);
+uchar2 __ovld __cnfn convert_uchar2(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rte(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(float2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(float2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(float2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(float2);
+uchar2 __ovld __cnfn convert_uchar2(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat(float2);
+short2 __ovld __cnfn convert_short2_rte(char2);
+short2 __ovld __cnfn convert_short2_sat_rte(char2);
+short2 __ovld __cnfn convert_short2_rtz(char2);
+short2 __ovld __cnfn convert_short2_sat_rtz(char2);
+short2 __ovld __cnfn convert_short2_rtp(char2);
+short2 __ovld __cnfn convert_short2_sat_rtp(char2);
+short2 __ovld __cnfn convert_short2_rtn(char2);
+short2 __ovld __cnfn convert_short2_sat_rtn(char2);
+short2 __ovld __cnfn convert_short2(char2);
+short2 __ovld __cnfn convert_short2_sat(char2);
+short2 __ovld __cnfn convert_short2_rte(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rte(uchar2);
+short2 __ovld __cnfn convert_short2_rtz(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rtz(uchar2);
+short2 __ovld __cnfn convert_short2_rtp(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rtp(uchar2);
+short2 __ovld __cnfn convert_short2_rtn(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rtn(uchar2);
+short2 __ovld __cnfn convert_short2(uchar2);
+short2 __ovld __cnfn convert_short2_sat(uchar2);
+short2 __ovld __cnfn convert_short2_rte(short2);
+short2 __ovld __cnfn convert_short2_sat_rte(short2);
+short2 __ovld __cnfn convert_short2_rtz(short2);
+short2 __ovld __cnfn convert_short2_sat_rtz(short2);
+short2 __ovld __cnfn convert_short2_rtp(short2);
+short2 __ovld __cnfn convert_short2_sat_rtp(short2);
+short2 __ovld __cnfn convert_short2_rtn(short2);
+short2 __ovld __cnfn convert_short2_sat_rtn(short2);
+short2 __ovld __cnfn convert_short2(short2);
+short2 __ovld __cnfn convert_short2_sat(short2);
+short2 __ovld __cnfn convert_short2_rte(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rte(ushort2);
+short2 __ovld __cnfn convert_short2_rtz(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rtz(ushort2);
+short2 __ovld __cnfn convert_short2_rtp(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rtp(ushort2);
+short2 __ovld __cnfn convert_short2_rtn(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rtn(ushort2);
+short2 __ovld __cnfn convert_short2(ushort2);
+short2 __ovld __cnfn convert_short2_sat(ushort2);
+short2 __ovld __cnfn convert_short2_rte(int2);
+short2 __ovld __cnfn convert_short2_sat_rte(int2);
+short2 __ovld __cnfn convert_short2_rtz(int2);
+short2 __ovld __cnfn convert_short2_sat_rtz(int2);
+short2 __ovld __cnfn convert_short2_rtp(int2);
+short2 __ovld __cnfn convert_short2_sat_rtp(int2);
+short2 __ovld __cnfn convert_short2_rtn(int2);
+short2 __ovld __cnfn convert_short2_sat_rtn(int2);
+short2 __ovld __cnfn convert_short2(int2);
+short2 __ovld __cnfn convert_short2_sat(int2);
+short2 __ovld __cnfn convert_short2_rte(uint2);
+short2 __ovld __cnfn convert_short2_sat_rte(uint2);
+short2 __ovld __cnfn convert_short2_rtz(uint2);
+short2 __ovld __cnfn convert_short2_sat_rtz(uint2);
+short2 __ovld __cnfn convert_short2_rtp(uint2);
+short2 __ovld __cnfn convert_short2_sat_rtp(uint2);
+short2 __ovld __cnfn convert_short2_rtn(uint2);
+short2 __ovld __cnfn convert_short2_sat_rtn(uint2);
+short2 __ovld __cnfn convert_short2(uint2);
+short2 __ovld __cnfn convert_short2_sat(uint2);
+short2 __ovld __cnfn convert_short2_rte(long2);
+short2 __ovld __cnfn convert_short2_sat_rte(long2);
+short2 __ovld __cnfn convert_short2_rtz(long2);
+short2 __ovld __cnfn convert_short2_sat_rtz(long2);
+short2 __ovld __cnfn convert_short2_rtp(long2);
+short2 __ovld __cnfn convert_short2_sat_rtp(long2);
+short2 __ovld __cnfn convert_short2_rtn(long2);
+short2 __ovld __cnfn convert_short2_sat_rtn(long2);
+short2 __ovld __cnfn convert_short2(long2);
+short2 __ovld __cnfn convert_short2_sat(long2);
+short2 __ovld __cnfn convert_short2_rte(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rte(ulong2);
+short2 __ovld __cnfn convert_short2_rtz(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rtz(ulong2);
+short2 __ovld __cnfn convert_short2_rtp(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rtp(ulong2);
+short2 __ovld __cnfn convert_short2_rtn(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rtn(ulong2);
+short2 __ovld __cnfn convert_short2(ulong2);
+short2 __ovld __cnfn convert_short2_sat(ulong2);
+short2 __ovld __cnfn convert_short2_rte(float2);
+short2 __ovld __cnfn convert_short2_sat_rte(float2);
+short2 __ovld __cnfn convert_short2_rtz(float2);
+short2 __ovld __cnfn convert_short2_sat_rtz(float2);
+short2 __ovld __cnfn convert_short2_rtp(float2);
+short2 __ovld __cnfn convert_short2_sat_rtp(float2);
+short2 __ovld __cnfn convert_short2_rtn(float2);
+short2 __ovld __cnfn convert_short2_sat_rtn(float2);
+short2 __ovld __cnfn convert_short2(float2);
+short2 __ovld __cnfn convert_short2_sat(float2);
+ushort2 __ovld __cnfn convert_ushort2_rte(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(char2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(char2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(char2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(char2);
+ushort2 __ovld __cnfn convert_ushort2(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat(char2);
+ushort2 __ovld __cnfn convert_ushort2_rte(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(uchar2);
+ushort2 __ovld __cnfn convert_ushort2(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rte(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(short2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(short2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(short2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(short2);
+ushort2 __ovld __cnfn convert_ushort2(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat(short2);
+ushort2 __ovld __cnfn convert_ushort2_rte(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(ushort2);
+ushort2 __ovld __cnfn convert_ushort2(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rte(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(int2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(int2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(int2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(int2);
+ushort2 __ovld __cnfn convert_ushort2(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat(int2);
+ushort2 __ovld __cnfn convert_ushort2_rte(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(uint2);
+ushort2 __ovld __cnfn convert_ushort2(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rte(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(long2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(long2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(long2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(long2);
+ushort2 __ovld __cnfn convert_ushort2(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat(long2);
+ushort2 __ovld __cnfn convert_ushort2_rte(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(ulong2);
+ushort2 __ovld __cnfn convert_ushort2(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rte(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(float2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(float2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(float2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(float2);
+ushort2 __ovld __cnfn convert_ushort2(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat(float2);
+int2 __ovld __cnfn convert_int2_rte(char2);
+int2 __ovld __cnfn convert_int2_sat_rte(char2);
+int2 __ovld __cnfn convert_int2_rtz(char2);
+int2 __ovld __cnfn convert_int2_sat_rtz(char2);
+int2 __ovld __cnfn convert_int2_rtp(char2);
+int2 __ovld __cnfn convert_int2_sat_rtp(char2);
+int2 __ovld __cnfn convert_int2_rtn(char2);
+int2 __ovld __cnfn convert_int2_sat_rtn(char2);
+int2 __ovld __cnfn convert_int2(char2);
+int2 __ovld __cnfn convert_int2_sat(char2);
+int2 __ovld __cnfn convert_int2_rte(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rte(uchar2);
+int2 __ovld __cnfn convert_int2_rtz(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rtz(uchar2);
+int2 __ovld __cnfn convert_int2_rtp(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rtp(uchar2);
+int2 __ovld __cnfn convert_int2_rtn(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rtn(uchar2);
+int2 __ovld __cnfn convert_int2(uchar2);
+int2 __ovld __cnfn convert_int2_sat(uchar2);
+int2 __ovld __cnfn convert_int2_rte(short2);
+int2 __ovld __cnfn convert_int2_sat_rte(short2);
+int2 __ovld __cnfn convert_int2_rtz(short2);
+int2 __ovld __cnfn convert_int2_sat_rtz(short2);
+int2 __ovld __cnfn convert_int2_rtp(short2);
+int2 __ovld __cnfn convert_int2_sat_rtp(short2);
+int2 __ovld __cnfn convert_int2_rtn(short2);
+int2 __ovld __cnfn convert_int2_sat_rtn(short2);
+int2 __ovld __cnfn convert_int2(short2);
+int2 __ovld __cnfn convert_int2_sat(short2);
+int2 __ovld __cnfn convert_int2_rte(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rte(ushort2);
+int2 __ovld __cnfn convert_int2_rtz(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rtz(ushort2);
+int2 __ovld __cnfn convert_int2_rtp(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rtp(ushort2);
+int2 __ovld __cnfn convert_int2_rtn(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rtn(ushort2);
+int2 __ovld __cnfn convert_int2(ushort2);
+int2 __ovld __cnfn convert_int2_sat(ushort2);
+int2 __ovld __cnfn convert_int2_rte(int2);
+int2 __ovld __cnfn convert_int2_sat_rte(int2);
+int2 __ovld __cnfn convert_int2_rtz(int2);
+int2 __ovld __cnfn convert_int2_sat_rtz(int2);
+int2 __ovld __cnfn convert_int2_rtp(int2);
+int2 __ovld __cnfn convert_int2_sat_rtp(int2);
+int2 __ovld __cnfn convert_int2_rtn(int2);
+int2 __ovld __cnfn convert_int2_sat_rtn(int2);
+int2 __ovld __cnfn convert_int2(int2);
+int2 __ovld __cnfn convert_int2_sat(int2);
+int2 __ovld __cnfn convert_int2_rte(uint2);
+int2 __ovld __cnfn convert_int2_sat_rte(uint2);
+int2 __ovld __cnfn convert_int2_rtz(uint2);
+int2 __ovld __cnfn convert_int2_sat_rtz(uint2);
+int2 __ovld __cnfn convert_int2_rtp(uint2);
+int2 __ovld __cnfn convert_int2_sat_rtp(uint2);
+int2 __ovld __cnfn convert_int2_rtn(uint2);
+int2 __ovld __cnfn convert_int2_sat_rtn(uint2);
+int2 __ovld __cnfn convert_int2(uint2);
+int2 __ovld __cnfn convert_int2_sat(uint2);
+int2 __ovld __cnfn convert_int2_rte(long2);
+int2 __ovld __cnfn convert_int2_sat_rte(long2);
+int2 __ovld __cnfn convert_int2_rtz(long2);
+int2 __ovld __cnfn convert_int2_sat_rtz(long2);
+int2 __ovld __cnfn convert_int2_rtp(long2);
+int2 __ovld __cnfn convert_int2_sat_rtp(long2);
+int2 __ovld __cnfn convert_int2_rtn(long2);
+int2 __ovld __cnfn convert_int2_sat_rtn(long2);
+int2 __ovld __cnfn convert_int2(long2);
+int2 __ovld __cnfn convert_int2_sat(long2);
+int2 __ovld __cnfn convert_int2_rte(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rte(ulong2);
+int2 __ovld __cnfn convert_int2_rtz(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rtz(ulong2);
+int2 __ovld __cnfn convert_int2_rtp(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rtp(ulong2);
+int2 __ovld __cnfn convert_int2_rtn(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rtn(ulong2);
+int2 __ovld __cnfn convert_int2(ulong2);
+int2 __ovld __cnfn convert_int2_sat(ulong2);
+int2 __ovld __cnfn convert_int2_rte(float2);
+int2 __ovld __cnfn convert_int2_sat_rte(float2);
+int2 __ovld __cnfn convert_int2_rtz(float2);
+int2 __ovld __cnfn convert_int2_sat_rtz(float2);
+int2 __ovld __cnfn convert_int2_rtp(float2);
+int2 __ovld __cnfn convert_int2_sat_rtp(float2);
+int2 __ovld __cnfn convert_int2_rtn(float2);
+int2 __ovld __cnfn convert_int2_sat_rtn(float2);
+int2 __ovld __cnfn convert_int2(float2);
+int2 __ovld __cnfn convert_int2_sat(float2);
+uint2 __ovld __cnfn convert_uint2_rte(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(char2);
+uint2 __ovld __cnfn convert_uint2_rtz(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(char2);
+uint2 __ovld __cnfn convert_uint2_rtp(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(char2);
+uint2 __ovld __cnfn convert_uint2_rtn(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(char2);
+uint2 __ovld __cnfn convert_uint2(char2);
+uint2 __ovld __cnfn convert_uint2_sat(char2);
+uint2 __ovld __cnfn convert_uint2_rte(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(uchar2);
+uint2 __ovld __cnfn convert_uint2_rtz(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(uchar2);
+uint2 __ovld __cnfn convert_uint2_rtp(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(uchar2);
+uint2 __ovld __cnfn convert_uint2_rtn(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(uchar2);
+uint2 __ovld __cnfn convert_uint2(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat(uchar2);
+uint2 __ovld __cnfn convert_uint2_rte(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(short2);
+uint2 __ovld __cnfn convert_uint2_rtz(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(short2);
+uint2 __ovld __cnfn convert_uint2_rtp(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(short2);
+uint2 __ovld __cnfn convert_uint2_rtn(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(short2);
+uint2 __ovld __cnfn convert_uint2(short2);
+uint2 __ovld __cnfn convert_uint2_sat(short2);
+uint2 __ovld __cnfn convert_uint2_rte(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(ushort2);
+uint2 __ovld __cnfn convert_uint2_rtz(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(ushort2);
+uint2 __ovld __cnfn convert_uint2_rtp(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(ushort2);
+uint2 __ovld __cnfn convert_uint2_rtn(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(ushort2);
+uint2 __ovld __cnfn convert_uint2(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat(ushort2);
+uint2 __ovld __cnfn convert_uint2_rte(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(int2);
+uint2 __ovld __cnfn convert_uint2_rtz(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(int2);
+uint2 __ovld __cnfn convert_uint2_rtp(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(int2);
+uint2 __ovld __cnfn convert_uint2_rtn(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(int2);
+uint2 __ovld __cnfn convert_uint2(int2);
+uint2 __ovld __cnfn convert_uint2_sat(int2);
+uint2 __ovld __cnfn convert_uint2_rte(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(uint2);
+uint2 __ovld __cnfn convert_uint2_rtz(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(uint2);
+uint2 __ovld __cnfn convert_uint2_rtp(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(uint2);
+uint2 __ovld __cnfn convert_uint2_rtn(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(uint2);
+uint2 __ovld __cnfn convert_uint2(uint2);
+uint2 __ovld __cnfn convert_uint2_sat(uint2);
+uint2 __ovld __cnfn convert_uint2_rte(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(long2);
+uint2 __ovld __cnfn convert_uint2_rtz(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(long2);
+uint2 __ovld __cnfn convert_uint2_rtp(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(long2);
+uint2 __ovld __cnfn convert_uint2_rtn(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(long2);
+uint2 __ovld __cnfn convert_uint2(long2);
+uint2 __ovld __cnfn convert_uint2_sat(long2);
+uint2 __ovld __cnfn convert_uint2_rte(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(ulong2);
+uint2 __ovld __cnfn convert_uint2_rtz(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(ulong2);
+uint2 __ovld __cnfn convert_uint2_rtp(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(ulong2);
+uint2 __ovld __cnfn convert_uint2_rtn(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(ulong2);
+uint2 __ovld __cnfn convert_uint2(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat(ulong2);
+uint2 __ovld __cnfn convert_uint2_rte(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(float2);
+uint2 __ovld __cnfn convert_uint2_rtz(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(float2);
+uint2 __ovld __cnfn convert_uint2_rtp(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(float2);
+uint2 __ovld __cnfn convert_uint2_rtn(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(float2);
+uint2 __ovld __cnfn convert_uint2(float2);
+uint2 __ovld __cnfn convert_uint2_sat(float2);
+long2 __ovld __cnfn convert_long2_rte(char2);
+long2 __ovld __cnfn convert_long2_sat_rte(char2);
+long2 __ovld __cnfn convert_long2_rtz(char2);
+long2 __ovld __cnfn convert_long2_sat_rtz(char2);
+long2 __ovld __cnfn convert_long2_rtp(char2);
+long2 __ovld __cnfn convert_long2_sat_rtp(char2);
+long2 __ovld __cnfn convert_long2_rtn(char2);
+long2 __ovld __cnfn convert_long2_sat_rtn(char2);
+long2 __ovld __cnfn convert_long2(char2);
+long2 __ovld __cnfn convert_long2_sat(char2);
+long2 __ovld __cnfn convert_long2_rte(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rte(uchar2);
+long2 __ovld __cnfn convert_long2_rtz(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rtz(uchar2);
+long2 __ovld __cnfn convert_long2_rtp(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rtp(uchar2);
+long2 __ovld __cnfn convert_long2_rtn(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rtn(uchar2);
+long2 __ovld __cnfn convert_long2(uchar2);
+long2 __ovld __cnfn convert_long2_sat(uchar2);
+long2 __ovld __cnfn convert_long2_rte(short2);
+long2 __ovld __cnfn convert_long2_sat_rte(short2);
+long2 __ovld __cnfn convert_long2_rtz(short2);
+long2 __ovld __cnfn convert_long2_sat_rtz(short2);
+long2 __ovld __cnfn convert_long2_rtp(short2);
+long2 __ovld __cnfn convert_long2_sat_rtp(short2);
+long2 __ovld __cnfn convert_long2_rtn(short2);
+long2 __ovld __cnfn convert_long2_sat_rtn(short2);
+long2 __ovld __cnfn convert_long2(short2);
+long2 __ovld __cnfn convert_long2_sat(short2);
+long2 __ovld __cnfn convert_long2_rte(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rte(ushort2);
+long2 __ovld __cnfn convert_long2_rtz(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rtz(ushort2);
+long2 __ovld __cnfn convert_long2_rtp(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rtp(ushort2);
+long2 __ovld __cnfn convert_long2_rtn(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rtn(ushort2);
+long2 __ovld __cnfn convert_long2(ushort2);
+long2 __ovld __cnfn convert_long2_sat(ushort2);
+long2 __ovld __cnfn convert_long2_rte(int2);
+long2 __ovld __cnfn convert_long2_sat_rte(int2);
+long2 __ovld __cnfn convert_long2_rtz(int2);
+long2 __ovld __cnfn convert_long2_sat_rtz(int2);
+long2 __ovld __cnfn convert_long2_rtp(int2);
+long2 __ovld __cnfn convert_long2_sat_rtp(int2);
+long2 __ovld __cnfn convert_long2_rtn(int2);
+long2 __ovld __cnfn convert_long2_sat_rtn(int2);
+long2 __ovld __cnfn convert_long2(int2);
+long2 __ovld __cnfn convert_long2_sat(int2);
+long2 __ovld __cnfn convert_long2_rte(uint2);
+long2 __ovld __cnfn convert_long2_sat_rte(uint2);
+long2 __ovld __cnfn convert_long2_rtz(uint2);
+long2 __ovld __cnfn convert_long2_sat_rtz(uint2);
+long2 __ovld __cnfn convert_long2_rtp(uint2);
+long2 __ovld __cnfn convert_long2_sat_rtp(uint2);
+long2 __ovld __cnfn convert_long2_rtn(uint2);
+long2 __ovld __cnfn convert_long2_sat_rtn(uint2);
+long2 __ovld __cnfn convert_long2(uint2);
+long2 __ovld __cnfn convert_long2_sat(uint2);
+long2 __ovld __cnfn convert_long2_rte(long2);
+long2 __ovld __cnfn convert_long2_sat_rte(long2);
+long2 __ovld __cnfn convert_long2_rtz(long2);
+long2 __ovld __cnfn convert_long2_sat_rtz(long2);
+long2 __ovld __cnfn convert_long2_rtp(long2);
+long2 __ovld __cnfn convert_long2_sat_rtp(long2);
+long2 __ovld __cnfn convert_long2_rtn(long2);
+long2 __ovld __cnfn convert_long2_sat_rtn(long2);
+long2 __ovld __cnfn convert_long2(long2);
+long2 __ovld __cnfn convert_long2_sat(long2);
+long2 __ovld __cnfn convert_long2_rte(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rte(ulong2);
+long2 __ovld __cnfn convert_long2_rtz(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rtz(ulong2);
+long2 __ovld __cnfn convert_long2_rtp(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rtp(ulong2);
+long2 __ovld __cnfn convert_long2_rtn(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rtn(ulong2);
+long2 __ovld __cnfn convert_long2(ulong2);
+long2 __ovld __cnfn convert_long2_sat(ulong2);
+long2 __ovld __cnfn convert_long2_rte(float2);
+long2 __ovld __cnfn convert_long2_sat_rte(float2);
+long2 __ovld __cnfn convert_long2_rtz(float2);
+long2 __ovld __cnfn convert_long2_sat_rtz(float2);
+long2 __ovld __cnfn convert_long2_rtp(float2);
+long2 __ovld __cnfn convert_long2_sat_rtp(float2);
+long2 __ovld __cnfn convert_long2_rtn(float2);
+long2 __ovld __cnfn convert_long2_sat_rtn(float2);
+long2 __ovld __cnfn convert_long2(float2);
+long2 __ovld __cnfn convert_long2_sat(float2);
+ulong2 __ovld __cnfn convert_ulong2_rte(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(char2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(char2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(char2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(char2);
+ulong2 __ovld __cnfn convert_ulong2(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat(char2);
+ulong2 __ovld __cnfn convert_ulong2_rte(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(uchar2);
+ulong2 __ovld __cnfn convert_ulong2(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rte(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(short2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(short2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(short2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(short2);
+ulong2 __ovld __cnfn convert_ulong2(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat(short2);
+ulong2 __ovld __cnfn convert_ulong2_rte(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(ushort2);
+ulong2 __ovld __cnfn convert_ulong2(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rte(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(int2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(int2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(int2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(int2);
+ulong2 __ovld __cnfn convert_ulong2(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat(int2);
+ulong2 __ovld __cnfn convert_ulong2_rte(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(uint2);
+ulong2 __ovld __cnfn convert_ulong2(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rte(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(long2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(long2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(long2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(long2);
+ulong2 __ovld __cnfn convert_ulong2(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat(long2);
+ulong2 __ovld __cnfn convert_ulong2_rte(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(ulong2);
+ulong2 __ovld __cnfn convert_ulong2(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rte(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(float2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(float2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(float2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(float2);
+ulong2 __ovld __cnfn convert_ulong2(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat(float2);
+float2 __ovld __cnfn convert_float2_rte(char2);
+float2 __ovld __cnfn convert_float2_rtz(char2);
+float2 __ovld __cnfn convert_float2_rtp(char2);
+float2 __ovld __cnfn convert_float2_rtn(char2);
+float2 __ovld __cnfn convert_float2(char2);
+float2 __ovld __cnfn convert_float2_rte(uchar2);
+float2 __ovld __cnfn convert_float2_rtz(uchar2);
+float2 __ovld __cnfn convert_float2_rtp(uchar2);
+float2 __ovld __cnfn convert_float2_rtn(uchar2);
+float2 __ovld __cnfn convert_float2(uchar2);
+float2 __ovld __cnfn convert_float2_rte(short2);
+float2 __ovld __cnfn convert_float2_rtz(short2);
+float2 __ovld __cnfn convert_float2_rtp(short2);
+float2 __ovld __cnfn convert_float2_rtn(short2);
+float2 __ovld __cnfn convert_float2(short2);
+float2 __ovld __cnfn convert_float2_rte(ushort2);
+float2 __ovld __cnfn convert_float2_rtz(ushort2);
+float2 __ovld __cnfn convert_float2_rtp(ushort2);
+float2 __ovld __cnfn convert_float2_rtn(ushort2);
+float2 __ovld __cnfn convert_float2(ushort2);
+float2 __ovld __cnfn convert_float2_rte(int2);
+float2 __ovld __cnfn convert_float2_rtz(int2);
+float2 __ovld __cnfn convert_float2_rtp(int2);
+float2 __ovld __cnfn convert_float2_rtn(int2);
+float2 __ovld __cnfn convert_float2(int2);
+float2 __ovld __cnfn convert_float2_rte(uint2);
+float2 __ovld __cnfn convert_float2_rtz(uint2);
+float2 __ovld __cnfn convert_float2_rtp(uint2);
+float2 __ovld __cnfn convert_float2_rtn(uint2);
+float2 __ovld __cnfn convert_float2(uint2);
+float2 __ovld __cnfn convert_float2_rte(long2);
+float2 __ovld __cnfn convert_float2_rtz(long2);
+float2 __ovld __cnfn convert_float2_rtp(long2);
+float2 __ovld __cnfn convert_float2_rtn(long2);
+float2 __ovld __cnfn convert_float2(long2);
+float2 __ovld __cnfn convert_float2_rte(ulong2);
+float2 __ovld __cnfn convert_float2_rtz(ulong2);
+float2 __ovld __cnfn convert_float2_rtp(ulong2);
+float2 __ovld __cnfn convert_float2_rtn(ulong2);
+float2 __ovld __cnfn convert_float2(ulong2);
+float2 __ovld __cnfn convert_float2_rte(float2);
+float2 __ovld __cnfn convert_float2_rtz(float2);
+float2 __ovld __cnfn convert_float2_rtp(float2);
+float2 __ovld __cnfn convert_float2_rtn(float2);
+float2 __ovld __cnfn convert_float2(float2);
+char3 __ovld __cnfn convert_char3_rte(char3);
+char3 __ovld __cnfn convert_char3_sat_rte(char3);
+char3 __ovld __cnfn convert_char3_rtz(char3);
+char3 __ovld __cnfn convert_char3_sat_rtz(char3);
+char3 __ovld __cnfn convert_char3_rtp(char3);
+char3 __ovld __cnfn convert_char3_sat_rtp(char3);
+char3 __ovld __cnfn convert_char3_rtn(char3);
+char3 __ovld __cnfn convert_char3_sat_rtn(char3);
+char3 __ovld __cnfn convert_char3(char3);
+char3 __ovld __cnfn convert_char3_sat(char3);
+char3 __ovld __cnfn convert_char3_rte(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rte(uchar3);
+char3 __ovld __cnfn convert_char3_rtz(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rtz(uchar3);
+char3 __ovld __cnfn convert_char3_rtp(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rtp(uchar3);
+char3 __ovld __cnfn convert_char3_rtn(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rtn(uchar3);
+char3 __ovld __cnfn convert_char3(uchar3);
+char3 __ovld __cnfn convert_char3_sat(uchar3);
+char3 __ovld __cnfn convert_char3_rte(short3);
+char3 __ovld __cnfn convert_char3_sat_rte(short3);
+char3 __ovld __cnfn convert_char3_rtz(short3);
+char3 __ovld __cnfn convert_char3_sat_rtz(short3);
+char3 __ovld __cnfn convert_char3_rtp(short3);
+char3 __ovld __cnfn convert_char3_sat_rtp(short3);
+char3 __ovld __cnfn convert_char3_rtn(short3);
+char3 __ovld __cnfn convert_char3_sat_rtn(short3);
+char3 __ovld __cnfn convert_char3(short3);
+char3 __ovld __cnfn convert_char3_sat(short3);
+char3 __ovld __cnfn convert_char3_rte(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rte(ushort3);
+char3 __ovld __cnfn convert_char3_rtz(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rtz(ushort3);
+char3 __ovld __cnfn convert_char3_rtp(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rtp(ushort3);
+char3 __ovld __cnfn convert_char3_rtn(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rtn(ushort3);
+char3 __ovld __cnfn convert_char3(ushort3);
+char3 __ovld __cnfn convert_char3_sat(ushort3);
+char3 __ovld __cnfn convert_char3_rte(int3);
+char3 __ovld __cnfn convert_char3_sat_rte(int3);
+char3 __ovld __cnfn convert_char3_rtz(int3);
+char3 __ovld __cnfn convert_char3_sat_rtz(int3);
+char3 __ovld __cnfn convert_char3_rtp(int3);
+char3 __ovld __cnfn convert_char3_sat_rtp(int3);
+char3 __ovld __cnfn convert_char3_rtn(int3);
+char3 __ovld __cnfn convert_char3_sat_rtn(int3);
+char3 __ovld __cnfn convert_char3(int3);
+char3 __ovld __cnfn convert_char3_sat(int3);
+char3 __ovld __cnfn convert_char3_rte(uint3);
+char3 __ovld __cnfn convert_char3_sat_rte(uint3);
+char3 __ovld __cnfn convert_char3_rtz(uint3);
+char3 __ovld __cnfn convert_char3_sat_rtz(uint3);
+char3 __ovld __cnfn convert_char3_rtp(uint3);
+char3 __ovld __cnfn convert_char3_sat_rtp(uint3);
+char3 __ovld __cnfn convert_char3_rtn(uint3);
+char3 __ovld __cnfn convert_char3_sat_rtn(uint3);
+char3 __ovld __cnfn convert_char3(uint3);
+char3 __ovld __cnfn convert_char3_sat(uint3);
+char3 __ovld __cnfn convert_char3_rte(long3);
+char3 __ovld __cnfn convert_char3_sat_rte(long3);
+char3 __ovld __cnfn convert_char3_rtz(long3);
+char3 __ovld __cnfn convert_char3_sat_rtz(long3);
+char3 __ovld __cnfn convert_char3_rtp(long3);
+char3 __ovld __cnfn convert_char3_sat_rtp(long3);
+char3 __ovld __cnfn convert_char3_rtn(long3);
+char3 __ovld __cnfn convert_char3_sat_rtn(long3);
+char3 __ovld __cnfn convert_char3(long3);
+char3 __ovld __cnfn convert_char3_sat(long3);
+char3 __ovld __cnfn convert_char3_rte(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rte(ulong3);
+char3 __ovld __cnfn convert_char3_rtz(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rtz(ulong3);
+char3 __ovld __cnfn convert_char3_rtp(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rtp(ulong3);
+char3 __ovld __cnfn convert_char3_rtn(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rtn(ulong3);
+char3 __ovld __cnfn convert_char3(ulong3);
+char3 __ovld __cnfn convert_char3_sat(ulong3);
+char3 __ovld __cnfn convert_char3_rte(float3);
+char3 __ovld __cnfn convert_char3_sat_rte(float3);
+char3 __ovld __cnfn convert_char3_rtz(float3);
+char3 __ovld __cnfn convert_char3_sat_rtz(float3);
+char3 __ovld __cnfn convert_char3_rtp(float3);
+char3 __ovld __cnfn convert_char3_sat_rtp(float3);
+char3 __ovld __cnfn convert_char3_rtn(float3);
+char3 __ovld __cnfn convert_char3_sat_rtn(float3);
+char3 __ovld __cnfn convert_char3(float3);
+char3 __ovld __cnfn convert_char3_sat(float3);
+uchar3 __ovld __cnfn convert_uchar3_rte(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(char3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(char3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(char3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(char3);
+uchar3 __ovld __cnfn convert_uchar3(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat(char3);
+uchar3 __ovld __cnfn convert_uchar3_rte(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(uchar3);
+uchar3 __ovld __cnfn convert_uchar3(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rte(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(short3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(short3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(short3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(short3);
+uchar3 __ovld __cnfn convert_uchar3(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat(short3);
+uchar3 __ovld __cnfn convert_uchar3_rte(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(ushort3);
+uchar3 __ovld __cnfn convert_uchar3(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rte(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(int3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(int3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(int3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(int3);
+uchar3 __ovld __cnfn convert_uchar3(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat(int3);
+uchar3 __ovld __cnfn convert_uchar3_rte(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(uint3);
+uchar3 __ovld __cnfn convert_uchar3(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rte(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(long3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(long3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(long3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(long3);
+uchar3 __ovld __cnfn convert_uchar3(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat(long3);
+uchar3 __ovld __cnfn convert_uchar3_rte(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(ulong3);
+uchar3 __ovld __cnfn convert_uchar3(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rte(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(float3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(float3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(float3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(float3);
+uchar3 __ovld __cnfn convert_uchar3(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat(float3);
+short3 __ovld __cnfn convert_short3_rte(char3);
+short3 __ovld __cnfn convert_short3_sat_rte(char3);
+short3 __ovld __cnfn convert_short3_rtz(char3);
+short3 __ovld __cnfn convert_short3_sat_rtz(char3);
+short3 __ovld __cnfn convert_short3_rtp(char3);
+short3 __ovld __cnfn convert_short3_sat_rtp(char3);
+short3 __ovld __cnfn convert_short3_rtn(char3);
+short3 __ovld __cnfn convert_short3_sat_rtn(char3);
+short3 __ovld __cnfn convert_short3(char3);
+short3 __ovld __cnfn convert_short3_sat(char3);
+short3 __ovld __cnfn convert_short3_rte(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rte(uchar3);
+short3 __ovld __cnfn convert_short3_rtz(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rtz(uchar3);
+short3 __ovld __cnfn convert_short3_rtp(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rtp(uchar3);
+short3 __ovld __cnfn convert_short3_rtn(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rtn(uchar3);
+short3 __ovld __cnfn convert_short3(uchar3);
+short3 __ovld __cnfn convert_short3_sat(uchar3);
+short3 __ovld __cnfn convert_short3_rte(short3);
+short3 __ovld __cnfn convert_short3_sat_rte(short3);
+short3 __ovld __cnfn convert_short3_rtz(short3);
+short3 __ovld __cnfn convert_short3_sat_rtz(short3);
+short3 __ovld __cnfn convert_short3_rtp(short3);
+short3 __ovld __cnfn convert_short3_sat_rtp(short3);
+short3 __ovld __cnfn convert_short3_rtn(short3);
+short3 __ovld __cnfn convert_short3_sat_rtn(short3);
+short3 __ovld __cnfn convert_short3(short3);
+short3 __ovld __cnfn convert_short3_sat(short3);
+short3 __ovld __cnfn convert_short3_rte(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rte(ushort3);
+short3 __ovld __cnfn convert_short3_rtz(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rtz(ushort3);
+short3 __ovld __cnfn convert_short3_rtp(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rtp(ushort3);
+short3 __ovld __cnfn convert_short3_rtn(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rtn(ushort3);
+short3 __ovld __cnfn convert_short3(ushort3);
+short3 __ovld __cnfn convert_short3_sat(ushort3);
+short3 __ovld __cnfn convert_short3_rte(int3);
+short3 __ovld __cnfn convert_short3_sat_rte(int3);
+short3 __ovld __cnfn convert_short3_rtz(int3);
+short3 __ovld __cnfn convert_short3_sat_rtz(int3);
+short3 __ovld __cnfn convert_short3_rtp(int3);
+short3 __ovld __cnfn convert_short3_sat_rtp(int3);
+short3 __ovld __cnfn convert_short3_rtn(int3);
+short3 __ovld __cnfn convert_short3_sat_rtn(int3);
+short3 __ovld __cnfn convert_short3(int3);
+short3 __ovld __cnfn convert_short3_sat(int3);
+short3 __ovld __cnfn convert_short3_rte(uint3);
+short3 __ovld __cnfn convert_short3_sat_rte(uint3);
+short3 __ovld __cnfn convert_short3_rtz(uint3);
+short3 __ovld __cnfn convert_short3_sat_rtz(uint3);
+short3 __ovld __cnfn convert_short3_rtp(uint3);
+short3 __ovld __cnfn convert_short3_sat_rtp(uint3);
+short3 __ovld __cnfn convert_short3_rtn(uint3);
+short3 __ovld __cnfn convert_short3_sat_rtn(uint3);
+short3 __ovld __cnfn convert_short3(uint3);
+short3 __ovld __cnfn convert_short3_sat(uint3);
+short3 __ovld __cnfn convert_short3_rte(long3);
+short3 __ovld __cnfn convert_short3_sat_rte(long3);
+short3 __ovld __cnfn convert_short3_rtz(long3);
+short3 __ovld __cnfn convert_short3_sat_rtz(long3);
+short3 __ovld __cnfn convert_short3_rtp(long3);
+short3 __ovld __cnfn convert_short3_sat_rtp(long3);
+short3 __ovld __cnfn convert_short3_rtn(long3);
+short3 __ovld __cnfn convert_short3_sat_rtn(long3);
+short3 __ovld __cnfn convert_short3(long3);
+short3 __ovld __cnfn convert_short3_sat(long3);
+short3 __ovld __cnfn convert_short3_rte(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rte(ulong3);
+short3 __ovld __cnfn convert_short3_rtz(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rtz(ulong3);
+short3 __ovld __cnfn convert_short3_rtp(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rtp(ulong3);
+short3 __ovld __cnfn convert_short3_rtn(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rtn(ulong3);
+short3 __ovld __cnfn convert_short3(ulong3);
+short3 __ovld __cnfn convert_short3_sat(ulong3);
+short3 __ovld __cnfn convert_short3_rte(float3);
+short3 __ovld __cnfn convert_short3_sat_rte(float3);
+short3 __ovld __cnfn convert_short3_rtz(float3);
+short3 __ovld __cnfn convert_short3_sat_rtz(float3);
+short3 __ovld __cnfn convert_short3_rtp(float3);
+short3 __ovld __cnfn convert_short3_sat_rtp(float3);
+short3 __ovld __cnfn convert_short3_rtn(float3);
+short3 __ovld __cnfn convert_short3_sat_rtn(float3);
+short3 __ovld __cnfn convert_short3(float3);
+short3 __ovld __cnfn convert_short3_sat(float3);
+ushort3 __ovld __cnfn convert_ushort3_rte(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(char3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(char3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(char3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(char3);
+ushort3 __ovld __cnfn convert_ushort3(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat(char3);
+ushort3 __ovld __cnfn convert_ushort3_rte(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(uchar3);
+ushort3 __ovld __cnfn convert_ushort3(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rte(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(short3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(short3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(short3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(short3);
+ushort3 __ovld __cnfn convert_ushort3(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat(short3);
+ushort3 __ovld __cnfn convert_ushort3_rte(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(ushort3);
+ushort3 __ovld __cnfn convert_ushort3(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rte(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(int3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(int3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(int3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(int3);
+ushort3 __ovld __cnfn convert_ushort3(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat(int3);
+ushort3 __ovld __cnfn convert_ushort3_rte(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(uint3);
+ushort3 __ovld __cnfn convert_ushort3(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rte(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(long3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(long3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(long3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(long3);
+ushort3 __ovld __cnfn convert_ushort3(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat(long3);
+ushort3 __ovld __cnfn convert_ushort3_rte(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(ulong3);
+ushort3 __ovld __cnfn convert_ushort3(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rte(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(float3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(float3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(float3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(float3);
+ushort3 __ovld __cnfn convert_ushort3(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat(float3);
+int3 __ovld __cnfn convert_int3_rte(char3);
+int3 __ovld __cnfn convert_int3_sat_rte(char3);
+int3 __ovld __cnfn convert_int3_rtz(char3);
+int3 __ovld __cnfn convert_int3_sat_rtz(char3);
+int3 __ovld __cnfn convert_int3_rtp(char3);
+int3 __ovld __cnfn convert_int3_sat_rtp(char3);
+int3 __ovld __cnfn convert_int3_rtn(char3);
+int3 __ovld __cnfn convert_int3_sat_rtn(char3);
+int3 __ovld __cnfn convert_int3(char3);
+int3 __ovld __cnfn convert_int3_sat(char3);
+int3 __ovld __cnfn convert_int3_rte(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rte(uchar3);
+int3 __ovld __cnfn convert_int3_rtz(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rtz(uchar3);
+int3 __ovld __cnfn convert_int3_rtp(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rtp(uchar3);
+int3 __ovld __cnfn convert_int3_rtn(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rtn(uchar3);
+int3 __ovld __cnfn convert_int3(uchar3);
+int3 __ovld __cnfn convert_int3_sat(uchar3);
+int3 __ovld __cnfn convert_int3_rte(short3);
+int3 __ovld __cnfn convert_int3_sat_rte(short3);
+int3 __ovld __cnfn convert_int3_rtz(short3);
+int3 __ovld __cnfn convert_int3_sat_rtz(short3);
+int3 __ovld __cnfn convert_int3_rtp(short3);
+int3 __ovld __cnfn convert_int3_sat_rtp(short3);
+int3 __ovld __cnfn convert_int3_rtn(short3);
+int3 __ovld __cnfn convert_int3_sat_rtn(short3);
+int3 __ovld __cnfn convert_int3(short3);
+int3 __ovld __cnfn convert_int3_sat(short3);
+int3 __ovld __cnfn convert_int3_rte(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rte(ushort3);
+int3 __ovld __cnfn convert_int3_rtz(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rtz(ushort3);
+int3 __ovld __cnfn convert_int3_rtp(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rtp(ushort3);
+int3 __ovld __cnfn convert_int3_rtn(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rtn(ushort3);
+int3 __ovld __cnfn convert_int3(ushort3);
+int3 __ovld __cnfn convert_int3_sat(ushort3);
+int3 __ovld __cnfn convert_int3_rte(int3);
+int3 __ovld __cnfn convert_int3_sat_rte(int3);
+int3 __ovld __cnfn convert_int3_rtz(int3);
+int3 __ovld __cnfn convert_int3_sat_rtz(int3);
+int3 __ovld __cnfn convert_int3_rtp(int3);
+int3 __ovld __cnfn convert_int3_sat_rtp(int3);
+int3 __ovld __cnfn convert_int3_rtn(int3);
+int3 __ovld __cnfn convert_int3_sat_rtn(int3);
+int3 __ovld __cnfn convert_int3(int3);
+int3 __ovld __cnfn convert_int3_sat(int3);
+int3 __ovld __cnfn convert_int3_rte(uint3);
+int3 __ovld __cnfn convert_int3_sat_rte(uint3);
+int3 __ovld __cnfn convert_int3_rtz(uint3);
+int3 __ovld __cnfn convert_int3_sat_rtz(uint3);
+int3 __ovld __cnfn convert_int3_rtp(uint3);
+int3 __ovld __cnfn convert_int3_sat_rtp(uint3);
+int3 __ovld __cnfn convert_int3_rtn(uint3);
+int3 __ovld __cnfn convert_int3_sat_rtn(uint3);
+int3 __ovld __cnfn convert_int3(uint3);
+int3 __ovld __cnfn convert_int3_sat(uint3);
+int3 __ovld __cnfn convert_int3_rte(long3);
+int3 __ovld __cnfn convert_int3_sat_rte(long3);
+int3 __ovld __cnfn convert_int3_rtz(long3);
+int3 __ovld __cnfn convert_int3_sat_rtz(long3);
+int3 __ovld __cnfn convert_int3_rtp(long3);
+int3 __ovld __cnfn convert_int3_sat_rtp(long3);
+int3 __ovld __cnfn convert_int3_rtn(long3);
+int3 __ovld __cnfn convert_int3_sat_rtn(long3);
+int3 __ovld __cnfn convert_int3(long3);
+int3 __ovld __cnfn convert_int3_sat(long3);
+int3 __ovld __cnfn convert_int3_rte(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rte(ulong3);
+int3 __ovld __cnfn convert_int3_rtz(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rtz(ulong3);
+int3 __ovld __cnfn convert_int3_rtp(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rtp(ulong3);
+int3 __ovld __cnfn convert_int3_rtn(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rtn(ulong3);
+int3 __ovld __cnfn convert_int3(ulong3);
+int3 __ovld __cnfn convert_int3_sat(ulong3);
+int3 __ovld __cnfn convert_int3_rte(float3);
+int3 __ovld __cnfn convert_int3_sat_rte(float3);
+int3 __ovld __cnfn convert_int3_rtz(float3);
+int3 __ovld __cnfn convert_int3_sat_rtz(float3);
+int3 __ovld __cnfn convert_int3_rtp(float3);
+int3 __ovld __cnfn convert_int3_sat_rtp(float3);
+int3 __ovld __cnfn convert_int3_rtn(float3);
+int3 __ovld __cnfn convert_int3_sat_rtn(float3);
+int3 __ovld __cnfn convert_int3(float3);
+int3 __ovld __cnfn convert_int3_sat(float3);
+uint3 __ovld __cnfn convert_uint3_rte(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(char3);
+uint3 __ovld __cnfn convert_uint3_rtz(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(char3);
+uint3 __ovld __cnfn convert_uint3_rtp(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(char3);
+uint3 __ovld __cnfn convert_uint3_rtn(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(char3);
+uint3 __ovld __cnfn convert_uint3(char3);
+uint3 __ovld __cnfn convert_uint3_sat(char3);
+uint3 __ovld __cnfn convert_uint3_rte(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(uchar3);
+uint3 __ovld __cnfn convert_uint3_rtz(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(uchar3);
+uint3 __ovld __cnfn convert_uint3_rtp(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(uchar3);
+uint3 __ovld __cnfn convert_uint3_rtn(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(uchar3);
+uint3 __ovld __cnfn convert_uint3(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat(uchar3);
+uint3 __ovld __cnfn convert_uint3_rte(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(short3);
+uint3 __ovld __cnfn convert_uint3_rtz(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(short3);
+uint3 __ovld __cnfn convert_uint3_rtp(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(short3);
+uint3 __ovld __cnfn convert_uint3_rtn(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(short3);
+uint3 __ovld __cnfn convert_uint3(short3);
+uint3 __ovld __cnfn convert_uint3_sat(short3);
+uint3 __ovld __cnfn convert_uint3_rte(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(ushort3);
+uint3 __ovld __cnfn convert_uint3_rtz(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(ushort3);
+uint3 __ovld __cnfn convert_uint3_rtp(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(ushort3);
+uint3 __ovld __cnfn convert_uint3_rtn(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(ushort3);
+uint3 __ovld __cnfn convert_uint3(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat(ushort3);
+uint3 __ovld __cnfn convert_uint3_rte(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(int3);
+uint3 __ovld __cnfn convert_uint3_rtz(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(int3);
+uint3 __ovld __cnfn convert_uint3_rtp(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(int3);
+uint3 __ovld __cnfn convert_uint3_rtn(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(int3);
+uint3 __ovld __cnfn convert_uint3(int3);
+uint3 __ovld __cnfn convert_uint3_sat(int3);
+uint3 __ovld __cnfn convert_uint3_rte(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(uint3);
+uint3 __ovld __cnfn convert_uint3_rtz(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(uint3);
+uint3 __ovld __cnfn convert_uint3_rtp(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(uint3);
+uint3 __ovld __cnfn convert_uint3_rtn(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(uint3);
+uint3 __ovld __cnfn convert_uint3(uint3);
+uint3 __ovld __cnfn convert_uint3_sat(uint3);
+uint3 __ovld __cnfn convert_uint3_rte(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(long3);
+uint3 __ovld __cnfn convert_uint3_rtz(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(long3);
+uint3 __ovld __cnfn convert_uint3_rtp(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(long3);
+uint3 __ovld __cnfn convert_uint3_rtn(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(long3);
+uint3 __ovld __cnfn convert_uint3(long3);
+uint3 __ovld __cnfn convert_uint3_sat(long3);
+uint3 __ovld __cnfn convert_uint3_rte(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(ulong3);
+uint3 __ovld __cnfn convert_uint3_rtz(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(ulong3);
+uint3 __ovld __cnfn convert_uint3_rtp(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(ulong3);
+uint3 __ovld __cnfn convert_uint3_rtn(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(ulong3);
+uint3 __ovld __cnfn convert_uint3(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat(ulong3);
+uint3 __ovld __cnfn convert_uint3_rte(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(float3);
+uint3 __ovld __cnfn convert_uint3_rtz(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(float3);
+uint3 __ovld __cnfn convert_uint3_rtp(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(float3);
+uint3 __ovld __cnfn convert_uint3_rtn(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(float3);
+uint3 __ovld __cnfn convert_uint3(float3);
+uint3 __ovld __cnfn convert_uint3_sat(float3);
+long3 __ovld __cnfn convert_long3_rte(char3);
+long3 __ovld __cnfn convert_long3_sat_rte(char3);
+long3 __ovld __cnfn convert_long3_rtz(char3);
+long3 __ovld __cnfn convert_long3_sat_rtz(char3);
+long3 __ovld __cnfn convert_long3_rtp(char3);
+long3 __ovld __cnfn convert_long3_sat_rtp(char3);
+long3 __ovld __cnfn convert_long3_rtn(char3);
+long3 __ovld __cnfn convert_long3_sat_rtn(char3);
+long3 __ovld __cnfn convert_long3(char3);
+long3 __ovld __cnfn convert_long3_sat(char3);
+long3 __ovld __cnfn convert_long3_rte(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rte(uchar3);
+long3 __ovld __cnfn convert_long3_rtz(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rtz(uchar3);
+long3 __ovld __cnfn convert_long3_rtp(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rtp(uchar3);
+long3 __ovld __cnfn convert_long3_rtn(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rtn(uchar3);
+long3 __ovld __cnfn convert_long3(uchar3);
+long3 __ovld __cnfn convert_long3_sat(uchar3);
+long3 __ovld __cnfn convert_long3_rte(short3);
+long3 __ovld __cnfn convert_long3_sat_rte(short3);
+long3 __ovld __cnfn convert_long3_rtz(short3);
+long3 __ovld __cnfn convert_long3_sat_rtz(short3);
+long3 __ovld __cnfn convert_long3_rtp(short3);
+long3 __ovld __cnfn convert_long3_sat_rtp(short3);
+long3 __ovld __cnfn convert_long3_rtn(short3);
+long3 __ovld __cnfn convert_long3_sat_rtn(short3);
+long3 __ovld __cnfn convert_long3(short3);
+long3 __ovld __cnfn convert_long3_sat(short3);
+long3 __ovld __cnfn convert_long3_rte(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rte(ushort3);
+long3 __ovld __cnfn convert_long3_rtz(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rtz(ushort3);
+long3 __ovld __cnfn convert_long3_rtp(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rtp(ushort3);
+long3 __ovld __cnfn convert_long3_rtn(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rtn(ushort3);
+long3 __ovld __cnfn convert_long3(ushort3);
+long3 __ovld __cnfn convert_long3_sat(ushort3);
+long3 __ovld __cnfn convert_long3_rte(int3);
+long3 __ovld __cnfn convert_long3_sat_rte(int3);
+long3 __ovld __cnfn convert_long3_rtz(int3);
+long3 __ovld __cnfn convert_long3_sat_rtz(int3);
+long3 __ovld __cnfn convert_long3_rtp(int3);
+long3 __ovld __cnfn convert_long3_sat_rtp(int3);
+long3 __ovld __cnfn convert_long3_rtn(int3);
+long3 __ovld __cnfn convert_long3_sat_rtn(int3);
+long3 __ovld __cnfn convert_long3(int3);
+long3 __ovld __cnfn convert_long3_sat(int3);
+long3 __ovld __cnfn convert_long3_rte(uint3);
+long3 __ovld __cnfn convert_long3_sat_rte(uint3);
+long3 __ovld __cnfn convert_long3_rtz(uint3);
+long3 __ovld __cnfn convert_long3_sat_rtz(uint3);
+long3 __ovld __cnfn convert_long3_rtp(uint3);
+long3 __ovld __cnfn convert_long3_sat_rtp(uint3);
+long3 __ovld __cnfn convert_long3_rtn(uint3);
+long3 __ovld __cnfn convert_long3_sat_rtn(uint3);
+long3 __ovld __cnfn convert_long3(uint3);
+long3 __ovld __cnfn convert_long3_sat(uint3);
+long3 __ovld __cnfn convert_long3_rte(long3);
+long3 __ovld __cnfn convert_long3_sat_rte(long3);
+long3 __ovld __cnfn convert_long3_rtz(long3);
+long3 __ovld __cnfn convert_long3_sat_rtz(long3);
+long3 __ovld __cnfn convert_long3_rtp(long3);
+long3 __ovld __cnfn convert_long3_sat_rtp(long3);
+long3 __ovld __cnfn convert_long3_rtn(long3);
+long3 __ovld __cnfn convert_long3_sat_rtn(long3);
+long3 __ovld __cnfn convert_long3(long3);
+long3 __ovld __cnfn convert_long3_sat(long3);
+long3 __ovld __cnfn convert_long3_rte(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rte(ulong3);
+long3 __ovld __cnfn convert_long3_rtz(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rtz(ulong3);
+long3 __ovld __cnfn convert_long3_rtp(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rtp(ulong3);
+long3 __ovld __cnfn convert_long3_rtn(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rtn(ulong3);
+long3 __ovld __cnfn convert_long3(ulong3);
+long3 __ovld __cnfn convert_long3_sat(ulong3);
+long3 __ovld __cnfn convert_long3_rte(float3);
+long3 __ovld __cnfn convert_long3_sat_rte(float3);
+long3 __ovld __cnfn convert_long3_rtz(float3);
+long3 __ovld __cnfn convert_long3_sat_rtz(float3);
+long3 __ovld __cnfn convert_long3_rtp(float3);
+long3 __ovld __cnfn convert_long3_sat_rtp(float3);
+long3 __ovld __cnfn convert_long3_rtn(float3);
+long3 __ovld __cnfn convert_long3_sat_rtn(float3);
+long3 __ovld __cnfn convert_long3(float3);
+long3 __ovld __cnfn convert_long3_sat(float3);
+ulong3 __ovld __cnfn convert_ulong3_rte(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(char3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(char3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(char3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(char3);
+ulong3 __ovld __cnfn convert_ulong3(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat(char3);
+ulong3 __ovld __cnfn convert_ulong3_rte(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(uchar3);
+ulong3 __ovld __cnfn convert_ulong3(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rte(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(short3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(short3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(short3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(short3);
+ulong3 __ovld __cnfn convert_ulong3(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat(short3);
+ulong3 __ovld __cnfn convert_ulong3_rte(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(ushort3);
+ulong3 __ovld __cnfn convert_ulong3(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rte(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(int3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(int3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(int3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(int3);
+ulong3 __ovld __cnfn convert_ulong3(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat(int3);
+ulong3 __ovld __cnfn convert_ulong3_rte(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(uint3);
+ulong3 __ovld __cnfn convert_ulong3(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rte(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(long3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(long3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(long3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(long3);
+ulong3 __ovld __cnfn convert_ulong3(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat(long3);
+ulong3 __ovld __cnfn convert_ulong3_rte(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(ulong3);
+ulong3 __ovld __cnfn convert_ulong3(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rte(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(float3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(float3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(float3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(float3);
+ulong3 __ovld __cnfn convert_ulong3(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat(float3);
+float3 __ovld __cnfn convert_float3_rte(char3);
+float3 __ovld __cnfn convert_float3_rtz(char3);
+float3 __ovld __cnfn convert_float3_rtp(char3);
+float3 __ovld __cnfn convert_float3_rtn(char3);
+float3 __ovld __cnfn convert_float3(char3);
+float3 __ovld __cnfn convert_float3_rte(uchar3);
+float3 __ovld __cnfn convert_float3_rtz(uchar3);
+float3 __ovld __cnfn convert_float3_rtp(uchar3);
+float3 __ovld __cnfn convert_float3_rtn(uchar3);
+float3 __ovld __cnfn convert_float3(uchar3);
+float3 __ovld __cnfn convert_float3_rte(short3);
+float3 __ovld __cnfn convert_float3_rtz(short3);
+float3 __ovld __cnfn convert_float3_rtp(short3);
+float3 __ovld __cnfn convert_float3_rtn(short3);
+float3 __ovld __cnfn convert_float3(short3);
+float3 __ovld __cnfn convert_float3_rte(ushort3);
+float3 __ovld __cnfn convert_float3_rtz(ushort3);
+float3 __ovld __cnfn convert_float3_rtp(ushort3);
+float3 __ovld __cnfn convert_float3_rtn(ushort3);
+float3 __ovld __cnfn convert_float3(ushort3);
+float3 __ovld __cnfn convert_float3_rte(int3);
+float3 __ovld __cnfn convert_float3_rtz(int3);
+float3 __ovld __cnfn convert_float3_rtp(int3);
+float3 __ovld __cnfn convert_float3_rtn(int3);
+float3 __ovld __cnfn convert_float3(int3);
+float3 __ovld __cnfn convert_float3_rte(uint3);
+float3 __ovld __cnfn convert_float3_rtz(uint3);
+float3 __ovld __cnfn convert_float3_rtp(uint3);
+float3 __ovld __cnfn convert_float3_rtn(uint3);
+float3 __ovld __cnfn convert_float3(uint3);
+float3 __ovld __cnfn convert_float3_rte(long3);
+float3 __ovld __cnfn convert_float3_rtz(long3);
+float3 __ovld __cnfn convert_float3_rtp(long3);
+float3 __ovld __cnfn convert_float3_rtn(long3);
+float3 __ovld __cnfn convert_float3(long3);
+float3 __ovld __cnfn convert_float3_rte(ulong3);
+float3 __ovld __cnfn convert_float3_rtz(ulong3);
+float3 __ovld __cnfn convert_float3_rtp(ulong3);
+float3 __ovld __cnfn convert_float3_rtn(ulong3);
+float3 __ovld __cnfn convert_float3(ulong3);
+float3 __ovld __cnfn convert_float3_rte(float3);
+float3 __ovld __cnfn convert_float3_rtz(float3);
+float3 __ovld __cnfn convert_float3_rtp(float3);
+float3 __ovld __cnfn convert_float3_rtn(float3);
+float3 __ovld __cnfn convert_float3(float3);
+char4 __ovld __cnfn convert_char4_rte(char4);
+char4 __ovld __cnfn convert_char4_sat_rte(char4);
+char4 __ovld __cnfn convert_char4_rtz(char4);
+char4 __ovld __cnfn convert_char4_sat_rtz(char4);
+char4 __ovld __cnfn convert_char4_rtp(char4);
+char4 __ovld __cnfn convert_char4_sat_rtp(char4);
+char4 __ovld __cnfn convert_char4_rtn(char4);
+char4 __ovld __cnfn convert_char4_sat_rtn(char4);
+char4 __ovld __cnfn convert_char4(char4);
+char4 __ovld __cnfn convert_char4_sat(char4);
+char4 __ovld __cnfn convert_char4_rte(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rte(uchar4);
+char4 __ovld __cnfn convert_char4_rtz(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rtz(uchar4);
+char4 __ovld __cnfn convert_char4_rtp(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rtp(uchar4);
+char4 __ovld __cnfn convert_char4_rtn(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rtn(uchar4);
+char4 __ovld __cnfn convert_char4(uchar4);
+char4 __ovld __cnfn convert_char4_sat(uchar4);
+char4 __ovld __cnfn convert_char4_rte(short4);
+char4 __ovld __cnfn convert_char4_sat_rte(short4);
+char4 __ovld __cnfn convert_char4_rtz(short4);
+char4 __ovld __cnfn convert_char4_sat_rtz(short4);
+char4 __ovld __cnfn convert_char4_rtp(short4);
+char4 __ovld __cnfn convert_char4_sat_rtp(short4);
+char4 __ovld __cnfn convert_char4_rtn(short4);
+char4 __ovld __cnfn convert_char4_sat_rtn(short4);
+char4 __ovld __cnfn convert_char4(short4);
+char4 __ovld __cnfn convert_char4_sat(short4);
+char4 __ovld __cnfn convert_char4_rte(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rte(ushort4);
+char4 __ovld __cnfn convert_char4_rtz(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rtz(ushort4);
+char4 __ovld __cnfn convert_char4_rtp(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rtp(ushort4);
+char4 __ovld __cnfn convert_char4_rtn(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rtn(ushort4);
+char4 __ovld __cnfn convert_char4(ushort4);
+char4 __ovld __cnfn convert_char4_sat(ushort4);
+char4 __ovld __cnfn convert_char4_rte(int4);
+char4 __ovld __cnfn convert_char4_sat_rte(int4);
+char4 __ovld __cnfn convert_char4_rtz(int4);
+char4 __ovld __cnfn convert_char4_sat_rtz(int4);
+char4 __ovld __cnfn convert_char4_rtp(int4);
+char4 __ovld __cnfn convert_char4_sat_rtp(int4);
+char4 __ovld __cnfn convert_char4_rtn(int4);
+char4 __ovld __cnfn convert_char4_sat_rtn(int4);
+char4 __ovld __cnfn convert_char4(int4);
+char4 __ovld __cnfn convert_char4_sat(int4);
+char4 __ovld __cnfn convert_char4_rte(uint4);
+char4 __ovld __cnfn convert_char4_sat_rte(uint4);
+char4 __ovld __cnfn convert_char4_rtz(uint4);
+char4 __ovld __cnfn convert_char4_sat_rtz(uint4);
+char4 __ovld __cnfn convert_char4_rtp(uint4);
+char4 __ovld __cnfn convert_char4_sat_rtp(uint4);
+char4 __ovld __cnfn convert_char4_rtn(uint4);
+char4 __ovld __cnfn convert_char4_sat_rtn(uint4);
+char4 __ovld __cnfn convert_char4(uint4);
+char4 __ovld __cnfn convert_char4_sat(uint4);
+char4 __ovld __cnfn convert_char4_rte(long4);
+char4 __ovld __cnfn convert_char4_sat_rte(long4);
+char4 __ovld __cnfn convert_char4_rtz(long4);
+char4 __ovld __cnfn convert_char4_sat_rtz(long4);
+char4 __ovld __cnfn convert_char4_rtp(long4);
+char4 __ovld __cnfn convert_char4_sat_rtp(long4);
+char4 __ovld __cnfn convert_char4_rtn(long4);
+char4 __ovld __cnfn convert_char4_sat_rtn(long4);
+char4 __ovld __cnfn convert_char4(long4);
+char4 __ovld __cnfn convert_char4_sat(long4);
+char4 __ovld __cnfn convert_char4_rte(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rte(ulong4);
+char4 __ovld __cnfn convert_char4_rtz(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rtz(ulong4);
+char4 __ovld __cnfn convert_char4_rtp(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rtp(ulong4);
+char4 __ovld __cnfn convert_char4_rtn(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rtn(ulong4);
+char4 __ovld __cnfn convert_char4(ulong4);
+char4 __ovld __cnfn convert_char4_sat(ulong4);
+char4 __ovld __cnfn convert_char4_rte(float4);
+char4 __ovld __cnfn convert_char4_sat_rte(float4);
+char4 __ovld __cnfn convert_char4_rtz(float4);
+char4 __ovld __cnfn convert_char4_sat_rtz(float4);
+char4 __ovld __cnfn convert_char4_rtp(float4);
+char4 __ovld __cnfn convert_char4_sat_rtp(float4);
+char4 __ovld __cnfn convert_char4_rtn(float4);
+char4 __ovld __cnfn convert_char4_sat_rtn(float4);
+char4 __ovld __cnfn convert_char4(float4);
+char4 __ovld __cnfn convert_char4_sat(float4);
+uchar4 __ovld __cnfn convert_uchar4_rte(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(char4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(char4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(char4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(char4);
+uchar4 __ovld __cnfn convert_uchar4(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat(char4);
+uchar4 __ovld __cnfn convert_uchar4_rte(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(uchar4);
+uchar4 __ovld __cnfn convert_uchar4(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rte(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(short4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(short4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(short4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(short4);
+uchar4 __ovld __cnfn convert_uchar4(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat(short4);
+uchar4 __ovld __cnfn convert_uchar4_rte(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(ushort4);
+uchar4 __ovld __cnfn convert_uchar4(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rte(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(int4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(int4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(int4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(int4);
+uchar4 __ovld __cnfn convert_uchar4(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat(int4);
+uchar4 __ovld __cnfn convert_uchar4_rte(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(uint4);
+uchar4 __ovld __cnfn convert_uchar4(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rte(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(long4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(long4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(long4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(long4);
+uchar4 __ovld __cnfn convert_uchar4(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat(long4);
+uchar4 __ovld __cnfn convert_uchar4_rte(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(ulong4);
+uchar4 __ovld __cnfn convert_uchar4(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rte(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(float4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(float4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(float4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(float4);
+uchar4 __ovld __cnfn convert_uchar4(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat(float4);
+short4 __ovld __cnfn convert_short4_rte(char4);
+short4 __ovld __cnfn convert_short4_sat_rte(char4);
+short4 __ovld __cnfn convert_short4_rtz(char4);
+short4 __ovld __cnfn convert_short4_sat_rtz(char4);
+short4 __ovld __cnfn convert_short4_rtp(char4);
+short4 __ovld __cnfn convert_short4_sat_rtp(char4);
+short4 __ovld __cnfn convert_short4_rtn(char4);
+short4 __ovld __cnfn convert_short4_sat_rtn(char4);
+short4 __ovld __cnfn convert_short4(char4);
+short4 __ovld __cnfn convert_short4_sat(char4);
+short4 __ovld __cnfn convert_short4_rte(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rte(uchar4);
+short4 __ovld __cnfn convert_short4_rtz(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rtz(uchar4);
+short4 __ovld __cnfn convert_short4_rtp(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rtp(uchar4);
+short4 __ovld __cnfn convert_short4_rtn(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rtn(uchar4);
+short4 __ovld __cnfn convert_short4(uchar4);
+short4 __ovld __cnfn convert_short4_sat(uchar4);
+short4 __ovld __cnfn convert_short4_rte(short4);
+short4 __ovld __cnfn convert_short4_sat_rte(short4);
+short4 __ovld __cnfn convert_short4_rtz(short4);
+short4 __ovld __cnfn convert_short4_sat_rtz(short4);
+short4 __ovld __cnfn convert_short4_rtp(short4);
+short4 __ovld __cnfn convert_short4_sat_rtp(short4);
+short4 __ovld __cnfn convert_short4_rtn(short4);
+short4 __ovld __cnfn convert_short4_sat_rtn(short4);
+short4 __ovld __cnfn convert_short4(short4);
+short4 __ovld __cnfn convert_short4_sat(short4);
+short4 __ovld __cnfn convert_short4_rte(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rte(ushort4);
+short4 __ovld __cnfn convert_short4_rtz(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rtz(ushort4);
+short4 __ovld __cnfn convert_short4_rtp(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rtp(ushort4);
+short4 __ovld __cnfn convert_short4_rtn(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rtn(ushort4);
+short4 __ovld __cnfn convert_short4(ushort4);
+short4 __ovld __cnfn convert_short4_sat(ushort4);
+short4 __ovld __cnfn convert_short4_rte(int4);
+short4 __ovld __cnfn convert_short4_sat_rte(int4);
+short4 __ovld __cnfn convert_short4_rtz(int4);
+short4 __ovld __cnfn convert_short4_sat_rtz(int4);
+short4 __ovld __cnfn convert_short4_rtp(int4);
+short4 __ovld __cnfn convert_short4_sat_rtp(int4);
+short4 __ovld __cnfn convert_short4_rtn(int4);
+short4 __ovld __cnfn convert_short4_sat_rtn(int4);
+short4 __ovld __cnfn convert_short4(int4);
+short4 __ovld __cnfn convert_short4_sat(int4);
+short4 __ovld __cnfn convert_short4_rte(uint4);
+short4 __ovld __cnfn convert_short4_sat_rte(uint4);
+short4 __ovld __cnfn convert_short4_rtz(uint4);
+short4 __ovld __cnfn convert_short4_sat_rtz(uint4);
+short4 __ovld __cnfn convert_short4_rtp(uint4);
+short4 __ovld __cnfn convert_short4_sat_rtp(uint4);
+short4 __ovld __cnfn convert_short4_rtn(uint4);
+short4 __ovld __cnfn convert_short4_sat_rtn(uint4);
+short4 __ovld __cnfn convert_short4(uint4);
+short4 __ovld __cnfn convert_short4_sat(uint4);
+short4 __ovld __cnfn convert_short4_rte(long4);
+short4 __ovld __cnfn convert_short4_sat_rte(long4);
+short4 __ovld __cnfn convert_short4_rtz(long4);
+short4 __ovld __cnfn convert_short4_sat_rtz(long4);
+short4 __ovld __cnfn convert_short4_rtp(long4);
+short4 __ovld __cnfn convert_short4_sat_rtp(long4);
+short4 __ovld __cnfn convert_short4_rtn(long4);
+short4 __ovld __cnfn convert_short4_sat_rtn(long4);
+short4 __ovld __cnfn convert_short4(long4);
+short4 __ovld __cnfn convert_short4_sat(long4);
+short4 __ovld __cnfn convert_short4_rte(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rte(ulong4);
+short4 __ovld __cnfn convert_short4_rtz(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rtz(ulong4);
+short4 __ovld __cnfn convert_short4_rtp(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rtp(ulong4);
+short4 __ovld __cnfn convert_short4_rtn(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rtn(ulong4);
+short4 __ovld __cnfn convert_short4(ulong4);
+short4 __ovld __cnfn convert_short4_sat(ulong4);
+short4 __ovld __cnfn convert_short4_rte(float4);
+short4 __ovld __cnfn convert_short4_sat_rte(float4);
+short4 __ovld __cnfn convert_short4_rtz(float4);
+short4 __ovld __cnfn convert_short4_sat_rtz(float4);
+short4 __ovld __cnfn convert_short4_rtp(float4);
+short4 __ovld __cnfn convert_short4_sat_rtp(float4);
+short4 __ovld __cnfn convert_short4_rtn(float4);
+short4 __ovld __cnfn convert_short4_sat_rtn(float4);
+short4 __ovld __cnfn convert_short4(float4);
+short4 __ovld __cnfn convert_short4_sat(float4);
+ushort4 __ovld __cnfn convert_ushort4_rte(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(char4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(char4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(char4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(char4);
+ushort4 __ovld __cnfn convert_ushort4(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat(char4);
+ushort4 __ovld __cnfn convert_ushort4_rte(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(uchar4);
+ushort4 __ovld __cnfn convert_ushort4(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rte(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(short4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(short4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(short4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(short4);
+ushort4 __ovld __cnfn convert_ushort4(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat(short4);
+ushort4 __ovld __cnfn convert_ushort4_rte(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(ushort4);
+ushort4 __ovld __cnfn convert_ushort4(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rte(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(int4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(int4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(int4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(int4);
+ushort4 __ovld __cnfn convert_ushort4(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat(int4);
+ushort4 __ovld __cnfn convert_ushort4_rte(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(uint4);
+ushort4 __ovld __cnfn convert_ushort4(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rte(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(long4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(long4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(long4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(long4);
+ushort4 __ovld __cnfn convert_ushort4(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat(long4);
+ushort4 __ovld __cnfn convert_ushort4_rte(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(ulong4);
+ushort4 __ovld __cnfn convert_ushort4(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rte(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(float4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(float4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(float4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(float4);
+ushort4 __ovld __cnfn convert_ushort4(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat(float4);
+int4 __ovld __cnfn convert_int4_rte(char4);
+int4 __ovld __cnfn convert_int4_sat_rte(char4);
+int4 __ovld __cnfn convert_int4_rtz(char4);
+int4 __ovld __cnfn convert_int4_sat_rtz(char4);
+int4 __ovld __cnfn convert_int4_rtp(char4);
+int4 __ovld __cnfn convert_int4_sat_rtp(char4);
+int4 __ovld __cnfn convert_int4_rtn(char4);
+int4 __ovld __cnfn convert_int4_sat_rtn(char4);
+int4 __ovld __cnfn convert_int4(char4);
+int4 __ovld __cnfn convert_int4_sat(char4);
+int4 __ovld __cnfn convert_int4_rte(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rte(uchar4);
+int4 __ovld __cnfn convert_int4_rtz(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rtz(uchar4);
+int4 __ovld __cnfn convert_int4_rtp(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rtp(uchar4);
+int4 __ovld __cnfn convert_int4_rtn(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rtn(uchar4);
+int4 __ovld __cnfn convert_int4(uchar4);
+int4 __ovld __cnfn convert_int4_sat(uchar4);
+int4 __ovld __cnfn convert_int4_rte(short4);
+int4 __ovld __cnfn convert_int4_sat_rte(short4);
+int4 __ovld __cnfn convert_int4_rtz(short4);
+int4 __ovld __cnfn convert_int4_sat_rtz(short4);
+int4 __ovld __cnfn convert_int4_rtp(short4);
+int4 __ovld __cnfn convert_int4_sat_rtp(short4);
+int4 __ovld __cnfn convert_int4_rtn(short4);
+int4 __ovld __cnfn convert_int4_sat_rtn(short4);
+int4 __ovld __cnfn convert_int4(short4);
+int4 __ovld __cnfn convert_int4_sat(short4);
+int4 __ovld __cnfn convert_int4_rte(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rte(ushort4);
+int4 __ovld __cnfn convert_int4_rtz(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rtz(ushort4);
+int4 __ovld __cnfn convert_int4_rtp(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rtp(ushort4);
+int4 __ovld __cnfn convert_int4_rtn(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rtn(ushort4);
+int4 __ovld __cnfn convert_int4(ushort4);
+int4 __ovld __cnfn convert_int4_sat(ushort4);
+int4 __ovld __cnfn convert_int4_rte(int4);
+int4 __ovld __cnfn convert_int4_sat_rte(int4);
+int4 __ovld __cnfn convert_int4_rtz(int4);
+int4 __ovld __cnfn convert_int4_sat_rtz(int4);
+int4 __ovld __cnfn convert_int4_rtp(int4);
+int4 __ovld __cnfn convert_int4_sat_rtp(int4);
+int4 __ovld __cnfn convert_int4_rtn(int4);
+int4 __ovld __cnfn convert_int4_sat_rtn(int4);
+int4 __ovld __cnfn convert_int4(int4);
+int4 __ovld __cnfn convert_int4_sat(int4);
+int4 __ovld __cnfn convert_int4_rte(uint4);
+int4 __ovld __cnfn convert_int4_sat_rte(uint4);
+int4 __ovld __cnfn convert_int4_rtz(uint4);
+int4 __ovld __cnfn convert_int4_sat_rtz(uint4);
+int4 __ovld __cnfn convert_int4_rtp(uint4);
+int4 __ovld __cnfn convert_int4_sat_rtp(uint4);
+int4 __ovld __cnfn convert_int4_rtn(uint4);
+int4 __ovld __cnfn convert_int4_sat_rtn(uint4);
+int4 __ovld __cnfn convert_int4(uint4);
+int4 __ovld __cnfn convert_int4_sat(uint4);
+int4 __ovld __cnfn convert_int4_rte(long4);
+int4 __ovld __cnfn convert_int4_sat_rte(long4);
+int4 __ovld __cnfn convert_int4_rtz(long4);
+int4 __ovld __cnfn convert_int4_sat_rtz(long4);
+int4 __ovld __cnfn convert_int4_rtp(long4);
+int4 __ovld __cnfn convert_int4_sat_rtp(long4);
+int4 __ovld __cnfn convert_int4_rtn(long4);
+int4 __ovld __cnfn convert_int4_sat_rtn(long4);
+int4 __ovld __cnfn convert_int4(long4);
+int4 __ovld __cnfn convert_int4_sat(long4);
+int4 __ovld __cnfn convert_int4_rte(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rte(ulong4);
+int4 __ovld __cnfn convert_int4_rtz(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rtz(ulong4);
+int4 __ovld __cnfn convert_int4_rtp(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rtp(ulong4);
+int4 __ovld __cnfn convert_int4_rtn(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rtn(ulong4);
+int4 __ovld __cnfn convert_int4(ulong4);
+int4 __ovld __cnfn convert_int4_sat(ulong4);
+int4 __ovld __cnfn convert_int4_rte(float4);
+int4 __ovld __cnfn convert_int4_sat_rte(float4);
+int4 __ovld __cnfn convert_int4_rtz(float4);
+int4 __ovld __cnfn convert_int4_sat_rtz(float4);
+int4 __ovld __cnfn convert_int4_rtp(float4);
+int4 __ovld __cnfn convert_int4_sat_rtp(float4);
+int4 __ovld __cnfn convert_int4_rtn(float4);
+int4 __ovld __cnfn convert_int4_sat_rtn(float4);
+int4 __ovld __cnfn convert_int4(float4);
+int4 __ovld __cnfn convert_int4_sat(float4);
+uint4 __ovld __cnfn convert_uint4_rte(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(char4);
+uint4 __ovld __cnfn convert_uint4_rtz(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(char4);
+uint4 __ovld __cnfn convert_uint4_rtp(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(char4);
+uint4 __ovld __cnfn convert_uint4_rtn(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(char4);
+uint4 __ovld __cnfn convert_uint4(char4);
+uint4 __ovld __cnfn convert_uint4_sat(char4);
+uint4 __ovld __cnfn convert_uint4_rte(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(uchar4);
+uint4 __ovld __cnfn convert_uint4_rtz(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(uchar4);
+uint4 __ovld __cnfn convert_uint4_rtp(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(uchar4);
+uint4 __ovld __cnfn convert_uint4_rtn(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(uchar4);
+uint4 __ovld __cnfn convert_uint4(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat(uchar4);
+uint4 __ovld __cnfn convert_uint4_rte(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(short4);
+uint4 __ovld __cnfn convert_uint4_rtz(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(short4);
+uint4 __ovld __cnfn convert_uint4_rtp(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(short4);
+uint4 __ovld __cnfn convert_uint4_rtn(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(short4);
+uint4 __ovld __cnfn convert_uint4(short4);
+uint4 __ovld __cnfn convert_uint4_sat(short4);
+uint4 __ovld __cnfn convert_uint4_rte(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(ushort4);
+uint4 __ovld __cnfn convert_uint4_rtz(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(ushort4);
+uint4 __ovld __cnfn convert_uint4_rtp(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(ushort4);
+uint4 __ovld __cnfn convert_uint4_rtn(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(ushort4);
+uint4 __ovld __cnfn convert_uint4(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat(ushort4);
+uint4 __ovld __cnfn convert_uint4_rte(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(int4);
+uint4 __ovld __cnfn convert_uint4_rtz(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(int4);
+uint4 __ovld __cnfn convert_uint4_rtp(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(int4);
+uint4 __ovld __cnfn convert_uint4_rtn(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(int4);
+uint4 __ovld __cnfn convert_uint4(int4);
+uint4 __ovld __cnfn convert_uint4_sat(int4);
+uint4 __ovld __cnfn convert_uint4_rte(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(uint4);
+uint4 __ovld __cnfn convert_uint4_rtz(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(uint4);
+uint4 __ovld __cnfn convert_uint4_rtp(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(uint4);
+uint4 __ovld __cnfn convert_uint4_rtn(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(uint4);
+uint4 __ovld __cnfn convert_uint4(uint4);
+uint4 __ovld __cnfn convert_uint4_sat(uint4);
+uint4 __ovld __cnfn convert_uint4_rte(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(long4);
+uint4 __ovld __cnfn convert_uint4_rtz(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(long4);
+uint4 __ovld __cnfn convert_uint4_rtp(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(long4);
+uint4 __ovld __cnfn convert_uint4_rtn(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(long4);
+uint4 __ovld __cnfn convert_uint4(long4);
+uint4 __ovld __cnfn convert_uint4_sat(long4);
+uint4 __ovld __cnfn convert_uint4_rte(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(ulong4);
+uint4 __ovld __cnfn convert_uint4_rtz(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(ulong4);
+uint4 __ovld __cnfn convert_uint4_rtp(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(ulong4);
+uint4 __ovld __cnfn convert_uint4_rtn(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(ulong4);
+uint4 __ovld __cnfn convert_uint4(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat(ulong4);
+uint4 __ovld __cnfn convert_uint4_rte(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(float4);
+uint4 __ovld __cnfn convert_uint4_rtz(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(float4);
+uint4 __ovld __cnfn convert_uint4_rtp(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(float4);
+uint4 __ovld __cnfn convert_uint4_rtn(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(float4);
+uint4 __ovld __cnfn convert_uint4(float4);
+uint4 __ovld __cnfn convert_uint4_sat(float4);
+long4 __ovld __cnfn convert_long4_rte(char4);
+long4 __ovld __cnfn convert_long4_sat_rte(char4);
+long4 __ovld __cnfn convert_long4_rtz(char4);
+long4 __ovld __cnfn convert_long4_sat_rtz(char4);
+long4 __ovld __cnfn convert_long4_rtp(char4);
+long4 __ovld __cnfn convert_long4_sat_rtp(char4);
+long4 __ovld __cnfn convert_long4_rtn(char4);
+long4 __ovld __cnfn convert_long4_sat_rtn(char4);
+long4 __ovld __cnfn convert_long4(char4);
+long4 __ovld __cnfn convert_long4_sat(char4);
+long4 __ovld __cnfn convert_long4_rte(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rte(uchar4);
+long4 __ovld __cnfn convert_long4_rtz(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rtz(uchar4);
+long4 __ovld __cnfn convert_long4_rtp(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rtp(uchar4);
+long4 __ovld __cnfn convert_long4_rtn(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rtn(uchar4);
+long4 __ovld __cnfn convert_long4(uchar4);
+long4 __ovld __cnfn convert_long4_sat(uchar4);
+long4 __ovld __cnfn convert_long4_rte(short4);
+long4 __ovld __cnfn convert_long4_sat_rte(short4);
+long4 __ovld __cnfn convert_long4_rtz(short4);
+long4 __ovld __cnfn convert_long4_sat_rtz(short4);
+long4 __ovld __cnfn convert_long4_rtp(short4);
+long4 __ovld __cnfn convert_long4_sat_rtp(short4);
+long4 __ovld __cnfn convert_long4_rtn(short4);
+long4 __ovld __cnfn convert_long4_sat_rtn(short4);
+long4 __ovld __cnfn convert_long4(short4);
+long4 __ovld __cnfn convert_long4_sat(short4);
+long4 __ovld __cnfn convert_long4_rte(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rte(ushort4);
+long4 __ovld __cnfn convert_long4_rtz(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rtz(ushort4);
+long4 __ovld __cnfn convert_long4_rtp(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rtp(ushort4);
+long4 __ovld __cnfn convert_long4_rtn(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rtn(ushort4);
+long4 __ovld __cnfn convert_long4(ushort4);
+long4 __ovld __cnfn convert_long4_sat(ushort4);
+long4 __ovld __cnfn convert_long4_rte(int4);
+long4 __ovld __cnfn convert_long4_sat_rte(int4);
+long4 __ovld __cnfn convert_long4_rtz(int4);
+long4 __ovld __cnfn convert_long4_sat_rtz(int4);
+long4 __ovld __cnfn convert_long4_rtp(int4);
+long4 __ovld __cnfn convert_long4_sat_rtp(int4);
+long4 __ovld __cnfn convert_long4_rtn(int4);
+long4 __ovld __cnfn convert_long4_sat_rtn(int4);
+long4 __ovld __cnfn convert_long4(int4);
+long4 __ovld __cnfn convert_long4_sat(int4);
+long4 __ovld __cnfn convert_long4_rte(uint4);
+long4 __ovld __cnfn convert_long4_sat_rte(uint4);
+long4 __ovld __cnfn convert_long4_rtz(uint4);
+long4 __ovld __cnfn convert_long4_sat_rtz(uint4);
+long4 __ovld __cnfn convert_long4_rtp(uint4);
+long4 __ovld __cnfn convert_long4_sat_rtp(uint4);
+long4 __ovld __cnfn convert_long4_rtn(uint4);
+long4 __ovld __cnfn convert_long4_sat_rtn(uint4);
+long4 __ovld __cnfn convert_long4(uint4);
+long4 __ovld __cnfn convert_long4_sat(uint4);
+long4 __ovld __cnfn convert_long4_rte(long4);
+long4 __ovld __cnfn convert_long4_sat_rte(long4);
+long4 __ovld __cnfn convert_long4_rtz(long4);
+long4 __ovld __cnfn convert_long4_sat_rtz(long4);
+long4 __ovld __cnfn convert_long4_rtp(long4);
+long4 __ovld __cnfn convert_long4_sat_rtp(long4);
+long4 __ovld __cnfn convert_long4_rtn(long4);
+long4 __ovld __cnfn convert_long4_sat_rtn(long4);
+long4 __ovld __cnfn convert_long4(long4);
+long4 __ovld __cnfn convert_long4_sat(long4);
+long4 __ovld __cnfn convert_long4_rte(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rte(ulong4);
+long4 __ovld __cnfn convert_long4_rtz(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rtz(ulong4);
+long4 __ovld __cnfn convert_long4_rtp(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rtp(ulong4);
+long4 __ovld __cnfn convert_long4_rtn(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rtn(ulong4);
+long4 __ovld __cnfn convert_long4(ulong4);
+long4 __ovld __cnfn convert_long4_sat(ulong4);
+long4 __ovld __cnfn convert_long4_rte(float4);
+long4 __ovld __cnfn convert_long4_sat_rte(float4);
+long4 __ovld __cnfn convert_long4_rtz(float4);
+long4 __ovld __cnfn convert_long4_sat_rtz(float4);
+long4 __ovld __cnfn convert_long4_rtp(float4);
+long4 __ovld __cnfn convert_long4_sat_rtp(float4);
+long4 __ovld __cnfn convert_long4_rtn(float4);
+long4 __ovld __cnfn convert_long4_sat_rtn(float4);
+long4 __ovld __cnfn convert_long4(float4);
+long4 __ovld __cnfn convert_long4_sat(float4);
+ulong4 __ovld __cnfn convert_ulong4_rte(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(char4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(char4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(char4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(char4);
+ulong4 __ovld __cnfn convert_ulong4(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat(char4);
+ulong4 __ovld __cnfn convert_ulong4_rte(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(uchar4);
+ulong4 __ovld __cnfn convert_ulong4(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rte(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(short4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(short4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(short4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(short4);
+ulong4 __ovld __cnfn convert_ulong4(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat(short4);
+ulong4 __ovld __cnfn convert_ulong4_rte(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(ushort4);
+ulong4 __ovld __cnfn convert_ulong4(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rte(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(int4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(int4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(int4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(int4);
+ulong4 __ovld __cnfn convert_ulong4(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat(int4);
+ulong4 __ovld __cnfn convert_ulong4_rte(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(uint4);
+ulong4 __ovld __cnfn convert_ulong4(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rte(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(long4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(long4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(long4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(long4);
+ulong4 __ovld __cnfn convert_ulong4(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat(long4);
+ulong4 __ovld __cnfn convert_ulong4_rte(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(ulong4);
+ulong4 __ovld __cnfn convert_ulong4(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rte(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(float4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(float4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(float4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(float4);
+ulong4 __ovld __cnfn convert_ulong4(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat(float4);
+float4 __ovld __cnfn convert_float4_rte(char4);
+float4 __ovld __cnfn convert_float4_rtz(char4);
+float4 __ovld __cnfn convert_float4_rtp(char4);
+float4 __ovld __cnfn convert_float4_rtn(char4);
+float4 __ovld __cnfn convert_float4(char4);
+float4 __ovld __cnfn convert_float4_rte(uchar4);
+float4 __ovld __cnfn convert_float4_rtz(uchar4);
+float4 __ovld __cnfn convert_float4_rtp(uchar4);
+float4 __ovld __cnfn convert_float4_rtn(uchar4);
+float4 __ovld __cnfn convert_float4(uchar4);
+float4 __ovld __cnfn convert_float4_rte(short4);
+float4 __ovld __cnfn convert_float4_rtz(short4);
+float4 __ovld __cnfn convert_float4_rtp(short4);
+float4 __ovld __cnfn convert_float4_rtn(short4);
+float4 __ovld __cnfn convert_float4(short4);
+float4 __ovld __cnfn convert_float4_rte(ushort4);
+float4 __ovld __cnfn convert_float4_rtz(ushort4);
+float4 __ovld __cnfn convert_float4_rtp(ushort4);
+float4 __ovld __cnfn convert_float4_rtn(ushort4);
+float4 __ovld __cnfn convert_float4(ushort4);
+float4 __ovld __cnfn convert_float4_rte(int4);
+float4 __ovld __cnfn convert_float4_rtz(int4);
+float4 __ovld __cnfn convert_float4_rtp(int4);
+float4 __ovld __cnfn convert_float4_rtn(int4);
+float4 __ovld __cnfn convert_float4(int4);
+float4 __ovld __cnfn convert_float4_rte(uint4);
+float4 __ovld __cnfn convert_float4_rtz(uint4);
+float4 __ovld __cnfn convert_float4_rtp(uint4);
+float4 __ovld __cnfn convert_float4_rtn(uint4);
+float4 __ovld __cnfn convert_float4(uint4);
+float4 __ovld __cnfn convert_float4_rte(long4);
+float4 __ovld __cnfn convert_float4_rtz(long4);
+float4 __ovld __cnfn convert_float4_rtp(long4);
+float4 __ovld __cnfn convert_float4_rtn(long4);
+float4 __ovld __cnfn convert_float4(long4);
+float4 __ovld __cnfn convert_float4_rte(ulong4);
+float4 __ovld __cnfn convert_float4_rtz(ulong4);
+float4 __ovld __cnfn convert_float4_rtp(ulong4);
+float4 __ovld __cnfn convert_float4_rtn(ulong4);
+float4 __ovld __cnfn convert_float4(ulong4);
+float4 __ovld __cnfn convert_float4_rte(float4);
+float4 __ovld __cnfn convert_float4_rtz(float4);
+float4 __ovld __cnfn convert_float4_rtp(float4);
+float4 __ovld __cnfn convert_float4_rtn(float4);
+float4 __ovld __cnfn convert_float4(float4);
+char8 __ovld __cnfn convert_char8_rte(char8);
+char8 __ovld __cnfn convert_char8_sat_rte(char8);
+char8 __ovld __cnfn convert_char8_rtz(char8);
+char8 __ovld __cnfn convert_char8_sat_rtz(char8);
+char8 __ovld __cnfn convert_char8_rtp(char8);
+char8 __ovld __cnfn convert_char8_sat_rtp(char8);
+char8 __ovld __cnfn convert_char8_rtn(char8);
+char8 __ovld __cnfn convert_char8_sat_rtn(char8);
+char8 __ovld __cnfn convert_char8(char8);
+char8 __ovld __cnfn convert_char8_sat(char8);
+char8 __ovld __cnfn convert_char8_rte(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rte(uchar8);
+char8 __ovld __cnfn convert_char8_rtz(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rtz(uchar8);
+char8 __ovld __cnfn convert_char8_rtp(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rtp(uchar8);
+char8 __ovld __cnfn convert_char8_rtn(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rtn(uchar8);
+char8 __ovld __cnfn convert_char8(uchar8);
+char8 __ovld __cnfn convert_char8_sat(uchar8);
+char8 __ovld __cnfn convert_char8_rte(short8);
+char8 __ovld __cnfn convert_char8_sat_rte(short8);
+char8 __ovld __cnfn convert_char8_rtz(short8);
+char8 __ovld __cnfn convert_char8_sat_rtz(short8);
+char8 __ovld __cnfn convert_char8_rtp(short8);
+char8 __ovld __cnfn convert_char8_sat_rtp(short8);
+char8 __ovld __cnfn convert_char8_rtn(short8);
+char8 __ovld __cnfn convert_char8_sat_rtn(short8);
+char8 __ovld __cnfn convert_char8(short8);
+char8 __ovld __cnfn convert_char8_sat(short8);
+char8 __ovld __cnfn convert_char8_rte(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rte(ushort8);
+char8 __ovld __cnfn convert_char8_rtz(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rtz(ushort8);
+char8 __ovld __cnfn convert_char8_rtp(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rtp(ushort8);
+char8 __ovld __cnfn convert_char8_rtn(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rtn(ushort8);
+char8 __ovld __cnfn convert_char8(ushort8);
+char8 __ovld __cnfn convert_char8_sat(ushort8);
+char8 __ovld __cnfn convert_char8_rte(int8);
+char8 __ovld __cnfn convert_char8_sat_rte(int8);
+char8 __ovld __cnfn convert_char8_rtz(int8);
+char8 __ovld __cnfn convert_char8_sat_rtz(int8);
+char8 __ovld __cnfn convert_char8_rtp(int8);
+char8 __ovld __cnfn convert_char8_sat_rtp(int8);
+char8 __ovld __cnfn convert_char8_rtn(int8);
+char8 __ovld __cnfn convert_char8_sat_rtn(int8);
+char8 __ovld __cnfn convert_char8(int8);
+char8 __ovld __cnfn convert_char8_sat(int8);
+char8 __ovld __cnfn convert_char8_rte(uint8);
+char8 __ovld __cnfn convert_char8_sat_rte(uint8);
+char8 __ovld __cnfn convert_char8_rtz(uint8);
+char8 __ovld __cnfn convert_char8_sat_rtz(uint8);
+char8 __ovld __cnfn convert_char8_rtp(uint8);
+char8 __ovld __cnfn convert_char8_sat_rtp(uint8);
+char8 __ovld __cnfn convert_char8_rtn(uint8);
+char8 __ovld __cnfn convert_char8_sat_rtn(uint8);
+char8 __ovld __cnfn convert_char8(uint8);
+char8 __ovld __cnfn convert_char8_sat(uint8);
+char8 __ovld __cnfn convert_char8_rte(long8);
+char8 __ovld __cnfn convert_char8_sat_rte(long8);
+char8 __ovld __cnfn convert_char8_rtz(long8);
+char8 __ovld __cnfn convert_char8_sat_rtz(long8);
+char8 __ovld __cnfn convert_char8_rtp(long8);
+char8 __ovld __cnfn convert_char8_sat_rtp(long8);
+char8 __ovld __cnfn convert_char8_rtn(long8);
+char8 __ovld __cnfn convert_char8_sat_rtn(long8);
+char8 __ovld __cnfn convert_char8(long8);
+char8 __ovld __cnfn convert_char8_sat(long8);
+char8 __ovld __cnfn convert_char8_rte(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rte(ulong8);
+char8 __ovld __cnfn convert_char8_rtz(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rtz(ulong8);
+char8 __ovld __cnfn convert_char8_rtp(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rtp(ulong8);
+char8 __ovld __cnfn convert_char8_rtn(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rtn(ulong8);
+char8 __ovld __cnfn convert_char8(ulong8);
+char8 __ovld __cnfn convert_char8_sat(ulong8);
+char8 __ovld __cnfn convert_char8_rte(float8);
+char8 __ovld __cnfn convert_char8_sat_rte(float8);
+char8 __ovld __cnfn convert_char8_rtz(float8);
+char8 __ovld __cnfn convert_char8_sat_rtz(float8);
+char8 __ovld __cnfn convert_char8_rtp(float8);
+char8 __ovld __cnfn convert_char8_sat_rtp(float8);
+char8 __ovld __cnfn convert_char8_rtn(float8);
+char8 __ovld __cnfn convert_char8_sat_rtn(float8);
+char8 __ovld __cnfn convert_char8(float8);
+char8 __ovld __cnfn convert_char8_sat(float8);
+uchar8 __ovld __cnfn convert_uchar8_rte(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(char8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(char8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(char8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(char8);
+uchar8 __ovld __cnfn convert_uchar8(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat(char8);
+uchar8 __ovld __cnfn convert_uchar8_rte(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(uchar8);
+uchar8 __ovld __cnfn convert_uchar8(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rte(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(short8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(short8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(short8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(short8);
+uchar8 __ovld __cnfn convert_uchar8(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat(short8);
+uchar8 __ovld __cnfn convert_uchar8_rte(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(ushort8);
+uchar8 __ovld __cnfn convert_uchar8(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rte(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(int8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(int8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(int8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(int8);
+uchar8 __ovld __cnfn convert_uchar8(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat(int8);
+uchar8 __ovld __cnfn convert_uchar8_rte(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(uint8);
+uchar8 __ovld __cnfn convert_uchar8(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rte(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(long8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(long8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(long8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(long8);
+uchar8 __ovld __cnfn convert_uchar8(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat(long8);
+uchar8 __ovld __cnfn convert_uchar8_rte(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(ulong8);
+uchar8 __ovld __cnfn convert_uchar8(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rte(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(float8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(float8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(float8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(float8);
+uchar8 __ovld __cnfn convert_uchar8(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat(float8);
+short8 __ovld __cnfn convert_short8_rte(char8);
+short8 __ovld __cnfn convert_short8_sat_rte(char8);
+short8 __ovld __cnfn convert_short8_rtz(char8);
+short8 __ovld __cnfn convert_short8_sat_rtz(char8);
+short8 __ovld __cnfn convert_short8_rtp(char8);
+short8 __ovld __cnfn convert_short8_sat_rtp(char8);
+short8 __ovld __cnfn convert_short8_rtn(char8);
+short8 __ovld __cnfn convert_short8_sat_rtn(char8);
+short8 __ovld __cnfn convert_short8(char8);
+short8 __ovld __cnfn convert_short8_sat(char8);
+short8 __ovld __cnfn convert_short8_rte(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rte(uchar8);
+short8 __ovld __cnfn convert_short8_rtz(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rtz(uchar8);
+short8 __ovld __cnfn convert_short8_rtp(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rtp(uchar8);
+short8 __ovld __cnfn convert_short8_rtn(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rtn(uchar8);
+short8 __ovld __cnfn convert_short8(uchar8);
+short8 __ovld __cnfn convert_short8_sat(uchar8);
+short8 __ovld __cnfn convert_short8_rte(short8);
+short8 __ovld __cnfn convert_short8_sat_rte(short8);
+short8 __ovld __cnfn convert_short8_rtz(short8);
+short8 __ovld __cnfn convert_short8_sat_rtz(short8);
+short8 __ovld __cnfn convert_short8_rtp(short8);
+short8 __ovld __cnfn convert_short8_sat_rtp(short8);
+short8 __ovld __cnfn convert_short8_rtn(short8);
+short8 __ovld __cnfn convert_short8_sat_rtn(short8);
+short8 __ovld __cnfn convert_short8(short8);
+short8 __ovld __cnfn convert_short8_sat(short8);
+short8 __ovld __cnfn convert_short8_rte(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rte(ushort8);
+short8 __ovld __cnfn convert_short8_rtz(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rtz(ushort8);
+short8 __ovld __cnfn convert_short8_rtp(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rtp(ushort8);
+short8 __ovld __cnfn convert_short8_rtn(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rtn(ushort8);
+short8 __ovld __cnfn convert_short8(ushort8);
+short8 __ovld __cnfn convert_short8_sat(ushort8);
+short8 __ovld __cnfn convert_short8_rte(int8);
+short8 __ovld __cnfn convert_short8_sat_rte(int8);
+short8 __ovld __cnfn convert_short8_rtz(int8);
+short8 __ovld __cnfn convert_short8_sat_rtz(int8);
+short8 __ovld __cnfn convert_short8_rtp(int8);
+short8 __ovld __cnfn convert_short8_sat_rtp(int8);
+short8 __ovld __cnfn convert_short8_rtn(int8);
+short8 __ovld __cnfn convert_short8_sat_rtn(int8);
+short8 __ovld __cnfn convert_short8(int8);
+short8 __ovld __cnfn convert_short8_sat(int8);
+short8 __ovld __cnfn convert_short8_rte(uint8);
+short8 __ovld __cnfn convert_short8_sat_rte(uint8);
+short8 __ovld __cnfn convert_short8_rtz(uint8);
+short8 __ovld __cnfn convert_short8_sat_rtz(uint8);
+short8 __ovld __cnfn convert_short8_rtp(uint8);
+short8 __ovld __cnfn convert_short8_sat_rtp(uint8);
+short8 __ovld __cnfn convert_short8_rtn(uint8);
+short8 __ovld __cnfn convert_short8_sat_rtn(uint8);
+short8 __ovld __cnfn convert_short8(uint8);
+short8 __ovld __cnfn convert_short8_sat(uint8);
+short8 __ovld __cnfn convert_short8_rte(long8);
+short8 __ovld __cnfn convert_short8_sat_rte(long8);
+short8 __ovld __cnfn convert_short8_rtz(long8);
+short8 __ovld __cnfn convert_short8_sat_rtz(long8);
+short8 __ovld __cnfn convert_short8_rtp(long8);
+short8 __ovld __cnfn convert_short8_sat_rtp(long8);
+short8 __ovld __cnfn convert_short8_rtn(long8);
+short8 __ovld __cnfn convert_short8_sat_rtn(long8);
+short8 __ovld __cnfn convert_short8(long8);
+short8 __ovld __cnfn convert_short8_sat(long8);
+short8 __ovld __cnfn convert_short8_rte(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rte(ulong8);
+short8 __ovld __cnfn convert_short8_rtz(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rtz(ulong8);
+short8 __ovld __cnfn convert_short8_rtp(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rtp(ulong8);
+short8 __ovld __cnfn convert_short8_rtn(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rtn(ulong8);
+short8 __ovld __cnfn convert_short8(ulong8);
+short8 __ovld __cnfn convert_short8_sat(ulong8);
+short8 __ovld __cnfn convert_short8_rte(float8);
+short8 __ovld __cnfn convert_short8_sat_rte(float8);
+short8 __ovld __cnfn convert_short8_rtz(float8);
+short8 __ovld __cnfn convert_short8_sat_rtz(float8);
+short8 __ovld __cnfn convert_short8_rtp(float8);
+short8 __ovld __cnfn convert_short8_sat_rtp(float8);
+short8 __ovld __cnfn convert_short8_rtn(float8);
+short8 __ovld __cnfn convert_short8_sat_rtn(float8);
+short8 __ovld __cnfn convert_short8(float8);
+short8 __ovld __cnfn convert_short8_sat(float8);
+ushort8 __ovld __cnfn convert_ushort8_rte(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(char8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(char8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(char8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(char8);
+ushort8 __ovld __cnfn convert_ushort8(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat(char8);
+ushort8 __ovld __cnfn convert_ushort8_rte(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(uchar8);
+ushort8 __ovld __cnfn convert_ushort8(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rte(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(short8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(short8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(short8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(short8);
+ushort8 __ovld __cnfn convert_ushort8(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat(short8);
+ushort8 __ovld __cnfn convert_ushort8_rte(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(ushort8);
+ushort8 __ovld __cnfn convert_ushort8(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rte(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(int8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(int8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(int8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(int8);
+ushort8 __ovld __cnfn convert_ushort8(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat(int8);
+ushort8 __ovld __cnfn convert_ushort8_rte(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(uint8);
+ushort8 __ovld __cnfn convert_ushort8(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rte(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(long8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(long8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(long8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(long8);
+ushort8 __ovld __cnfn convert_ushort8(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat(long8);
+ushort8 __ovld __cnfn convert_ushort8_rte(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(ulong8);
+ushort8 __ovld __cnfn convert_ushort8(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rte(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(float8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(float8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(float8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(float8);
+ushort8 __ovld __cnfn convert_ushort8(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat(float8);
+int8 __ovld __cnfn convert_int8_rte(char8);
+int8 __ovld __cnfn convert_int8_sat_rte(char8);
+int8 __ovld __cnfn convert_int8_rtz(char8);
+int8 __ovld __cnfn convert_int8_sat_rtz(char8);
+int8 __ovld __cnfn convert_int8_rtp(char8);
+int8 __ovld __cnfn convert_int8_sat_rtp(char8);
+int8 __ovld __cnfn convert_int8_rtn(char8);
+int8 __ovld __cnfn convert_int8_sat_rtn(char8);
+int8 __ovld __cnfn convert_int8(char8);
+int8 __ovld __cnfn convert_int8_sat(char8);
+int8 __ovld __cnfn convert_int8_rte(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rte(uchar8);
+int8 __ovld __cnfn convert_int8_rtz(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rtz(uchar8);
+int8 __ovld __cnfn convert_int8_rtp(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rtp(uchar8);
+int8 __ovld __cnfn convert_int8_rtn(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rtn(uchar8);
+int8 __ovld __cnfn convert_int8(uchar8);
+int8 __ovld __cnfn convert_int8_sat(uchar8);
+int8 __ovld __cnfn convert_int8_rte(short8);
+int8 __ovld __cnfn convert_int8_sat_rte(short8);
+int8 __ovld __cnfn convert_int8_rtz(short8);
+int8 __ovld __cnfn convert_int8_sat_rtz(short8);
+int8 __ovld __cnfn convert_int8_rtp(short8);
+int8 __ovld __cnfn convert_int8_sat_rtp(short8);
+int8 __ovld __cnfn convert_int8_rtn(short8);
+int8 __ovld __cnfn convert_int8_sat_rtn(short8);
+int8 __ovld __cnfn convert_int8(short8);
+int8 __ovld __cnfn convert_int8_sat(short8);
+int8 __ovld __cnfn convert_int8_rte(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rte(ushort8);
+int8 __ovld __cnfn convert_int8_rtz(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rtz(ushort8);
+int8 __ovld __cnfn convert_int8_rtp(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rtp(ushort8);
+int8 __ovld __cnfn convert_int8_rtn(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rtn(ushort8);
+int8 __ovld __cnfn convert_int8(ushort8);
+int8 __ovld __cnfn convert_int8_sat(ushort8);
+int8 __ovld __cnfn convert_int8_rte(int8);
+int8 __ovld __cnfn convert_int8_sat_rte(int8);
+int8 __ovld __cnfn convert_int8_rtz(int8);
+int8 __ovld __cnfn convert_int8_sat_rtz(int8);
+int8 __ovld __cnfn convert_int8_rtp(int8);
+int8 __ovld __cnfn convert_int8_sat_rtp(int8);
+int8 __ovld __cnfn convert_int8_rtn(int8);
+int8 __ovld __cnfn convert_int8_sat_rtn(int8);
+int8 __ovld __cnfn convert_int8(int8);
+int8 __ovld __cnfn convert_int8_sat(int8);
+int8 __ovld __cnfn convert_int8_rte(uint8);
+int8 __ovld __cnfn convert_int8_sat_rte(uint8);
+int8 __ovld __cnfn convert_int8_rtz(uint8);
+int8 __ovld __cnfn convert_int8_sat_rtz(uint8);
+int8 __ovld __cnfn convert_int8_rtp(uint8);
+int8 __ovld __cnfn convert_int8_sat_rtp(uint8);
+int8 __ovld __cnfn convert_int8_rtn(uint8);
+int8 __ovld __cnfn convert_int8_sat_rtn(uint8);
+int8 __ovld __cnfn convert_int8(uint8);
+int8 __ovld __cnfn convert_int8_sat(uint8);
+int8 __ovld __cnfn convert_int8_rte(long8);
+int8 __ovld __cnfn convert_int8_sat_rte(long8);
+int8 __ovld __cnfn convert_int8_rtz(long8);
+int8 __ovld __cnfn convert_int8_sat_rtz(long8);
+int8 __ovld __cnfn convert_int8_rtp(long8);
+int8 __ovld __cnfn convert_int8_sat_rtp(long8);
+int8 __ovld __cnfn convert_int8_rtn(long8);
+int8 __ovld __cnfn convert_int8_sat_rtn(long8);
+int8 __ovld __cnfn convert_int8(long8);
+int8 __ovld __cnfn convert_int8_sat(long8);
+int8 __ovld __cnfn convert_int8_rte(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rte(ulong8);
+int8 __ovld __cnfn convert_int8_rtz(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rtz(ulong8);
+int8 __ovld __cnfn convert_int8_rtp(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rtp(ulong8);
+int8 __ovld __cnfn convert_int8_rtn(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rtn(ulong8);
+int8 __ovld __cnfn convert_int8(ulong8);
+int8 __ovld __cnfn convert_int8_sat(ulong8);
+int8 __ovld __cnfn convert_int8_rte(float8);
+int8 __ovld __cnfn convert_int8_sat_rte(float8);
+int8 __ovld __cnfn convert_int8_rtz(float8);
+int8 __ovld __cnfn convert_int8_sat_rtz(float8);
+int8 __ovld __cnfn convert_int8_rtp(float8);
+int8 __ovld __cnfn convert_int8_sat_rtp(float8);
+int8 __ovld __cnfn convert_int8_rtn(float8);
+int8 __ovld __cnfn convert_int8_sat_rtn(float8);
+int8 __ovld __cnfn convert_int8(float8);
+int8 __ovld __cnfn convert_int8_sat(float8);
+uint8 __ovld __cnfn convert_uint8_rte(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(char8);
+uint8 __ovld __cnfn convert_uint8_rtz(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(char8);
+uint8 __ovld __cnfn convert_uint8_rtp(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(char8);
+uint8 __ovld __cnfn convert_uint8_rtn(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(char8);
+uint8 __ovld __cnfn convert_uint8(char8);
+uint8 __ovld __cnfn convert_uint8_sat(char8);
+uint8 __ovld __cnfn convert_uint8_rte(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(uchar8);
+uint8 __ovld __cnfn convert_uint8_rtz(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(uchar8);
+uint8 __ovld __cnfn convert_uint8_rtp(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(uchar8);
+uint8 __ovld __cnfn convert_uint8_rtn(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(uchar8);
+uint8 __ovld __cnfn convert_uint8(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat(uchar8);
+uint8 __ovld __cnfn convert_uint8_rte(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(short8);
+uint8 __ovld __cnfn convert_uint8_rtz(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(short8);
+uint8 __ovld __cnfn convert_uint8_rtp(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(short8);
+uint8 __ovld __cnfn convert_uint8_rtn(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(short8);
+uint8 __ovld __cnfn convert_uint8(short8);
+uint8 __ovld __cnfn convert_uint8_sat(short8);
+uint8 __ovld __cnfn convert_uint8_rte(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(ushort8);
+uint8 __ovld __cnfn convert_uint8_rtz(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(ushort8);
+uint8 __ovld __cnfn convert_uint8_rtp(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(ushort8);
+uint8 __ovld __cnfn convert_uint8_rtn(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(ushort8);
+uint8 __ovld __cnfn convert_uint8(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat(ushort8);
+uint8 __ovld __cnfn convert_uint8_rte(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(int8);
+uint8 __ovld __cnfn convert_uint8_rtz(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(int8);
+uint8 __ovld __cnfn convert_uint8_rtp(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(int8);
+uint8 __ovld __cnfn convert_uint8_rtn(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(int8);
+uint8 __ovld __cnfn convert_uint8(int8);
+uint8 __ovld __cnfn convert_uint8_sat(int8);
+uint8 __ovld __cnfn convert_uint8_rte(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(uint8);
+uint8 __ovld __cnfn convert_uint8_rtz(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(uint8);
+uint8 __ovld __cnfn convert_uint8_rtp(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(uint8);
+uint8 __ovld __cnfn convert_uint8_rtn(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(uint8);
+uint8 __ovld __cnfn convert_uint8(uint8);
+uint8 __ovld __cnfn convert_uint8_sat(uint8);
+uint8 __ovld __cnfn convert_uint8_rte(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(long8);
+uint8 __ovld __cnfn convert_uint8_rtz(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(long8);
+uint8 __ovld __cnfn convert_uint8_rtp(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(long8);
+uint8 __ovld __cnfn convert_uint8_rtn(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(long8);
+uint8 __ovld __cnfn convert_uint8(long8);
+uint8 __ovld __cnfn convert_uint8_sat(long8);
+uint8 __ovld __cnfn convert_uint8_rte(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(ulong8);
+uint8 __ovld __cnfn convert_uint8_rtz(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(ulong8);
+uint8 __ovld __cnfn convert_uint8_rtp(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(ulong8);
+uint8 __ovld __cnfn convert_uint8_rtn(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(ulong8);
+uint8 __ovld __cnfn convert_uint8(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat(ulong8);
+uint8 __ovld __cnfn convert_uint8_rte(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(float8);
+uint8 __ovld __cnfn convert_uint8_rtz(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(float8);
+uint8 __ovld __cnfn convert_uint8_rtp(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(float8);
+uint8 __ovld __cnfn convert_uint8_rtn(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(float8);
+uint8 __ovld __cnfn convert_uint8(float8);
+uint8 __ovld __cnfn convert_uint8_sat(float8);
+long8 __ovld __cnfn convert_long8_rte(char8);
+long8 __ovld __cnfn convert_long8_sat_rte(char8);
+long8 __ovld __cnfn convert_long8_rtz(char8);
+long8 __ovld __cnfn convert_long8_sat_rtz(char8);
+long8 __ovld __cnfn convert_long8_rtp(char8);
+long8 __ovld __cnfn convert_long8_sat_rtp(char8);
+long8 __ovld __cnfn convert_long8_rtn(char8);
+long8 __ovld __cnfn convert_long8_sat_rtn(char8);
+long8 __ovld __cnfn convert_long8(char8);
+long8 __ovld __cnfn convert_long8_sat(char8);
+long8 __ovld __cnfn convert_long8_rte(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rte(uchar8);
+long8 __ovld __cnfn convert_long8_rtz(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rtz(uchar8);
+long8 __ovld __cnfn convert_long8_rtp(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rtp(uchar8);
+long8 __ovld __cnfn convert_long8_rtn(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rtn(uchar8);
+long8 __ovld __cnfn convert_long8(uchar8);
+long8 __ovld __cnfn convert_long8_sat(uchar8);
+long8 __ovld __cnfn convert_long8_rte(short8);
+long8 __ovld __cnfn convert_long8_sat_rte(short8);
+long8 __ovld __cnfn convert_long8_rtz(short8);
+long8 __ovld __cnfn convert_long8_sat_rtz(short8);
+long8 __ovld __cnfn convert_long8_rtp(short8);
+long8 __ovld __cnfn convert_long8_sat_rtp(short8);
+long8 __ovld __cnfn convert_long8_rtn(short8);
+long8 __ovld __cnfn convert_long8_sat_rtn(short8);
+long8 __ovld __cnfn convert_long8(short8);
+long8 __ovld __cnfn convert_long8_sat(short8);
+long8 __ovld __cnfn convert_long8_rte(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rte(ushort8);
+long8 __ovld __cnfn convert_long8_rtz(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rtz(ushort8);
+long8 __ovld __cnfn convert_long8_rtp(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rtp(ushort8);
+long8 __ovld __cnfn convert_long8_rtn(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rtn(ushort8);
+long8 __ovld __cnfn convert_long8(ushort8);
+long8 __ovld __cnfn convert_long8_sat(ushort8);
+long8 __ovld __cnfn convert_long8_rte(int8);
+long8 __ovld __cnfn convert_long8_sat_rte(int8);
+long8 __ovld __cnfn convert_long8_rtz(int8);
+long8 __ovld __cnfn convert_long8_sat_rtz(int8);
+long8 __ovld __cnfn convert_long8_rtp(int8);
+long8 __ovld __cnfn convert_long8_sat_rtp(int8);
+long8 __ovld __cnfn convert_long8_rtn(int8);
+long8 __ovld __cnfn convert_long8_sat_rtn(int8);
+long8 __ovld __cnfn convert_long8(int8);
+long8 __ovld __cnfn convert_long8_sat(int8);
+long8 __ovld __cnfn convert_long8_rte(uint8);
+long8 __ovld __cnfn convert_long8_sat_rte(uint8);
+long8 __ovld __cnfn convert_long8_rtz(uint8);
+long8 __ovld __cnfn convert_long8_sat_rtz(uint8);
+long8 __ovld __cnfn convert_long8_rtp(uint8);
+long8 __ovld __cnfn convert_long8_sat_rtp(uint8);
+long8 __ovld __cnfn convert_long8_rtn(uint8);
+long8 __ovld __cnfn convert_long8_sat_rtn(uint8);
+long8 __ovld __cnfn convert_long8(uint8);
+long8 __ovld __cnfn convert_long8_sat(uint8);
+long8 __ovld __cnfn convert_long8_rte(long8);
+long8 __ovld __cnfn convert_long8_sat_rte(long8);
+long8 __ovld __cnfn convert_long8_rtz(long8);
+long8 __ovld __cnfn convert_long8_sat_rtz(long8);
+long8 __ovld __cnfn convert_long8_rtp(long8);
+long8 __ovld __cnfn convert_long8_sat_rtp(long8);
+long8 __ovld __cnfn convert_long8_rtn(long8);
+long8 __ovld __cnfn convert_long8_sat_rtn(long8);
+long8 __ovld __cnfn convert_long8(long8);
+long8 __ovld __cnfn convert_long8_sat(long8);
+long8 __ovld __cnfn convert_long8_rte(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rte(ulong8);
+long8 __ovld __cnfn convert_long8_rtz(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rtz(ulong8);
+long8 __ovld __cnfn convert_long8_rtp(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rtp(ulong8);
+long8 __ovld __cnfn convert_long8_rtn(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rtn(ulong8);
+long8 __ovld __cnfn convert_long8(ulong8);
+long8 __ovld __cnfn convert_long8_sat(ulong8);
+long8 __ovld __cnfn convert_long8_rte(float8);
+long8 __ovld __cnfn convert_long8_sat_rte(float8);
+long8 __ovld __cnfn convert_long8_rtz(float8);
+long8 __ovld __cnfn convert_long8_sat_rtz(float8);
+long8 __ovld __cnfn convert_long8_rtp(float8);
+long8 __ovld __cnfn convert_long8_sat_rtp(float8);
+long8 __ovld __cnfn convert_long8_rtn(float8);
+long8 __ovld __cnfn convert_long8_sat_rtn(float8);
+long8 __ovld __cnfn convert_long8(float8);
+long8 __ovld __cnfn convert_long8_sat(float8);
+ulong8 __ovld __cnfn convert_ulong8_rte(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(char8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(char8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(char8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(char8);
+ulong8 __ovld __cnfn convert_ulong8(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat(char8);
+ulong8 __ovld __cnfn convert_ulong8_rte(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(uchar8);
+ulong8 __ovld __cnfn convert_ulong8(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rte(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(short8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(short8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(short8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(short8);
+ulong8 __ovld __cnfn convert_ulong8(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat(short8);
+ulong8 __ovld __cnfn convert_ulong8_rte(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(ushort8);
+ulong8 __ovld __cnfn convert_ulong8(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rte(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(int8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(int8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(int8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(int8);
+ulong8 __ovld __cnfn convert_ulong8(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat(int8);
+ulong8 __ovld __cnfn convert_ulong8_rte(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(uint8);
+ulong8 __ovld __cnfn convert_ulong8(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rte(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(long8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(long8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(long8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(long8);
+ulong8 __ovld __cnfn convert_ulong8(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat(long8);
+ulong8 __ovld __cnfn convert_ulong8_rte(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(ulong8);
+ulong8 __ovld __cnfn convert_ulong8(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rte(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(float8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(float8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(float8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(float8);
+ulong8 __ovld __cnfn convert_ulong8(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat(float8);
+float8 __ovld __cnfn convert_float8_rte(char8);
+float8 __ovld __cnfn convert_float8_rtz(char8);
+float8 __ovld __cnfn convert_float8_rtp(char8);
+float8 __ovld __cnfn convert_float8_rtn(char8);
+float8 __ovld __cnfn convert_float8(char8);
+float8 __ovld __cnfn convert_float8_rte(uchar8);
+float8 __ovld __cnfn convert_float8_rtz(uchar8);
+float8 __ovld __cnfn convert_float8_rtp(uchar8);
+float8 __ovld __cnfn convert_float8_rtn(uchar8);
+float8 __ovld __cnfn convert_float8(uchar8);
+float8 __ovld __cnfn convert_float8_rte(short8);
+float8 __ovld __cnfn convert_float8_rtz(short8);
+float8 __ovld __cnfn convert_float8_rtp(short8);
+float8 __ovld __cnfn convert_float8_rtn(short8);
+float8 __ovld __cnfn convert_float8(short8);
+float8 __ovld __cnfn convert_float8_rte(ushort8);
+float8 __ovld __cnfn convert_float8_rtz(ushort8);
+float8 __ovld __cnfn convert_float8_rtp(ushort8);
+float8 __ovld __cnfn convert_float8_rtn(ushort8);
+float8 __ovld __cnfn convert_float8(ushort8);
+float8 __ovld __cnfn convert_float8_rte(int8);
+float8 __ovld __cnfn convert_float8_rtz(int8);
+float8 __ovld __cnfn convert_float8_rtp(int8);
+float8 __ovld __cnfn convert_float8_rtn(int8);
+float8 __ovld __cnfn convert_float8(int8);
+float8 __ovld __cnfn convert_float8_rte(uint8);
+float8 __ovld __cnfn convert_float8_rtz(uint8);
+float8 __ovld __cnfn convert_float8_rtp(uint8);
+float8 __ovld __cnfn convert_float8_rtn(uint8);
+float8 __ovld __cnfn convert_float8(uint8);
+float8 __ovld __cnfn convert_float8_rte(long8);
+float8 __ovld __cnfn convert_float8_rtz(long8);
+float8 __ovld __cnfn convert_float8_rtp(long8);
+float8 __ovld __cnfn convert_float8_rtn(long8);
+float8 __ovld __cnfn convert_float8(long8);
+float8 __ovld __cnfn convert_float8_rte(ulong8);
+float8 __ovld __cnfn convert_float8_rtz(ulong8);
+float8 __ovld __cnfn convert_float8_rtp(ulong8);
+float8 __ovld __cnfn convert_float8_rtn(ulong8);
+float8 __ovld __cnfn convert_float8(ulong8);
+float8 __ovld __cnfn convert_float8_rte(float8);
+float8 __ovld __cnfn convert_float8_rtz(float8);
+float8 __ovld __cnfn convert_float8_rtp(float8);
+float8 __ovld __cnfn convert_float8_rtn(float8);
+float8 __ovld __cnfn convert_float8(float8);
+char16 __ovld __cnfn convert_char16_rte(char16);
+char16 __ovld __cnfn convert_char16_sat_rte(char16);
+char16 __ovld __cnfn convert_char16_rtz(char16);
+char16 __ovld __cnfn convert_char16_sat_rtz(char16);
+char16 __ovld __cnfn convert_char16_rtp(char16);
+char16 __ovld __cnfn convert_char16_sat_rtp(char16);
+char16 __ovld __cnfn convert_char16_rtn(char16);
+char16 __ovld __cnfn convert_char16_sat_rtn(char16);
+char16 __ovld __cnfn convert_char16(char16);
+char16 __ovld __cnfn convert_char16_sat(char16);
+char16 __ovld __cnfn convert_char16_rte(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rte(uchar16);
+char16 __ovld __cnfn convert_char16_rtz(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rtz(uchar16);
+char16 __ovld __cnfn convert_char16_rtp(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rtp(uchar16);
+char16 __ovld __cnfn convert_char16_rtn(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rtn(uchar16);
+char16 __ovld __cnfn convert_char16(uchar16);
+char16 __ovld __cnfn convert_char16_sat(uchar16);
+char16 __ovld __cnfn convert_char16_rte(short16);
+char16 __ovld __cnfn convert_char16_sat_rte(short16);
+char16 __ovld __cnfn convert_char16_rtz(short16);
+char16 __ovld __cnfn convert_char16_sat_rtz(short16);
+char16 __ovld __cnfn convert_char16_rtp(short16);
+char16 __ovld __cnfn convert_char16_sat_rtp(short16);
+char16 __ovld __cnfn convert_char16_rtn(short16);
+char16 __ovld __cnfn convert_char16_sat_rtn(short16);
+char16 __ovld __cnfn convert_char16(short16);
+char16 __ovld __cnfn convert_char16_sat(short16);
+char16 __ovld __cnfn convert_char16_rte(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rte(ushort16);
+char16 __ovld __cnfn convert_char16_rtz(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rtz(ushort16);
+char16 __ovld __cnfn convert_char16_rtp(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rtp(ushort16);
+char16 __ovld __cnfn convert_char16_rtn(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rtn(ushort16);
+char16 __ovld __cnfn convert_char16(ushort16);
+char16 __ovld __cnfn convert_char16_sat(ushort16);
+char16 __ovld __cnfn convert_char16_rte(int16);
+char16 __ovld __cnfn convert_char16_sat_rte(int16);
+char16 __ovld __cnfn convert_char16_rtz(int16);
+char16 __ovld __cnfn convert_char16_sat_rtz(int16);
+char16 __ovld __cnfn convert_char16_rtp(int16);
+char16 __ovld __cnfn convert_char16_sat_rtp(int16);
+char16 __ovld __cnfn convert_char16_rtn(int16);
+char16 __ovld __cnfn convert_char16_sat_rtn(int16);
+char16 __ovld __cnfn convert_char16(int16);
+char16 __ovld __cnfn convert_char16_sat(int16);
+char16 __ovld __cnfn convert_char16_rte(uint16);
+char16 __ovld __cnfn convert_char16_sat_rte(uint16);
+char16 __ovld __cnfn convert_char16_rtz(uint16);
+char16 __ovld __cnfn convert_char16_sat_rtz(uint16);
+char16 __ovld __cnfn convert_char16_rtp(uint16);
+char16 __ovld __cnfn convert_char16_sat_rtp(uint16);
+char16 __ovld __cnfn convert_char16_rtn(uint16);
+char16 __ovld __cnfn convert_char16_sat_rtn(uint16);
+char16 __ovld __cnfn convert_char16(uint16);
+char16 __ovld __cnfn convert_char16_sat(uint16);
+char16 __ovld __cnfn convert_char16_rte(long16);
+char16 __ovld __cnfn convert_char16_sat_rte(long16);
+char16 __ovld __cnfn convert_char16_rtz(long16);
+char16 __ovld __cnfn convert_char16_sat_rtz(long16);
+char16 __ovld __cnfn convert_char16_rtp(long16);
+char16 __ovld __cnfn convert_char16_sat_rtp(long16);
+char16 __ovld __cnfn convert_char16_rtn(long16);
+char16 __ovld __cnfn convert_char16_sat_rtn(long16);
+char16 __ovld __cnfn convert_char16(long16);
+char16 __ovld __cnfn convert_char16_sat(long16);
+char16 __ovld __cnfn convert_char16_rte(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rte(ulong16);
+char16 __ovld __cnfn convert_char16_rtz(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rtz(ulong16);
+char16 __ovld __cnfn convert_char16_rtp(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rtp(ulong16);
+char16 __ovld __cnfn convert_char16_rtn(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rtn(ulong16);
+char16 __ovld __cnfn convert_char16(ulong16);
+char16 __ovld __cnfn convert_char16_sat(ulong16);
+char16 __ovld __cnfn convert_char16_rte(float16);
+char16 __ovld __cnfn convert_char16_sat_rte(float16);
+char16 __ovld __cnfn convert_char16_rtz(float16);
+char16 __ovld __cnfn convert_char16_sat_rtz(float16);
+char16 __ovld __cnfn convert_char16_rtp(float16);
+char16 __ovld __cnfn convert_char16_sat_rtp(float16);
+char16 __ovld __cnfn convert_char16_rtn(float16);
+char16 __ovld __cnfn convert_char16_sat_rtn(float16);
+char16 __ovld __cnfn convert_char16(float16);
+char16 __ovld __cnfn convert_char16_sat(float16);
+uchar16 __ovld __cnfn convert_uchar16_rte(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(char16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(char16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(char16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(char16);
+uchar16 __ovld __cnfn convert_uchar16(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat(char16);
+uchar16 __ovld __cnfn convert_uchar16_rte(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(uchar16);
+uchar16 __ovld __cnfn convert_uchar16(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rte(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(short16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(short16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(short16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(short16);
+uchar16 __ovld __cnfn convert_uchar16(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat(short16);
+uchar16 __ovld __cnfn convert_uchar16_rte(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(ushort16);
+uchar16 __ovld __cnfn convert_uchar16(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rte(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(int16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(int16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(int16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(int16);
+uchar16 __ovld __cnfn convert_uchar16(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat(int16);
+uchar16 __ovld __cnfn convert_uchar16_rte(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(uint16);
+uchar16 __ovld __cnfn convert_uchar16(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rte(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(long16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(long16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(long16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(long16);
+uchar16 __ovld __cnfn convert_uchar16(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat(long16);
+uchar16 __ovld __cnfn convert_uchar16_rte(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(ulong16);
+uchar16 __ovld __cnfn convert_uchar16(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rte(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(float16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(float16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(float16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(float16);
+uchar16 __ovld __cnfn convert_uchar16(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat(float16);
+short16 __ovld __cnfn convert_short16_rte(char16);
+short16 __ovld __cnfn convert_short16_sat_rte(char16);
+short16 __ovld __cnfn convert_short16_rtz(char16);
+short16 __ovld __cnfn convert_short16_sat_rtz(char16);
+short16 __ovld __cnfn convert_short16_rtp(char16);
+short16 __ovld __cnfn convert_short16_sat_rtp(char16);
+short16 __ovld __cnfn convert_short16_rtn(char16);
+short16 __ovld __cnfn convert_short16_sat_rtn(char16);
+short16 __ovld __cnfn convert_short16(char16);
+short16 __ovld __cnfn convert_short16_sat(char16);
+short16 __ovld __cnfn convert_short16_rte(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rte(uchar16);
+short16 __ovld __cnfn convert_short16_rtz(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rtz(uchar16);
+short16 __ovld __cnfn convert_short16_rtp(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rtp(uchar16);
+short16 __ovld __cnfn convert_short16_rtn(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rtn(uchar16);
+short16 __ovld __cnfn convert_short16(uchar16);
+short16 __ovld __cnfn convert_short16_sat(uchar16);
+short16 __ovld __cnfn convert_short16_rte(short16);
+short16 __ovld __cnfn convert_short16_sat_rte(short16);
+short16 __ovld __cnfn convert_short16_rtz(short16);
+short16 __ovld __cnfn convert_short16_sat_rtz(short16);
+short16 __ovld __cnfn convert_short16_rtp(short16);
+short16 __ovld __cnfn convert_short16_sat_rtp(short16);
+short16 __ovld __cnfn convert_short16_rtn(short16);
+short16 __ovld __cnfn convert_short16_sat_rtn(short16);
+short16 __ovld __cnfn convert_short16(short16);
+short16 __ovld __cnfn convert_short16_sat(short16);
+short16 __ovld __cnfn convert_short16_rte(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rte(ushort16);
+short16 __ovld __cnfn convert_short16_rtz(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rtz(ushort16);
+short16 __ovld __cnfn convert_short16_rtp(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rtp(ushort16);
+short16 __ovld __cnfn convert_short16_rtn(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rtn(ushort16);
+short16 __ovld __cnfn convert_short16(ushort16);
+short16 __ovld __cnfn convert_short16_sat(ushort16);
+short16 __ovld __cnfn convert_short16_rte(int16);
+short16 __ovld __cnfn convert_short16_sat_rte(int16);
+short16 __ovld __cnfn convert_short16_rtz(int16);
+short16 __ovld __cnfn convert_short16_sat_rtz(int16);
+short16 __ovld __cnfn convert_short16_rtp(int16);
+short16 __ovld __cnfn convert_short16_sat_rtp(int16);
+short16 __ovld __cnfn convert_short16_rtn(int16);
+short16 __ovld __cnfn convert_short16_sat_rtn(int16);
+short16 __ovld __cnfn convert_short16(int16);
+short16 __ovld __cnfn convert_short16_sat(int16);
+short16 __ovld __cnfn convert_short16_rte(uint16);
+short16 __ovld __cnfn convert_short16_sat_rte(uint16);
+short16 __ovld __cnfn convert_short16_rtz(uint16);
+short16 __ovld __cnfn convert_short16_sat_rtz(uint16);
+short16 __ovld __cnfn convert_short16_rtp(uint16);
+short16 __ovld __cnfn convert_short16_sat_rtp(uint16);
+short16 __ovld __cnfn convert_short16_rtn(uint16);
+short16 __ovld __cnfn convert_short16_sat_rtn(uint16);
+short16 __ovld __cnfn convert_short16(uint16);
+short16 __ovld __cnfn convert_short16_sat(uint16);
+short16 __ovld __cnfn convert_short16_rte(long16);
+short16 __ovld __cnfn convert_short16_sat_rte(long16);
+short16 __ovld __cnfn convert_short16_rtz(long16);
+short16 __ovld __cnfn convert_short16_sat_rtz(long16);
+short16 __ovld __cnfn convert_short16_rtp(long16);
+short16 __ovld __cnfn convert_short16_sat_rtp(long16);
+short16 __ovld __cnfn convert_short16_rtn(long16);
+short16 __ovld __cnfn convert_short16_sat_rtn(long16);
+short16 __ovld __cnfn convert_short16(long16);
+short16 __ovld __cnfn convert_short16_sat(long16);
+short16 __ovld __cnfn convert_short16_rte(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rte(ulong16);
+short16 __ovld __cnfn convert_short16_rtz(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rtz(ulong16);
+short16 __ovld __cnfn convert_short16_rtp(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rtp(ulong16);
+short16 __ovld __cnfn convert_short16_rtn(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rtn(ulong16);
+short16 __ovld __cnfn convert_short16(ulong16);
+short16 __ovld __cnfn convert_short16_sat(ulong16);
+short16 __ovld __cnfn convert_short16_rte(float16);
+short16 __ovld __cnfn convert_short16_sat_rte(float16);
+short16 __ovld __cnfn convert_short16_rtz(float16);
+short16 __ovld __cnfn convert_short16_sat_rtz(float16);
+short16 __ovld __cnfn convert_short16_rtp(float16);
+short16 __ovld __cnfn convert_short16_sat_rtp(float16);
+short16 __ovld __cnfn convert_short16_rtn(float16);
+short16 __ovld __cnfn convert_short16_sat_rtn(float16);
+short16 __ovld __cnfn convert_short16(float16);
+short16 __ovld __cnfn convert_short16_sat(float16);
+ushort16 __ovld __cnfn convert_ushort16_rte(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(char16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(char16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(char16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(char16);
+ushort16 __ovld __cnfn convert_ushort16(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat(char16);
+ushort16 __ovld __cnfn convert_ushort16_rte(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(uchar16);
+ushort16 __ovld __cnfn convert_ushort16(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rte(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(short16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(short16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(short16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(short16);
+ushort16 __ovld __cnfn convert_ushort16(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat(short16);
+ushort16 __ovld __cnfn convert_ushort16_rte(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(ushort16);
+ushort16 __ovld __cnfn convert_ushort16(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rte(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(int16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(int16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(int16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(int16);
+ushort16 __ovld __cnfn convert_ushort16(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat(int16);
+ushort16 __ovld __cnfn convert_ushort16_rte(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(uint16);
+ushort16 __ovld __cnfn convert_ushort16(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rte(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(long16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(long16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(long16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(long16);
+ushort16 __ovld __cnfn convert_ushort16(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat(long16);
+ushort16 __ovld __cnfn convert_ushort16_rte(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(ulong16);
+ushort16 __ovld __cnfn convert_ushort16(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rte(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(float16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(float16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(float16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(float16);
+ushort16 __ovld __cnfn convert_ushort16(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat(float16);
+int16 __ovld __cnfn convert_int16_rte(char16);
+int16 __ovld __cnfn convert_int16_sat_rte(char16);
+int16 __ovld __cnfn convert_int16_rtz(char16);
+int16 __ovld __cnfn convert_int16_sat_rtz(char16);
+int16 __ovld __cnfn convert_int16_rtp(char16);
+int16 __ovld __cnfn convert_int16_sat_rtp(char16);
+int16 __ovld __cnfn convert_int16_rtn(char16);
+int16 __ovld __cnfn convert_int16_sat_rtn(char16);
+int16 __ovld __cnfn convert_int16(char16);
+int16 __ovld __cnfn convert_int16_sat(char16);
+int16 __ovld __cnfn convert_int16_rte(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rte(uchar16);
+int16 __ovld __cnfn convert_int16_rtz(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rtz(uchar16);
+int16 __ovld __cnfn convert_int16_rtp(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rtp(uchar16);
+int16 __ovld __cnfn convert_int16_rtn(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rtn(uchar16);
+int16 __ovld __cnfn convert_int16(uchar16);
+int16 __ovld __cnfn convert_int16_sat(uchar16);
+int16 __ovld __cnfn convert_int16_rte(short16);
+int16 __ovld __cnfn convert_int16_sat_rte(short16);
+int16 __ovld __cnfn convert_int16_rtz(short16);
+int16 __ovld __cnfn convert_int16_sat_rtz(short16);
+int16 __ovld __cnfn convert_int16_rtp(short16);
+int16 __ovld __cnfn convert_int16_sat_rtp(short16);
+int16 __ovld __cnfn convert_int16_rtn(short16);
+int16 __ovld __cnfn convert_int16_sat_rtn(short16);
+int16 __ovld __cnfn convert_int16(short16);
+int16 __ovld __cnfn convert_int16_sat(short16);
+int16 __ovld __cnfn convert_int16_rte(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rte(ushort16);
+int16 __ovld __cnfn convert_int16_rtz(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rtz(ushort16);
+int16 __ovld __cnfn convert_int16_rtp(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rtp(ushort16);
+int16 __ovld __cnfn convert_int16_rtn(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rtn(ushort16);
+int16 __ovld __cnfn convert_int16(ushort16);
+int16 __ovld __cnfn convert_int16_sat(ushort16);
+int16 __ovld __cnfn convert_int16_rte(int16);
+int16 __ovld __cnfn convert_int16_sat_rte(int16);
+int16 __ovld __cnfn convert_int16_rtz(int16);
+int16 __ovld __cnfn convert_int16_sat_rtz(int16);
+int16 __ovld __cnfn convert_int16_rtp(int16);
+int16 __ovld __cnfn convert_int16_sat_rtp(int16);
+int16 __ovld __cnfn convert_int16_rtn(int16);
+int16 __ovld __cnfn convert_int16_sat_rtn(int16);
+int16 __ovld __cnfn convert_int16(int16);
+int16 __ovld __cnfn convert_int16_sat(int16);
+int16 __ovld __cnfn convert_int16_rte(uint16);
+int16 __ovld __cnfn convert_int16_sat_rte(uint16);
+int16 __ovld __cnfn convert_int16_rtz(uint16);
+int16 __ovld __cnfn convert_int16_sat_rtz(uint16);
+int16 __ovld __cnfn convert_int16_rtp(uint16);
+int16 __ovld __cnfn convert_int16_sat_rtp(uint16);
+int16 __ovld __cnfn convert_int16_rtn(uint16);
+int16 __ovld __cnfn convert_int16_sat_rtn(uint16);
+int16 __ovld __cnfn convert_int16(uint16);
+int16 __ovld __cnfn convert_int16_sat(uint16);
+int16 __ovld __cnfn convert_int16_rte(long16);
+int16 __ovld __cnfn convert_int16_sat_rte(long16);
+int16 __ovld __cnfn convert_int16_rtz(long16);
+int16 __ovld __cnfn convert_int16_sat_rtz(long16);
+int16 __ovld __cnfn convert_int16_rtp(long16);
+int16 __ovld __cnfn convert_int16_sat_rtp(long16);
+int16 __ovld __cnfn convert_int16_rtn(long16);
+int16 __ovld __cnfn convert_int16_sat_rtn(long16);
+int16 __ovld __cnfn convert_int16(long16);
+int16 __ovld __cnfn convert_int16_sat(long16);
+int16 __ovld __cnfn convert_int16_rte(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rte(ulong16);
+int16 __ovld __cnfn convert_int16_rtz(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rtz(ulong16);
+int16 __ovld __cnfn convert_int16_rtp(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rtp(ulong16);
+int16 __ovld __cnfn convert_int16_rtn(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rtn(ulong16);
+int16 __ovld __cnfn convert_int16(ulong16);
+int16 __ovld __cnfn convert_int16_sat(ulong16);
+int16 __ovld __cnfn convert_int16_rte(float16);
+int16 __ovld __cnfn convert_int16_sat_rte(float16);
+int16 __ovld __cnfn convert_int16_rtz(float16);
+int16 __ovld __cnfn convert_int16_sat_rtz(float16);
+int16 __ovld __cnfn convert_int16_rtp(float16);
+int16 __ovld __cnfn convert_int16_sat_rtp(float16);
+int16 __ovld __cnfn convert_int16_rtn(float16);
+int16 __ovld __cnfn convert_int16_sat_rtn(float16);
+int16 __ovld __cnfn convert_int16(float16);
+int16 __ovld __cnfn convert_int16_sat(float16);
+uint16 __ovld __cnfn convert_uint16_rte(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(char16);
+uint16 __ovld __cnfn convert_uint16_rtz(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(char16);
+uint16 __ovld __cnfn convert_uint16_rtp(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(char16);
+uint16 __ovld __cnfn convert_uint16_rtn(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(char16);
+uint16 __ovld __cnfn convert_uint16(char16);
+uint16 __ovld __cnfn convert_uint16_sat(char16);
+uint16 __ovld __cnfn convert_uint16_rte(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(uchar16);
+uint16 __ovld __cnfn convert_uint16_rtz(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(uchar16);
+uint16 __ovld __cnfn convert_uint16_rtp(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(uchar16);
+uint16 __ovld __cnfn convert_uint16_rtn(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(uchar16);
+uint16 __ovld __cnfn convert_uint16(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat(uchar16);
+uint16 __ovld __cnfn convert_uint16_rte(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(short16);
+uint16 __ovld __cnfn convert_uint16_rtz(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(short16);
+uint16 __ovld __cnfn convert_uint16_rtp(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(short16);
+uint16 __ovld __cnfn convert_uint16_rtn(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(short16);
+uint16 __ovld __cnfn convert_uint16(short16);
+uint16 __ovld __cnfn convert_uint16_sat(short16);
+uint16 __ovld __cnfn convert_uint16_rte(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(ushort16);
+uint16 __ovld __cnfn convert_uint16_rtz(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(ushort16);
+uint16 __ovld __cnfn convert_uint16_rtp(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(ushort16);
+uint16 __ovld __cnfn convert_uint16_rtn(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(ushort16);
+uint16 __ovld __cnfn convert_uint16(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat(ushort16);
+uint16 __ovld __cnfn convert_uint16_rte(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(int16);
+uint16 __ovld __cnfn convert_uint16_rtz(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(int16);
+uint16 __ovld __cnfn convert_uint16_rtp(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(int16);
+uint16 __ovld __cnfn convert_uint16_rtn(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(int16);
+uint16 __ovld __cnfn convert_uint16(int16);
+uint16 __ovld __cnfn convert_uint16_sat(int16);
+uint16 __ovld __cnfn convert_uint16_rte(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(uint16);
+uint16 __ovld __cnfn convert_uint16_rtz(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(uint16);
+uint16 __ovld __cnfn convert_uint16_rtp(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(uint16);
+uint16 __ovld __cnfn convert_uint16_rtn(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(uint16);
+uint16 __ovld __cnfn convert_uint16(uint16);
+uint16 __ovld __cnfn convert_uint16_sat(uint16);
+uint16 __ovld __cnfn convert_uint16_rte(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(long16);
+uint16 __ovld __cnfn convert_uint16_rtz(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(long16);
+uint16 __ovld __cnfn convert_uint16_rtp(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(long16);
+uint16 __ovld __cnfn convert_uint16_rtn(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(long16);
+uint16 __ovld __cnfn convert_uint16(long16);
+uint16 __ovld __cnfn convert_uint16_sat(long16);
+uint16 __ovld __cnfn convert_uint16_rte(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(ulong16);
+uint16 __ovld __cnfn convert_uint16_rtz(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(ulong16);
+uint16 __ovld __cnfn convert_uint16_rtp(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(ulong16);
+uint16 __ovld __cnfn convert_uint16_rtn(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(ulong16);
+uint16 __ovld __cnfn convert_uint16(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat(ulong16);
+uint16 __ovld __cnfn convert_uint16_rte(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(float16);
+uint16 __ovld __cnfn convert_uint16_rtz(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(float16);
+uint16 __ovld __cnfn convert_uint16_rtp(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(float16);
+uint16 __ovld __cnfn convert_uint16_rtn(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(float16);
+uint16 __ovld __cnfn convert_uint16(float16);
+uint16 __ovld __cnfn convert_uint16_sat(float16);
+long16 __ovld __cnfn convert_long16_rte(char16);
+long16 __ovld __cnfn convert_long16_sat_rte(char16);
+long16 __ovld __cnfn convert_long16_rtz(char16);
+long16 __ovld __cnfn convert_long16_sat_rtz(char16);
+long16 __ovld __cnfn convert_long16_rtp(char16);
+long16 __ovld __cnfn convert_long16_sat_rtp(char16);
+long16 __ovld __cnfn convert_long16_rtn(char16);
+long16 __ovld __cnfn convert_long16_sat_rtn(char16);
+long16 __ovld __cnfn convert_long16(char16);
+long16 __ovld __cnfn convert_long16_sat(char16);
+long16 __ovld __cnfn convert_long16_rte(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rte(uchar16);
+long16 __ovld __cnfn convert_long16_rtz(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rtz(uchar16);
+long16 __ovld __cnfn convert_long16_rtp(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rtp(uchar16);
+long16 __ovld __cnfn convert_long16_rtn(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rtn(uchar16);
+long16 __ovld __cnfn convert_long16(uchar16);
+long16 __ovld __cnfn convert_long16_sat(uchar16);
+long16 __ovld __cnfn convert_long16_rte(short16);
+long16 __ovld __cnfn convert_long16_sat_rte(short16);
+long16 __ovld __cnfn convert_long16_rtz(short16);
+long16 __ovld __cnfn convert_long16_sat_rtz(short16);
+long16 __ovld __cnfn convert_long16_rtp(short16);
+long16 __ovld __cnfn convert_long16_sat_rtp(short16);
+long16 __ovld __cnfn convert_long16_rtn(short16);
+long16 __ovld __cnfn convert_long16_sat_rtn(short16);
+long16 __ovld __cnfn convert_long16(short16);
+long16 __ovld __cnfn convert_long16_sat(short16);
+long16 __ovld __cnfn convert_long16_rte(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rte(ushort16);
+long16 __ovld __cnfn convert_long16_rtz(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rtz(ushort16);
+long16 __ovld __cnfn convert_long16_rtp(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rtp(ushort16);
+long16 __ovld __cnfn convert_long16_rtn(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rtn(ushort16);
+long16 __ovld __cnfn convert_long16(ushort16);
+long16 __ovld __cnfn convert_long16_sat(ushort16);
+long16 __ovld __cnfn convert_long16_rte(int16);
+long16 __ovld __cnfn convert_long16_sat_rte(int16);
+long16 __ovld __cnfn convert_long16_rtz(int16);
+long16 __ovld __cnfn convert_long16_sat_rtz(int16);
+long16 __ovld __cnfn convert_long16_rtp(int16);
+long16 __ovld __cnfn convert_long16_sat_rtp(int16);
+long16 __ovld __cnfn convert_long16_rtn(int16);
+long16 __ovld __cnfn convert_long16_sat_rtn(int16);
+long16 __ovld __cnfn convert_long16(int16);
+long16 __ovld __cnfn convert_long16_sat(int16);
+long16 __ovld __cnfn convert_long16_rte(uint16);
+long16 __ovld __cnfn convert_long16_sat_rte(uint16);
+long16 __ovld __cnfn convert_long16_rtz(uint16);
+long16 __ovld __cnfn convert_long16_sat_rtz(uint16);
+long16 __ovld __cnfn convert_long16_rtp(uint16);
+long16 __ovld __cnfn convert_long16_sat_rtp(uint16);
+long16 __ovld __cnfn convert_long16_rtn(uint16);
+long16 __ovld __cnfn convert_long16_sat_rtn(uint16);
+long16 __ovld __cnfn convert_long16(uint16);
+long16 __ovld __cnfn convert_long16_sat(uint16);
+long16 __ovld __cnfn convert_long16_rte(long16);
+long16 __ovld __cnfn convert_long16_sat_rte(long16);
+long16 __ovld __cnfn convert_long16_rtz(long16);
+long16 __ovld __cnfn convert_long16_sat_rtz(long16);
+long16 __ovld __cnfn convert_long16_rtp(long16);
+long16 __ovld __cnfn convert_long16_sat_rtp(long16);
+long16 __ovld __cnfn convert_long16_rtn(long16);
+long16 __ovld __cnfn convert_long16_sat_rtn(long16);
+long16 __ovld __cnfn convert_long16(long16);
+long16 __ovld __cnfn convert_long16_sat(long16);
+long16 __ovld __cnfn convert_long16_rte(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rte(ulong16);
+long16 __ovld __cnfn convert_long16_rtz(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rtz(ulong16);
+long16 __ovld __cnfn convert_long16_rtp(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rtp(ulong16);
+long16 __ovld __cnfn convert_long16_rtn(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rtn(ulong16);
+long16 __ovld __cnfn convert_long16(ulong16);
+long16 __ovld __cnfn convert_long16_sat(ulong16);
+long16 __ovld __cnfn convert_long16_rte(float16);
+long16 __ovld __cnfn convert_long16_sat_rte(float16);
+long16 __ovld __cnfn convert_long16_rtz(float16);
+long16 __ovld __cnfn convert_long16_sat_rtz(float16);
+long16 __ovld __cnfn convert_long16_rtp(float16);
+long16 __ovld __cnfn convert_long16_sat_rtp(float16);
+long16 __ovld __cnfn convert_long16_rtn(float16);
+long16 __ovld __cnfn convert_long16_sat_rtn(float16);
+long16 __ovld __cnfn convert_long16(float16);
+long16 __ovld __cnfn convert_long16_sat(float16);
+ulong16 __ovld __cnfn convert_ulong16_rte(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(char16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(char16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(char16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(char16);
+ulong16 __ovld __cnfn convert_ulong16(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat(char16);
+ulong16 __ovld __cnfn convert_ulong16_rte(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(uchar16);
+ulong16 __ovld __cnfn convert_ulong16(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rte(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(short16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(short16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(short16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(short16);
+ulong16 __ovld __cnfn convert_ulong16(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat(short16);
+ulong16 __ovld __cnfn convert_ulong16_rte(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(ushort16);
+ulong16 __ovld __cnfn convert_ulong16(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rte(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(int16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(int16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(int16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(int16);
+ulong16 __ovld __cnfn convert_ulong16(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat(int16);
+ulong16 __ovld __cnfn convert_ulong16_rte(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(uint16);
+ulong16 __ovld __cnfn convert_ulong16(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rte(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(long16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(long16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(long16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(long16);
+ulong16 __ovld __cnfn convert_ulong16(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat(long16);
+ulong16 __ovld __cnfn convert_ulong16_rte(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(ulong16);
+ulong16 __ovld __cnfn convert_ulong16(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rte(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(float16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(float16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(float16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(float16);
+ulong16 __ovld __cnfn convert_ulong16(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat(float16);
+float16 __ovld __cnfn convert_float16_rte(char16);
+float16 __ovld __cnfn convert_float16_rtz(char16);
+float16 __ovld __cnfn convert_float16_rtp(char16);
+float16 __ovld __cnfn convert_float16_rtn(char16);
+float16 __ovld __cnfn convert_float16(char16);
+float16 __ovld __cnfn convert_float16_rte(uchar16);
+float16 __ovld __cnfn convert_float16_rtz(uchar16);
+float16 __ovld __cnfn convert_float16_rtp(uchar16);
+float16 __ovld __cnfn convert_float16_rtn(uchar16);
+float16 __ovld __cnfn convert_float16(uchar16);
+float16 __ovld __cnfn convert_float16_rte(short16);
+float16 __ovld __cnfn convert_float16_rtz(short16);
+float16 __ovld __cnfn convert_float16_rtp(short16);
+float16 __ovld __cnfn convert_float16_rtn(short16);
+float16 __ovld __cnfn convert_float16(short16);
+float16 __ovld __cnfn convert_float16_rte(ushort16);
+float16 __ovld __cnfn convert_float16_rtz(ushort16);
+float16 __ovld __cnfn convert_float16_rtp(ushort16);
+float16 __ovld __cnfn convert_float16_rtn(ushort16);
+float16 __ovld __cnfn convert_float16(ushort16);
+float16 __ovld __cnfn convert_float16_rte(int16);
+float16 __ovld __cnfn convert_float16_rtz(int16);
+float16 __ovld __cnfn convert_float16_rtp(int16);
+float16 __ovld __cnfn convert_float16_rtn(int16);
+float16 __ovld __cnfn convert_float16(int16);
+float16 __ovld __cnfn convert_float16_rte(uint16);
+float16 __ovld __cnfn convert_float16_rtz(uint16);
+float16 __ovld __cnfn convert_float16_rtp(uint16);
+float16 __ovld __cnfn convert_float16_rtn(uint16);
+float16 __ovld __cnfn convert_float16(uint16);
+float16 __ovld __cnfn convert_float16_rte(long16);
+float16 __ovld __cnfn convert_float16_rtz(long16);
+float16 __ovld __cnfn convert_float16_rtp(long16);
+float16 __ovld __cnfn convert_float16_rtn(long16);
+float16 __ovld __cnfn convert_float16(long16);
+float16 __ovld __cnfn convert_float16_rte(ulong16);
+float16 __ovld __cnfn convert_float16_rtz(ulong16);
+float16 __ovld __cnfn convert_float16_rtp(ulong16);
+float16 __ovld __cnfn convert_float16_rtn(ulong16);
+float16 __ovld __cnfn convert_float16(ulong16);
+float16 __ovld __cnfn convert_float16_rte(float16);
+float16 __ovld __cnfn convert_float16_rtz(float16);
+float16 __ovld __cnfn convert_float16_rtp(float16);
+float16 __ovld __cnfn convert_float16_rtn(float16);
+float16 __ovld __cnfn convert_float16(float16);
+
+// Conversions with double data type parameters or return value.
+
+#ifdef cl_khr_fp64
+char __ovld __cnfn convert_char(double);
+char __ovld __cnfn convert_char_rte(double);
+char __ovld __cnfn convert_char_rtn(double);
+char __ovld __cnfn convert_char_rtp(double);
+char __ovld __cnfn convert_char_rtz(double);
+char __ovld __cnfn convert_char_sat(double);
+char __ovld __cnfn convert_char_sat_rte(double);
+char __ovld __cnfn convert_char_sat_rtn(double);
+char __ovld __cnfn convert_char_sat_rtp(double);
+char __ovld __cnfn convert_char_sat_rtz(double);
+char2 __ovld __cnfn convert_char2(double2);
+char2 __ovld __cnfn convert_char2_rte(double2);
+char2 __ovld __cnfn convert_char2_rtn(double2);
+char2 __ovld __cnfn convert_char2_rtp(double2);
+char2 __ovld __cnfn convert_char2_rtz(double2);
+char2 __ovld __cnfn convert_char2_sat(double2);
+char2 __ovld __cnfn convert_char2_sat_rte(double2);
+char2 __ovld __cnfn convert_char2_sat_rtn(double2);
+char2 __ovld __cnfn convert_char2_sat_rtp(double2);
+char2 __ovld __cnfn convert_char2_sat_rtz(double2);
+char3 __ovld __cnfn convert_char3(double3);
+char3 __ovld __cnfn convert_char3_rte(double3);
+char3 __ovld __cnfn convert_char3_rtn(double3);
+char3 __ovld __cnfn convert_char3_rtp(double3);
+char3 __ovld __cnfn convert_char3_rtz(double3);
+char3 __ovld __cnfn convert_char3_sat(double3);
+char3 __ovld __cnfn convert_char3_sat_rte(double3);
+char3 __ovld __cnfn convert_char3_sat_rtn(double3);
+char3 __ovld __cnfn convert_char3_sat_rtp(double3);
+char3 __ovld __cnfn convert_char3_sat_rtz(double3);
+char4 __ovld __cnfn convert_char4(double4);
+char4 __ovld __cnfn convert_char4_rte(double4);
+char4 __ovld __cnfn convert_char4_rtn(double4);
+char4 __ovld __cnfn convert_char4_rtp(double4);
+char4 __ovld __cnfn convert_char4_rtz(double4);
+char4 __ovld __cnfn convert_char4_sat(double4);
+char4 __ovld __cnfn convert_char4_sat_rte(double4);
+char4 __ovld __cnfn convert_char4_sat_rtn(double4);
+char4 __ovld __cnfn convert_char4_sat_rtp(double4);
+char4 __ovld __cnfn convert_char4_sat_rtz(double4);
+char8 __ovld __cnfn convert_char8(double8);
+char8 __ovld __cnfn convert_char8_rte(double8);
+char8 __ovld __cnfn convert_char8_rtn(double8);
+char8 __ovld __cnfn convert_char8_rtp(double8);
+char8 __ovld __cnfn convert_char8_rtz(double8);
+char8 __ovld __cnfn convert_char8_sat(double8);
+char8 __ovld __cnfn convert_char8_sat_rte(double8);
+char8 __ovld __cnfn convert_char8_sat_rtn(double8);
+char8 __ovld __cnfn convert_char8_sat_rtp(double8);
+char8 __ovld __cnfn convert_char8_sat_rtz(double8);
+char16 __ovld __cnfn convert_char16(double16);
+char16 __ovld __cnfn convert_char16_rte(double16);
+char16 __ovld __cnfn convert_char16_rtn(double16);
+char16 __ovld __cnfn convert_char16_rtp(double16);
+char16 __ovld __cnfn convert_char16_rtz(double16);
+char16 __ovld __cnfn convert_char16_sat(double16);
+char16 __ovld __cnfn convert_char16_sat_rte(double16);
+char16 __ovld __cnfn convert_char16_sat_rtn(double16);
+char16 __ovld __cnfn convert_char16_sat_rtp(double16);
+char16 __ovld __cnfn convert_char16_sat_rtz(double16);
+
+uchar __ovld __cnfn convert_uchar(double);
+uchar __ovld __cnfn convert_uchar_rte(double);
+uchar __ovld __cnfn convert_uchar_rtn(double);
+uchar __ovld __cnfn convert_uchar_rtp(double);
+uchar __ovld __cnfn convert_uchar_rtz(double);
+uchar __ovld __cnfn convert_uchar_sat(double);
+uchar __ovld __cnfn convert_uchar_sat_rte(double);
+uchar __ovld __cnfn convert_uchar_sat_rtn(double);
+uchar __ovld __cnfn convert_uchar_sat_rtp(double);
+uchar __ovld __cnfn convert_uchar_sat_rtz(double);
+uchar2 __ovld __cnfn convert_uchar2(double2);
+uchar2 __ovld __cnfn convert_uchar2_rte(double2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(double2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(double2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(double2);
+uchar3 __ovld __cnfn convert_uchar3(double3);
+uchar3 __ovld __cnfn convert_uchar3_rte(double3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(double3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(double3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(double3);
+uchar4 __ovld __cnfn convert_uchar4(double4);
+uchar4 __ovld __cnfn convert_uchar4_rte(double4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(double4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(double4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(double4);
+uchar8 __ovld __cnfn convert_uchar8(double8);
+uchar8 __ovld __cnfn convert_uchar8_rte(double8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(double8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(double8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(double8);
+uchar16 __ovld __cnfn convert_uchar16(double16);
+uchar16 __ovld __cnfn convert_uchar16_rte(double16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(double16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(double16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(double16);
+
+short __ovld __cnfn convert_short(double);
+short __ovld __cnfn convert_short_rte(double);
+short __ovld __cnfn convert_short_rtn(double);
+short __ovld __cnfn convert_short_rtp(double);
+short __ovld __cnfn convert_short_rtz(double);
+short __ovld __cnfn convert_short_sat(double);
+short __ovld __cnfn convert_short_sat_rte(double);
+short __ovld __cnfn convert_short_sat_rtn(double);
+short __ovld __cnfn convert_short_sat_rtp(double);
+short __ovld __cnfn convert_short_sat_rtz(double);
+short2 __ovld __cnfn convert_short2(double2);
+short2 __ovld __cnfn convert_short2_rte(double2);
+short2 __ovld __cnfn convert_short2_rtn(double2);
+short2 __ovld __cnfn convert_short2_rtp(double2);
+short2 __ovld __cnfn convert_short2_rtz(double2);
+short2 __ovld __cnfn convert_short2_sat(double2);
+short2 __ovld __cnfn convert_short2_sat_rte(double2);
+short2 __ovld __cnfn convert_short2_sat_rtn(double2);
+short2 __ovld __cnfn convert_short2_sat_rtp(double2);
+short2 __ovld __cnfn convert_short2_sat_rtz(double2);
+short3 __ovld __cnfn convert_short3(double3);
+short3 __ovld __cnfn convert_short3_rte(double3);
+short3 __ovld __cnfn convert_short3_rtn(double3);
+short3 __ovld __cnfn convert_short3_rtp(double3);
+short3 __ovld __cnfn convert_short3_rtz(double3);
+short3 __ovld __cnfn convert_short3_sat(double3);
+short3 __ovld __cnfn convert_short3_sat_rte(double3);
+short3 __ovld __cnfn convert_short3_sat_rtn(double3);
+short3 __ovld __cnfn convert_short3_sat_rtp(double3);
+short3 __ovld __cnfn convert_short3_sat_rtz(double3);
+short4 __ovld __cnfn convert_short4(double4);
+short4 __ovld __cnfn convert_short4_rte(double4);
+short4 __ovld __cnfn convert_short4_rtn(double4);
+short4 __ovld __cnfn convert_short4_rtp(double4);
+short4 __ovld __cnfn convert_short4_rtz(double4);
+short4 __ovld __cnfn convert_short4_sat(double4);
+short4 __ovld __cnfn convert_short4_sat_rte(double4);
+short4 __ovld __cnfn convert_short4_sat_rtn(double4);
+short4 __ovld __cnfn convert_short4_sat_rtp(double4);
+short4 __ovld __cnfn convert_short4_sat_rtz(double4);
+short8 __ovld __cnfn convert_short8(double8);
+short8 __ovld __cnfn convert_short8_rte(double8);
+short8 __ovld __cnfn convert_short8_rtn(double8);
+short8 __ovld __cnfn convert_short8_rtp(double8);
+short8 __ovld __cnfn convert_short8_rtz(double8);
+short8 __ovld __cnfn convert_short8_sat(double8);
+short8 __ovld __cnfn convert_short8_sat_rte(double8);
+short8 __ovld __cnfn convert_short8_sat_rtn(double8);
+short8 __ovld __cnfn convert_short8_sat_rtp(double8);
+short8 __ovld __cnfn convert_short8_sat_rtz(double8);
+short16 __ovld __cnfn convert_short16(double16);
+short16 __ovld __cnfn convert_short16_rte(double16);
+short16 __ovld __cnfn convert_short16_rtn(double16);
+short16 __ovld __cnfn convert_short16_rtp(double16);
+short16 __ovld __cnfn convert_short16_rtz(double16);
+short16 __ovld __cnfn convert_short16_sat(double16);
+short16 __ovld __cnfn convert_short16_sat_rte(double16);
+short16 __ovld __cnfn convert_short16_sat_rtn(double16);
+short16 __ovld __cnfn convert_short16_sat_rtp(double16);
+short16 __ovld __cnfn convert_short16_sat_rtz(double16);
+
+ushort __ovld __cnfn convert_ushort(double);
+ushort __ovld __cnfn convert_ushort_rte(double);
+ushort __ovld __cnfn convert_ushort_rtn(double);
+ushort __ovld __cnfn convert_ushort_rtp(double);
+ushort __ovld __cnfn convert_ushort_rtz(double);
+ushort __ovld __cnfn convert_ushort_sat(double);
+ushort __ovld __cnfn convert_ushort_sat_rte(double);
+ushort __ovld __cnfn convert_ushort_sat_rtn(double);
+ushort __ovld __cnfn convert_ushort_sat_rtp(double);
+ushort __ovld __cnfn convert_ushort_sat_rtz(double);
+ushort2 __ovld __cnfn convert_ushort2(double2);
+ushort2 __ovld __cnfn convert_ushort2_rte(double2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(double2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(double2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(double2);
+ushort3 __ovld __cnfn convert_ushort3(double3);
+ushort3 __ovld __cnfn convert_ushort3_rte(double3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(double3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(double3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(double3);
+ushort4 __ovld __cnfn convert_ushort4(double4);
+ushort4 __ovld __cnfn convert_ushort4_rte(double4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(double4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(double4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(double4);
+ushort8 __ovld __cnfn convert_ushort8(double8);
+ushort8 __ovld __cnfn convert_ushort8_rte(double8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(double8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(double8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(double8);
+ushort16 __ovld __cnfn convert_ushort16(double16);
+ushort16 __ovld __cnfn convert_ushort16_rte(double16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(double16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(double16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(double16);
+
+int __ovld __cnfn convert_int(double);
+int __ovld __cnfn convert_int_rte(double);
+int __ovld __cnfn convert_int_rtn(double);
+int __ovld __cnfn convert_int_rtp(double);
+int __ovld __cnfn convert_int_rtz(double);
+int __ovld __cnfn convert_int_sat(double);
+int __ovld __cnfn convert_int_sat_rte(double);
+int __ovld __cnfn convert_int_sat_rtn(double);
+int __ovld __cnfn convert_int_sat_rtp(double);
+int __ovld __cnfn convert_int_sat_rtz(double);
+int2 __ovld __cnfn convert_int2(double2);
+int2 __ovld __cnfn convert_int2_rte(double2);
+int2 __ovld __cnfn convert_int2_rtn(double2);
+int2 __ovld __cnfn convert_int2_rtp(double2);
+int2 __ovld __cnfn convert_int2_rtz(double2);
+int2 __ovld __cnfn convert_int2_sat(double2);
+int2 __ovld __cnfn convert_int2_sat_rte(double2);
+int2 __ovld __cnfn convert_int2_sat_rtn(double2);
+int2 __ovld __cnfn convert_int2_sat_rtp(double2);
+int2 __ovld __cnfn convert_int2_sat_rtz(double2);
+int3 __ovld __cnfn convert_int3(double3);
+int3 __ovld __cnfn convert_int3_rte(double3);
+int3 __ovld __cnfn convert_int3_rtn(double3);
+int3 __ovld __cnfn convert_int3_rtp(double3);
+int3 __ovld __cnfn convert_int3_rtz(double3);
+int3 __ovld __cnfn convert_int3_sat(double3);
+int3 __ovld __cnfn convert_int3_sat_rte(double3);
+int3 __ovld __cnfn convert_int3_sat_rtn(double3);
+int3 __ovld __cnfn convert_int3_sat_rtp(double3);
+int3 __ovld __cnfn convert_int3_sat_rtz(double3);
+int4 __ovld __cnfn convert_int4(double4);
+int4 __ovld __cnfn convert_int4_rte(double4);
+int4 __ovld __cnfn convert_int4_rtn(double4);
+int4 __ovld __cnfn convert_int4_rtp(double4);
+int4 __ovld __cnfn convert_int4_rtz(double4);
+int4 __ovld __cnfn convert_int4_sat(double4);
+int4 __ovld __cnfn convert_int4_sat_rte(double4);
+int4 __ovld __cnfn convert_int4_sat_rtn(double4);
+int4 __ovld __cnfn convert_int4_sat_rtp(double4);
+int4 __ovld __cnfn convert_int4_sat_rtz(double4);
+int8 __ovld __cnfn convert_int8(double8);
+int8 __ovld __cnfn convert_int8_rte(double8);
+int8 __ovld __cnfn convert_int8_rtn(double8);
+int8 __ovld __cnfn convert_int8_rtp(double8);
+int8 __ovld __cnfn convert_int8_rtz(double8);
+int8 __ovld __cnfn convert_int8_sat(double8);
+int8 __ovld __cnfn convert_int8_sat_rte(double8);
+int8 __ovld __cnfn convert_int8_sat_rtn(double8);
+int8 __ovld __cnfn convert_int8_sat_rtp(double8);
+int8 __ovld __cnfn convert_int8_sat_rtz(double8);
+int16 __ovld __cnfn convert_int16(double16);
+int16 __ovld __cnfn convert_int16_rte(double16);
+int16 __ovld __cnfn convert_int16_rtn(double16);
+int16 __ovld __cnfn convert_int16_rtp(double16);
+int16 __ovld __cnfn convert_int16_rtz(double16);
+int16 __ovld __cnfn convert_int16_sat(double16);
+int16 __ovld __cnfn convert_int16_sat_rte(double16);
+int16 __ovld __cnfn convert_int16_sat_rtn(double16);
+int16 __ovld __cnfn convert_int16_sat_rtp(double16);
+int16 __ovld __cnfn convert_int16_sat_rtz(double16);
+
+uint __ovld __cnfn convert_uint(double);
+uint __ovld __cnfn convert_uint_rte(double);
+uint __ovld __cnfn convert_uint_rtn(double);
+uint __ovld __cnfn convert_uint_rtp(double);
+uint __ovld __cnfn convert_uint_rtz(double);
+uint __ovld __cnfn convert_uint_sat(double);
+uint __ovld __cnfn convert_uint_sat_rte(double);
+uint __ovld __cnfn convert_uint_sat_rtn(double);
+uint __ovld __cnfn convert_uint_sat_rtp(double);
+uint __ovld __cnfn convert_uint_sat_rtz(double);
+uint2 __ovld __cnfn convert_uint2(double2);
+uint2 __ovld __cnfn convert_uint2_rte(double2);
+uint2 __ovld __cnfn convert_uint2_rtn(double2);
+uint2 __ovld __cnfn convert_uint2_rtp(double2);
+uint2 __ovld __cnfn convert_uint2_rtz(double2);
+uint2 __ovld __cnfn convert_uint2_sat(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(double2);
+uint3 __ovld __cnfn convert_uint3(double3);
+uint3 __ovld __cnfn convert_uint3_rte(double3);
+uint3 __ovld __cnfn convert_uint3_rtn(double3);
+uint3 __ovld __cnfn convert_uint3_rtp(double3);
+uint3 __ovld __cnfn convert_uint3_rtz(double3);
+uint3 __ovld __cnfn convert_uint3_sat(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(double3);
+uint4 __ovld __cnfn convert_uint4(double4);
+uint4 __ovld __cnfn convert_uint4_rte(double4);
+uint4 __ovld __cnfn convert_uint4_rtn(double4);
+uint4 __ovld __cnfn convert_uint4_rtp(double4);
+uint4 __ovld __cnfn convert_uint4_rtz(double4);
+uint4 __ovld __cnfn convert_uint4_sat(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(double4);
+uint8 __ovld __cnfn convert_uint8(double8);
+uint8 __ovld __cnfn convert_uint8_rte(double8);
+uint8 __ovld __cnfn convert_uint8_rtn(double8);
+uint8 __ovld __cnfn convert_uint8_rtp(double8);
+uint8 __ovld __cnfn convert_uint8_rtz(double8);
+uint8 __ovld __cnfn convert_uint8_sat(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(double8);
+uint16 __ovld __cnfn convert_uint16(double16);
+uint16 __ovld __cnfn convert_uint16_rte(double16);
+uint16 __ovld __cnfn convert_uint16_rtn(double16);
+uint16 __ovld __cnfn convert_uint16_rtp(double16);
+uint16 __ovld __cnfn convert_uint16_rtz(double16);
+uint16 __ovld __cnfn convert_uint16_sat(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(double16);
+
+long __ovld __cnfn convert_long(double);
+long __ovld __cnfn convert_long_rte(double);
+long __ovld __cnfn convert_long_rtn(double);
+long __ovld __cnfn convert_long_rtp(double);
+long __ovld __cnfn convert_long_rtz(double);
+long __ovld __cnfn convert_long_sat(double);
+long __ovld __cnfn convert_long_sat_rte(double);
+long __ovld __cnfn convert_long_sat_rtn(double);
+long __ovld __cnfn convert_long_sat_rtp(double);
+long __ovld __cnfn convert_long_sat_rtz(double);
+long2 __ovld __cnfn convert_long2(double2);
+long2 __ovld __cnfn convert_long2_rte(double2);
+long2 __ovld __cnfn convert_long2_rtn(double2);
+long2 __ovld __cnfn convert_long2_rtp(double2);
+long2 __ovld __cnfn convert_long2_rtz(double2);
+long2 __ovld __cnfn convert_long2_sat(double2);
+long2 __ovld __cnfn convert_long2_sat_rte(double2);
+long2 __ovld __cnfn convert_long2_sat_rtn(double2);
+long2 __ovld __cnfn convert_long2_sat_rtp(double2);
+long2 __ovld __cnfn convert_long2_sat_rtz(double2);
+long3 __ovld __cnfn convert_long3(double3);
+long3 __ovld __cnfn convert_long3_rte(double3);
+long3 __ovld __cnfn convert_long3_rtn(double3);
+long3 __ovld __cnfn convert_long3_rtp(double3);
+long3 __ovld __cnfn convert_long3_rtz(double3);
+long3 __ovld __cnfn convert_long3_sat(double3);
+long3 __ovld __cnfn convert_long3_sat_rte(double3);
+long3 __ovld __cnfn convert_long3_sat_rtn(double3);
+long3 __ovld __cnfn convert_long3_sat_rtp(double3);
+long3 __ovld __cnfn convert_long3_sat_rtz(double3);
+long4 __ovld __cnfn convert_long4(double4);
+long4 __ovld __cnfn convert_long4_rte(double4);
+long4 __ovld __cnfn convert_long4_rtn(double4);
+long4 __ovld __cnfn convert_long4_rtp(double4);
+long4 __ovld __cnfn convert_long4_rtz(double4);
+long4 __ovld __cnfn convert_long4_sat(double4);
+long4 __ovld __cnfn convert_long4_sat_rte(double4);
+long4 __ovld __cnfn convert_long4_sat_rtn(double4);
+long4 __ovld __cnfn convert_long4_sat_rtp(double4);
+long4 __ovld __cnfn convert_long4_sat_rtz(double4);
+long8 __ovld __cnfn convert_long8(double8);
+long8 __ovld __cnfn convert_long8_rte(double8);
+long8 __ovld __cnfn convert_long8_rtn(double8);
+long8 __ovld __cnfn convert_long8_rtp(double8);
+long8 __ovld __cnfn convert_long8_rtz(double8);
+long8 __ovld __cnfn convert_long8_sat(double8);
+long8 __ovld __cnfn convert_long8_sat_rte(double8);
+long8 __ovld __cnfn convert_long8_sat_rtn(double8);
+long8 __ovld __cnfn convert_long8_sat_rtp(double8);
+long8 __ovld __cnfn convert_long8_sat_rtz(double8);
+long16 __ovld __cnfn convert_long16(double16);
+long16 __ovld __cnfn convert_long16_rte(double16);
+long16 __ovld __cnfn convert_long16_rtn(double16);
+long16 __ovld __cnfn convert_long16_rtp(double16);
+long16 __ovld __cnfn convert_long16_rtz(double16);
+long16 __ovld __cnfn convert_long16_sat(double16);
+long16 __ovld __cnfn convert_long16_sat_rte(double16);
+long16 __ovld __cnfn convert_long16_sat_rtn(double16);
+long16 __ovld __cnfn convert_long16_sat_rtp(double16);
+long16 __ovld __cnfn convert_long16_sat_rtz(double16);
+
+ulong __ovld __cnfn convert_ulong(double);
+ulong __ovld __cnfn convert_ulong_rte(double);
+ulong __ovld __cnfn convert_ulong_rtn(double);
+ulong __ovld __cnfn convert_ulong_rtp(double);
+ulong __ovld __cnfn convert_ulong_rtz(double);
+ulong __ovld __cnfn convert_ulong_sat(double);
+ulong __ovld __cnfn convert_ulong_sat_rte(double);
+ulong __ovld __cnfn convert_ulong_sat_rtn(double);
+ulong __ovld __cnfn convert_ulong_sat_rtp(double);
+ulong __ovld __cnfn convert_ulong_sat_rtz(double);
+ulong2 __ovld __cnfn convert_ulong2(double2);
+ulong2 __ovld __cnfn convert_ulong2_rte(double2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(double2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(double2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(double2);
+ulong3 __ovld __cnfn convert_ulong3(double3);
+ulong3 __ovld __cnfn convert_ulong3_rte(double3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(double3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(double3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(double3);
+ulong4 __ovld __cnfn convert_ulong4(double4);
+ulong4 __ovld __cnfn convert_ulong4_rte(double4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(double4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(double4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(double4);
+ulong8 __ovld __cnfn convert_ulong8(double8);
+ulong8 __ovld __cnfn convert_ulong8_rte(double8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(double8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(double8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(double8);
+ulong16 __ovld __cnfn convert_ulong16(double16);
+ulong16 __ovld __cnfn convert_ulong16_rte(double16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(double16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(double16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(double16);
+
+float __ovld __cnfn convert_float(double);
+float __ovld __cnfn convert_float_rte(double);
+float __ovld __cnfn convert_float_rtn(double);
+float __ovld __cnfn convert_float_rtp(double);
+float __ovld __cnfn convert_float_rtz(double);
+float2 __ovld __cnfn convert_float2(double2);
+float2 __ovld __cnfn convert_float2_rte(double2);
+float2 __ovld __cnfn convert_float2_rtn(double2);
+float2 __ovld __cnfn convert_float2_rtp(double2);
+float2 __ovld __cnfn convert_float2_rtz(double2);
+float3 __ovld __cnfn convert_float3(double3);
+float3 __ovld __cnfn convert_float3_rte(double3);
+float3 __ovld __cnfn convert_float3_rtn(double3);
+float3 __ovld __cnfn convert_float3_rtp(double3);
+float3 __ovld __cnfn convert_float3_rtz(double3);
+float4 __ovld __cnfn convert_float4(double4);
+float4 __ovld __cnfn convert_float4_rte(double4);
+float4 __ovld __cnfn convert_float4_rtn(double4);
+float4 __ovld __cnfn convert_float4_rtp(double4);
+float4 __ovld __cnfn convert_float4_rtz(double4);
+float8 __ovld __cnfn convert_float8(double8);
+float8 __ovld __cnfn convert_float8_rte(double8);
+float8 __ovld __cnfn convert_float8_rtn(double8);
+float8 __ovld __cnfn convert_float8_rtp(double8);
+float8 __ovld __cnfn convert_float8_rtz(double8);
+float16 __ovld __cnfn convert_float16(double16);
+float16 __ovld __cnfn convert_float16_rte(double16);
+float16 __ovld __cnfn convert_float16_rtn(double16);
+float16 __ovld __cnfn convert_float16_rtp(double16);
+float16 __ovld __cnfn convert_float16_rtz(double16);
+
+double __ovld __cnfn convert_double(char);
+double __ovld __cnfn convert_double(double);
+double __ovld __cnfn convert_double(float);
+double __ovld __cnfn convert_double(int);
+double __ovld __cnfn convert_double(long);
+double __ovld __cnfn convert_double(short);
+double __ovld __cnfn convert_double(uchar);
+double __ovld __cnfn convert_double(uint);
+double __ovld __cnfn convert_double(ulong);
+double __ovld __cnfn convert_double(ushort);
+double __ovld __cnfn convert_double_rte(char);
+double __ovld __cnfn convert_double_rte(double);
+double __ovld __cnfn convert_double_rte(float);
+double __ovld __cnfn convert_double_rte(int);
+double __ovld __cnfn convert_double_rte(long);
+double __ovld __cnfn convert_double_rte(short);
+double __ovld __cnfn convert_double_rte(uchar);
+double __ovld __cnfn convert_double_rte(uint);
+double __ovld __cnfn convert_double_rte(ulong);
+double __ovld __cnfn convert_double_rte(ushort);
+double __ovld __cnfn convert_double_rtn(char);
+double __ovld __cnfn convert_double_rtn(double);
+double __ovld __cnfn convert_double_rtn(float);
+double __ovld __cnfn convert_double_rtn(int);
+double __ovld __cnfn convert_double_rtn(long);
+double __ovld __cnfn convert_double_rtn(short);
+double __ovld __cnfn convert_double_rtn(uchar);
+double __ovld __cnfn convert_double_rtn(uint);
+double __ovld __cnfn convert_double_rtn(ulong);
+double __ovld __cnfn convert_double_rtn(ushort);
+double __ovld __cnfn convert_double_rtp(char);
+double __ovld __cnfn convert_double_rtp(double);
+double __ovld __cnfn convert_double_rtp(float);
+double __ovld __cnfn convert_double_rtp(int);
+double __ovld __cnfn convert_double_rtp(long);
+double __ovld __cnfn convert_double_rtp(short);
+double __ovld __cnfn convert_double_rtp(uchar);
+double __ovld __cnfn convert_double_rtp(uint);
+double __ovld __cnfn convert_double_rtp(ulong);
+double __ovld __cnfn convert_double_rtp(ushort);
+double __ovld __cnfn convert_double_rtz(char);
+double __ovld __cnfn convert_double_rtz(double);
+double __ovld __cnfn convert_double_rtz(float);
+double __ovld __cnfn convert_double_rtz(int);
+double __ovld __cnfn convert_double_rtz(long);
+double __ovld __cnfn convert_double_rtz(short);
+double __ovld __cnfn convert_double_rtz(uchar);
+double __ovld __cnfn convert_double_rtz(uint);
+double __ovld __cnfn convert_double_rtz(ulong);
+double __ovld __cnfn convert_double_rtz(ushort);
+double2 __ovld __cnfn convert_double2(char2);
+double2 __ovld __cnfn convert_double2(double2);
+double2 __ovld __cnfn convert_double2(float2);
+double2 __ovld __cnfn convert_double2(int2);
+double2 __ovld __cnfn convert_double2(long2);
+double2 __ovld __cnfn convert_double2(short2);
+double2 __ovld __cnfn convert_double2(uchar2);
+double2 __ovld __cnfn convert_double2(uint2);
+double2 __ovld __cnfn convert_double2(ulong2);
+double2 __ovld __cnfn convert_double2(ushort2);
+double2 __ovld __cnfn convert_double2_rte(char2);
+double2 __ovld __cnfn convert_double2_rte(double2);
+double2 __ovld __cnfn convert_double2_rte(float2);
+double2 __ovld __cnfn convert_double2_rte(int2);
+double2 __ovld __cnfn convert_double2_rte(long2);
+double2 __ovld __cnfn convert_double2_rte(short2);
+double2 __ovld __cnfn convert_double2_rte(uchar2);
+double2 __ovld __cnfn convert_double2_rte(uint2);
+double2 __ovld __cnfn convert_double2_rte(ulong2);
+double2 __ovld __cnfn convert_double2_rte(ushort2);
+double2 __ovld __cnfn convert_double2_rtn(char2);
+double2 __ovld __cnfn convert_double2_rtn(double2);
+double2 __ovld __cnfn convert_double2_rtn(float2);
+double2 __ovld __cnfn convert_double2_rtn(int2);
+double2 __ovld __cnfn convert_double2_rtn(long2);
+double2 __ovld __cnfn convert_double2_rtn(short2);
+double2 __ovld __cnfn convert_double2_rtn(uchar2);
+double2 __ovld __cnfn convert_double2_rtn(uint2);
+double2 __ovld __cnfn convert_double2_rtn(ulong2);
+double2 __ovld __cnfn convert_double2_rtn(ushort2);
+double2 __ovld __cnfn convert_double2_rtp(char2);
+double2 __ovld __cnfn convert_double2_rtp(double2);
+double2 __ovld __cnfn convert_double2_rtp(float2);
+double2 __ovld __cnfn convert_double2_rtp(int2);
+double2 __ovld __cnfn convert_double2_rtp(long2);
+double2 __ovld __cnfn convert_double2_rtp(short2);
+double2 __ovld __cnfn convert_double2_rtp(uchar2);
+double2 __ovld __cnfn convert_double2_rtp(uint2);
+double2 __ovld __cnfn convert_double2_rtp(ulong2);
+double2 __ovld __cnfn convert_double2_rtp(ushort2);
+double2 __ovld __cnfn convert_double2_rtz(char2);
+double2 __ovld __cnfn convert_double2_rtz(double2);
+double2 __ovld __cnfn convert_double2_rtz(float2);
+double2 __ovld __cnfn convert_double2_rtz(int2);
+double2 __ovld __cnfn convert_double2_rtz(long2);
+double2 __ovld __cnfn convert_double2_rtz(short2);
+double2 __ovld __cnfn convert_double2_rtz(uchar2);
+double2 __ovld __cnfn convert_double2_rtz(uint2);
+double2 __ovld __cnfn convert_double2_rtz(ulong2);
+double2 __ovld __cnfn convert_double2_rtz(ushort2);
+double3 __ovld __cnfn convert_double3(char3);
+double3 __ovld __cnfn convert_double3(double3);
+double3 __ovld __cnfn convert_double3(float3);
+double3 __ovld __cnfn convert_double3(int3);
+double3 __ovld __cnfn convert_double3(long3);
+double3 __ovld __cnfn convert_double3(short3);
+double3 __ovld __cnfn convert_double3(uchar3);
+double3 __ovld __cnfn convert_double3(uint3);
+double3 __ovld __cnfn convert_double3(ulong3);
+double3 __ovld __cnfn convert_double3(ushort3);
+double3 __ovld __cnfn convert_double3_rte(char3);
+double3 __ovld __cnfn convert_double3_rte(double3);
+double3 __ovld __cnfn convert_double3_rte(float3);
+double3 __ovld __cnfn convert_double3_rte(int3);
+double3 __ovld __cnfn convert_double3_rte(long3);
+double3 __ovld __cnfn convert_double3_rte(short3);
+double3 __ovld __cnfn convert_double3_rte(uchar3);
+double3 __ovld __cnfn convert_double3_rte(uint3);
+double3 __ovld __cnfn convert_double3_rte(ulong3);
+double3 __ovld __cnfn convert_double3_rte(ushort3);
+double3 __ovld __cnfn convert_double3_rtn(char3);
+double3 __ovld __cnfn convert_double3_rtn(double3);
+double3 __ovld __cnfn convert_double3_rtn(float3);
+double3 __ovld __cnfn convert_double3_rtn(int3);
+double3 __ovld __cnfn convert_double3_rtn(long3);
+double3 __ovld __cnfn convert_double3_rtn(short3);
+double3 __ovld __cnfn convert_double3_rtn(uchar3);
+double3 __ovld __cnfn convert_double3_rtn(uint3);
+double3 __ovld __cnfn convert_double3_rtn(ulong3);
+double3 __ovld __cnfn convert_double3_rtn(ushort3);
+double3 __ovld __cnfn convert_double3_rtp(char3);
+double3 __ovld __cnfn convert_double3_rtp(double3);
+double3 __ovld __cnfn convert_double3_rtp(float3);
+double3 __ovld __cnfn convert_double3_rtp(int3);
+double3 __ovld __cnfn convert_double3_rtp(long3);
+double3 __ovld __cnfn convert_double3_rtp(short3);
+double3 __ovld __cnfn convert_double3_rtp(uchar3);
+double3 __ovld __cnfn convert_double3_rtp(uint3);
+double3 __ovld __cnfn convert_double3_rtp(ulong3);
+double3 __ovld __cnfn convert_double3_rtp(ushort3);
+double3 __ovld __cnfn convert_double3_rtz(char3);
+double3 __ovld __cnfn convert_double3_rtz(double3);
+double3 __ovld __cnfn convert_double3_rtz(float3);
+double3 __ovld __cnfn convert_double3_rtz(int3);
+double3 __ovld __cnfn convert_double3_rtz(long3);
+double3 __ovld __cnfn convert_double3_rtz(short3);
+double3 __ovld __cnfn convert_double3_rtz(uchar3);
+double3 __ovld __cnfn convert_double3_rtz(uint3);
+double3 __ovld __cnfn convert_double3_rtz(ulong3);
+double3 __ovld __cnfn convert_double3_rtz(ushort3);
+double4 __ovld __cnfn convert_double4(char4);
+double4 __ovld __cnfn convert_double4(double4);
+double4 __ovld __cnfn convert_double4(float4);
+double4 __ovld __cnfn convert_double4(int4);
+double4 __ovld __cnfn convert_double4(long4);
+double4 __ovld __cnfn convert_double4(short4);
+double4 __ovld __cnfn convert_double4(uchar4);
+double4 __ovld __cnfn convert_double4(uint4);
+double4 __ovld __cnfn convert_double4(ulong4);
+double4 __ovld __cnfn convert_double4(ushort4);
+double4 __ovld __cnfn convert_double4_rte(char4);
+double4 __ovld __cnfn convert_double4_rte(double4);
+double4 __ovld __cnfn convert_double4_rte(float4);
+double4 __ovld __cnfn convert_double4_rte(int4);
+double4 __ovld __cnfn convert_double4_rte(long4);
+double4 __ovld __cnfn convert_double4_rte(short4);
+double4 __ovld __cnfn convert_double4_rte(uchar4);
+double4 __ovld __cnfn convert_double4_rte(uint4);
+double4 __ovld __cnfn convert_double4_rte(ulong4);
+double4 __ovld __cnfn convert_double4_rte(ushort4);
+double4 __ovld __cnfn convert_double4_rtn(char4);
+double4 __ovld __cnfn convert_double4_rtn(double4);
+double4 __ovld __cnfn convert_double4_rtn(float4);
+double4 __ovld __cnfn convert_double4_rtn(int4);
+double4 __ovld __cnfn convert_double4_rtn(long4);
+double4 __ovld __cnfn convert_double4_rtn(short4);
+double4 __ovld __cnfn convert_double4_rtn(uchar4);
+double4 __ovld __cnfn convert_double4_rtn(uint4);
+double4 __ovld __cnfn convert_double4_rtn(ulong4);
+double4 __ovld __cnfn convert_double4_rtn(ushort4);
+double4 __ovld __cnfn convert_double4_rtp(char4);
+double4 __ovld __cnfn convert_double4_rtp(double4);
+double4 __ovld __cnfn convert_double4_rtp(float4);
+double4 __ovld __cnfn convert_double4_rtp(int4);
+double4 __ovld __cnfn convert_double4_rtp(long4);
+double4 __ovld __cnfn convert_double4_rtp(short4);
+double4 __ovld __cnfn convert_double4_rtp(uchar4);
+double4 __ovld __cnfn convert_double4_rtp(uint4);
+double4 __ovld __cnfn convert_double4_rtp(ulong4);
+double4 __ovld __cnfn convert_double4_rtp(ushort4);
+double4 __ovld __cnfn convert_double4_rtz(char4);
+double4 __ovld __cnfn convert_double4_rtz(double4);
+double4 __ovld __cnfn convert_double4_rtz(float4);
+double4 __ovld __cnfn convert_double4_rtz(int4);
+double4 __ovld __cnfn convert_double4_rtz(long4);
+double4 __ovld __cnfn convert_double4_rtz(short4);
+double4 __ovld __cnfn convert_double4_rtz(uchar4);
+double4 __ovld __cnfn convert_double4_rtz(uint4);
+double4 __ovld __cnfn convert_double4_rtz(ulong4);
+double4 __ovld __cnfn convert_double4_rtz(ushort4);
+double8 __ovld __cnfn convert_double8(char8);
+double8 __ovld __cnfn convert_double8(double8);
+double8 __ovld __cnfn convert_double8(float8);
+double8 __ovld __cnfn convert_double8(int8);
+double8 __ovld __cnfn convert_double8(long8);
+double8 __ovld __cnfn convert_double8(short8);
+double8 __ovld __cnfn convert_double8(uchar8);
+double8 __ovld __cnfn convert_double8(uint8);
+double8 __ovld __cnfn convert_double8(ulong8);
+double8 __ovld __cnfn convert_double8(ushort8);
+double8 __ovld __cnfn convert_double8_rte(char8);
+double8 __ovld __cnfn convert_double8_rte(double8);
+double8 __ovld __cnfn convert_double8_rte(float8);
+double8 __ovld __cnfn convert_double8_rte(int8);
+double8 __ovld __cnfn convert_double8_rte(long8);
+double8 __ovld __cnfn convert_double8_rte(short8);
+double8 __ovld __cnfn convert_double8_rte(uchar8);
+double8 __ovld __cnfn convert_double8_rte(uint8);
+double8 __ovld __cnfn convert_double8_rte(ulong8);
+double8 __ovld __cnfn convert_double8_rte(ushort8);
+double8 __ovld __cnfn convert_double8_rtn(char8);
+double8 __ovld __cnfn convert_double8_rtn(double8);
+double8 __ovld __cnfn convert_double8_rtn(float8);
+double8 __ovld __cnfn convert_double8_rtn(int8);
+double8 __ovld __cnfn convert_double8_rtn(long8);
+double8 __ovld __cnfn convert_double8_rtn(short8);
+double8 __ovld __cnfn convert_double8_rtn(uchar8);
+double8 __ovld __cnfn convert_double8_rtn(uint8);
+double8 __ovld __cnfn convert_double8_rtn(ulong8);
+double8 __ovld __cnfn convert_double8_rtn(ushort8);
+double8 __ovld __cnfn convert_double8_rtp(char8);
+double8 __ovld __cnfn convert_double8_rtp(double8);
+double8 __ovld __cnfn convert_double8_rtp(float8);
+double8 __ovld __cnfn convert_double8_rtp(int8);
+double8 __ovld __cnfn convert_double8_rtp(long8);
+double8 __ovld __cnfn convert_double8_rtp(short8);
+double8 __ovld __cnfn convert_double8_rtp(uchar8);
+double8 __ovld __cnfn convert_double8_rtp(uint8);
+double8 __ovld __cnfn convert_double8_rtp(ulong8);
+double8 __ovld __cnfn convert_double8_rtp(ushort8);
+double8 __ovld __cnfn convert_double8_rtz(char8);
+double8 __ovld __cnfn convert_double8_rtz(double8);
+double8 __ovld __cnfn convert_double8_rtz(float8);
+double8 __ovld __cnfn convert_double8_rtz(int8);
+double8 __ovld __cnfn convert_double8_rtz(long8);
+double8 __ovld __cnfn convert_double8_rtz(short8);
+double8 __ovld __cnfn convert_double8_rtz(uchar8);
+double8 __ovld __cnfn convert_double8_rtz(uint8);
+double8 __ovld __cnfn convert_double8_rtz(ulong8);
+double8 __ovld __cnfn convert_double8_rtz(ushort8);
+double16 __ovld __cnfn convert_double16(char16);
+double16 __ovld __cnfn convert_double16(double16);
+double16 __ovld __cnfn convert_double16(float16);
+double16 __ovld __cnfn convert_double16(int16);
+double16 __ovld __cnfn convert_double16(long16);
+double16 __ovld __cnfn convert_double16(short16);
+double16 __ovld __cnfn convert_double16(uchar16);
+double16 __ovld __cnfn convert_double16(uint16);
+double16 __ovld __cnfn convert_double16(ulong16);
+double16 __ovld __cnfn convert_double16(ushort16);
+double16 __ovld __cnfn convert_double16_rte(char16);
+double16 __ovld __cnfn convert_double16_rte(double16);
+double16 __ovld __cnfn convert_double16_rte(float16);
+double16 __ovld __cnfn convert_double16_rte(int16);
+double16 __ovld __cnfn convert_double16_rte(long16);
+double16 __ovld __cnfn convert_double16_rte(short16);
+double16 __ovld __cnfn convert_double16_rte(uchar16);
+double16 __ovld __cnfn convert_double16_rte(uint16);
+double16 __ovld __cnfn convert_double16_rte(ulong16);
+double16 __ovld __cnfn convert_double16_rte(ushort16);
+double16 __ovld __cnfn convert_double16_rtn(char16);
+double16 __ovld __cnfn convert_double16_rtn(double16);
+double16 __ovld __cnfn convert_double16_rtn(float16);
+double16 __ovld __cnfn convert_double16_rtn(int16);
+double16 __ovld __cnfn convert_double16_rtn(long16);
+double16 __ovld __cnfn convert_double16_rtn(short16);
+double16 __ovld __cnfn convert_double16_rtn(uchar16);
+double16 __ovld __cnfn convert_double16_rtn(uint16);
+double16 __ovld __cnfn convert_double16_rtn(ulong16);
+double16 __ovld __cnfn convert_double16_rtn(ushort16);
+double16 __ovld __cnfn convert_double16_rtp(char16);
+double16 __ovld __cnfn convert_double16_rtp(double16);
+double16 __ovld __cnfn convert_double16_rtp(float16);
+double16 __ovld __cnfn convert_double16_rtp(int16);
+double16 __ovld __cnfn convert_double16_rtp(long16);
+double16 __ovld __cnfn convert_double16_rtp(short16);
+double16 __ovld __cnfn convert_double16_rtp(uchar16);
+double16 __ovld __cnfn convert_double16_rtp(uint16);
+double16 __ovld __cnfn convert_double16_rtp(ulong16);
+double16 __ovld __cnfn convert_double16_rtp(ushort16);
+double16 __ovld __cnfn convert_double16_rtz(char16);
+double16 __ovld __cnfn convert_double16_rtz(double16);
+double16 __ovld __cnfn convert_double16_rtz(float16);
+double16 __ovld __cnfn convert_double16_rtz(int16);
+double16 __ovld __cnfn convert_double16_rtz(long16);
+double16 __ovld __cnfn convert_double16_rtz(short16);
+double16 __ovld __cnfn convert_double16_rtz(uchar16);
+double16 __ovld __cnfn convert_double16_rtz(uint16);
+double16 __ovld __cnfn convert_double16_rtz(ulong16);
+double16 __ovld __cnfn convert_double16_rtz(ushort16);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+// Convert half types to non-double types.
+uchar __ovld __cnfn convert_uchar(half);
+uchar __ovld __cnfn convert_uchar_rte(half);
+uchar __ovld __cnfn convert_uchar_rtp(half);
+uchar __ovld __cnfn convert_uchar_rtn(half);
+uchar __ovld __cnfn convert_uchar_rtz(half);
+uchar __ovld __cnfn convert_uchar_sat(half);
+uchar __ovld __cnfn convert_uchar_sat_rte(half);
+uchar __ovld __cnfn convert_uchar_sat_rtp(half);
+uchar __ovld __cnfn convert_uchar_sat_rtn(half);
+uchar __ovld __cnfn convert_uchar_sat_rtz(half);
+uchar2 __ovld __cnfn convert_uchar2(half2);
+uchar2 __ovld __cnfn convert_uchar2_rte(half2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(half2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(half2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(half2);
+uchar3 __ovld __cnfn convert_uchar3(half3);
+uchar3 __ovld __cnfn convert_uchar3_rte(half3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(half3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(half3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(half3);
+uchar4 __ovld __cnfn convert_uchar4(half4);
+uchar4 __ovld __cnfn convert_uchar4_rte(half4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(half4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(half4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(half4);
+uchar8 __ovld __cnfn convert_uchar8(half8);
+uchar8 __ovld __cnfn convert_uchar8_rte(half8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(half8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(half8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(half8);
+uchar16 __ovld __cnfn convert_uchar16(half16);
+uchar16 __ovld __cnfn convert_uchar16_rte(half16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(half16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(half16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(half16);
+ushort __ovld __cnfn convert_ushort(half);
+ushort __ovld __cnfn convert_ushort_rte(half);
+ushort __ovld __cnfn convert_ushort_rtp(half);
+ushort __ovld __cnfn convert_ushort_rtn(half);
+ushort __ovld __cnfn convert_ushort_rtz(half);
+ushort __ovld __cnfn convert_ushort_sat(half);
+ushort __ovld __cnfn convert_ushort_sat_rte(half);
+ushort __ovld __cnfn convert_ushort_sat_rtp(half);
+ushort __ovld __cnfn convert_ushort_sat_rtn(half);
+ushort __ovld __cnfn convert_ushort_sat_rtz(half);
+ushort2 __ovld __cnfn convert_ushort2(half2);
+ushort2 __ovld __cnfn convert_ushort2_rte(half2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(half2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(half2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(half2);
+ushort3 __ovld __cnfn convert_ushort3(half3);
+ushort3 __ovld __cnfn convert_ushort3_rte(half3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(half3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(half3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(half3);
+ushort4 __ovld __cnfn convert_ushort4(half4);
+ushort4 __ovld __cnfn convert_ushort4_rte(half4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(half4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(half4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(half4);
+ushort8 __ovld __cnfn convert_ushort8(half8);
+ushort8 __ovld __cnfn convert_ushort8_rte(half8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(half8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(half8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(half8);
+ushort16 __ovld __cnfn convert_ushort16(half16);
+ushort16 __ovld __cnfn convert_ushort16_rte(half16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(half16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(half16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(half16);
+uint __ovld __cnfn convert_uint(half);
+uint __ovld __cnfn convert_uint_rte(half);
+uint __ovld __cnfn convert_uint_rtp(half);
+uint __ovld __cnfn convert_uint_rtn(half);
+uint __ovld __cnfn convert_uint_rtz(half);
+uint __ovld __cnfn convert_uint_sat(half);
+uint __ovld __cnfn convert_uint_sat_rte(half);
+uint __ovld __cnfn convert_uint_sat_rtp(half);
+uint __ovld __cnfn convert_uint_sat_rtn(half);
+uint __ovld __cnfn convert_uint_sat_rtz(half);
+uint2 __ovld __cnfn convert_uint2(half2);
+uint2 __ovld __cnfn convert_uint2_rte(half2);
+uint2 __ovld __cnfn convert_uint2_rtp(half2);
+uint2 __ovld __cnfn convert_uint2_rtn(half2);
+uint2 __ovld __cnfn convert_uint2_rtz(half2);
+uint2 __ovld __cnfn convert_uint2_sat(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(half2);
+uint3 __ovld __cnfn convert_uint3(half3);
+uint3 __ovld __cnfn convert_uint3_rte(half3);
+uint3 __ovld __cnfn convert_uint3_rtp(half3);
+uint3 __ovld __cnfn convert_uint3_rtn(half3);
+uint3 __ovld __cnfn convert_uint3_rtz(half3);
+uint3 __ovld __cnfn convert_uint3_sat(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(half3);
+uint4 __ovld __cnfn convert_uint4(half4);
+uint4 __ovld __cnfn convert_uint4_rte(half4);
+uint4 __ovld __cnfn convert_uint4_rtp(half4);
+uint4 __ovld __cnfn convert_uint4_rtn(half4);
+uint4 __ovld __cnfn convert_uint4_rtz(half4);
+uint4 __ovld __cnfn convert_uint4_sat(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(half4);
+uint8 __ovld __cnfn convert_uint8(half8);
+uint8 __ovld __cnfn convert_uint8_rte(half8);
+uint8 __ovld __cnfn convert_uint8_rtp(half8);
+uint8 __ovld __cnfn convert_uint8_rtn(half8);
+uint8 __ovld __cnfn convert_uint8_rtz(half8);
+uint8 __ovld __cnfn convert_uint8_sat(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(half8);
+uint16 __ovld __cnfn convert_uint16(half16);
+uint16 __ovld __cnfn convert_uint16_rte(half16);
+uint16 __ovld __cnfn convert_uint16_rtp(half16);
+uint16 __ovld __cnfn convert_uint16_rtn(half16);
+uint16 __ovld __cnfn convert_uint16_rtz(half16);
+uint16 __ovld __cnfn convert_uint16_sat(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(half16);
+ulong __ovld __cnfn convert_ulong(half);
+ulong __ovld __cnfn convert_ulong_rte(half);
+ulong __ovld __cnfn convert_ulong_rtp(half);
+ulong __ovld __cnfn convert_ulong_rtn(half);
+ulong __ovld __cnfn convert_ulong_rtz(half);
+ulong __ovld __cnfn convert_ulong_sat(half);
+ulong __ovld __cnfn convert_ulong_sat_rte(half);
+ulong __ovld __cnfn convert_ulong_sat_rtp(half);
+ulong __ovld __cnfn convert_ulong_sat_rtn(half);
+ulong __ovld __cnfn convert_ulong_sat_rtz(half);
+ulong2 __ovld __cnfn convert_ulong2(half2);
+ulong2 __ovld __cnfn convert_ulong2_rte(half2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(half2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(half2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(half2);
+ulong3 __ovld __cnfn convert_ulong3(half3);
+ulong3 __ovld __cnfn convert_ulong3_rte(half3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(half3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(half3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(half3);
+ulong4 __ovld __cnfn convert_ulong4(half4);
+ulong4 __ovld __cnfn convert_ulong4_rte(half4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(half4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(half4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(half4);
+ulong8 __ovld __cnfn convert_ulong8(half8);
+ulong8 __ovld __cnfn convert_ulong8_rte(half8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(half8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(half8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(half8);
+ulong16 __ovld __cnfn convert_ulong16(half16);
+ulong16 __ovld __cnfn convert_ulong16_rte(half16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(half16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(half16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(half16);
+char __ovld __cnfn convert_char(half);
+char __ovld __cnfn convert_char_rte(half);
+char __ovld __cnfn convert_char_rtp(half);
+char __ovld __cnfn convert_char_rtn(half);
+char __ovld __cnfn convert_char_rtz(half);
+char __ovld __cnfn convert_char_sat(half);
+char __ovld __cnfn convert_char_sat_rte(half);
+char __ovld __cnfn convert_char_sat_rtp(half);
+char __ovld __cnfn convert_char_sat_rtn(half);
+char __ovld __cnfn convert_char_sat_rtz(half);
+char2 __ovld __cnfn convert_char2(half2);
+char2 __ovld __cnfn convert_char2_rte(half2);
+char2 __ovld __cnfn convert_char2_rtp(half2);
+char2 __ovld __cnfn convert_char2_rtn(half2);
+char2 __ovld __cnfn convert_char2_rtz(half2);
+char2 __ovld __cnfn convert_char2_sat(half2);
+char2 __ovld __cnfn convert_char2_sat_rte(half2);
+char2 __ovld __cnfn convert_char2_sat_rtp(half2);
+char2 __ovld __cnfn convert_char2_sat_rtn(half2);
+char2 __ovld __cnfn convert_char2_sat_rtz(half2);
+char3 __ovld __cnfn convert_char3(half3);
+char3 __ovld __cnfn convert_char3_rte(half3);
+char3 __ovld __cnfn convert_char3_rtp(half3);
+char3 __ovld __cnfn convert_char3_rtn(half3);
+char3 __ovld __cnfn convert_char3_rtz(half3);
+char3 __ovld __cnfn convert_char3_sat(half3);
+char3 __ovld __cnfn convert_char3_sat_rte(half3);
+char3 __ovld __cnfn convert_char3_sat_rtp(half3);
+char3 __ovld __cnfn convert_char3_sat_rtn(half3);
+char3 __ovld __cnfn convert_char3_sat_rtz(half3);
+char4 __ovld __cnfn convert_char4(half4);
+char4 __ovld __cnfn convert_char4_rte(half4);
+char4 __ovld __cnfn convert_char4_rtp(half4);
+char4 __ovld __cnfn convert_char4_rtn(half4);
+char4 __ovld __cnfn convert_char4_rtz(half4);
+char4 __ovld __cnfn convert_char4_sat(half4);
+char4 __ovld __cnfn convert_char4_sat_rte(half4);
+char4 __ovld __cnfn convert_char4_sat_rtp(half4);
+char4 __ovld __cnfn convert_char4_sat_rtn(half4);
+char4 __ovld __cnfn convert_char4_sat_rtz(half4);
+char8 __ovld __cnfn convert_char8(half8);
+char8 __ovld __cnfn convert_char8_rte(half8);
+char8 __ovld __cnfn convert_char8_rtp(half8);
+char8 __ovld __cnfn convert_char8_rtn(half8);
+char8 __ovld __cnfn convert_char8_rtz(half8);
+char8 __ovld __cnfn convert_char8_sat(half8);
+char8 __ovld __cnfn convert_char8_sat_rte(half8);
+char8 __ovld __cnfn convert_char8_sat_rtp(half8);
+char8 __ovld __cnfn convert_char8_sat_rtn(half8);
+char8 __ovld __cnfn convert_char8_sat_rtz(half8);
+char16 __ovld __cnfn convert_char16(half16);
+char16 __ovld __cnfn convert_char16_rte(half16);
+char16 __ovld __cnfn convert_char16_rtp(half16);
+char16 __ovld __cnfn convert_char16_rtn(half16);
+char16 __ovld __cnfn convert_char16_rtz(half16);
+char16 __ovld __cnfn convert_char16_sat(half16);
+char16 __ovld __cnfn convert_char16_sat_rte(half16);
+char16 __ovld __cnfn convert_char16_sat_rtp(half16);
+char16 __ovld __cnfn convert_char16_sat_rtn(half16);
+char16 __ovld __cnfn convert_char16_sat_rtz(half16);
+short __ovld __cnfn convert_short(half);
+short __ovld __cnfn convert_short_rte(half);
+short __ovld __cnfn convert_short_rtp(half);
+short __ovld __cnfn convert_short_rtn(half);
+short __ovld __cnfn convert_short_rtz(half);
+short __ovld __cnfn convert_short_sat(half);
+short __ovld __cnfn convert_short_sat_rte(half);
+short __ovld __cnfn convert_short_sat_rtp(half);
+short __ovld __cnfn convert_short_sat_rtn(half);
+short __ovld __cnfn convert_short_sat_rtz(half);
+short2 __ovld __cnfn convert_short2(half2);
+short2 __ovld __cnfn convert_short2_rte(half2);
+short2 __ovld __cnfn convert_short2_rtp(half2);
+short2 __ovld __cnfn convert_short2_rtn(half2);
+short2 __ovld __cnfn convert_short2_rtz(half2);
+short2 __ovld __cnfn convert_short2_sat(half2);
+short2 __ovld __cnfn convert_short2_sat_rte(half2);
+short2 __ovld __cnfn convert_short2_sat_rtp(half2);
+short2 __ovld __cnfn convert_short2_sat_rtn(half2);
+short2 __ovld __cnfn convert_short2_sat_rtz(half2);
+short3 __ovld __cnfn convert_short3(half3);
+short3 __ovld __cnfn convert_short3_rte(half3);
+short3 __ovld __cnfn convert_short3_rtp(half3);
+short3 __ovld __cnfn convert_short3_rtn(half3);
+short3 __ovld __cnfn convert_short3_rtz(half3);
+short3 __ovld __cnfn convert_short3_sat(half3);
+short3 __ovld __cnfn convert_short3_sat_rte(half3);
+short3 __ovld __cnfn convert_short3_sat_rtp(half3);
+short3 __ovld __cnfn convert_short3_sat_rtn(half3);
+short3 __ovld __cnfn convert_short3_sat_rtz(half3);
+short4 __ovld __cnfn convert_short4(half4);
+short4 __ovld __cnfn convert_short4_rte(half4);
+short4 __ovld __cnfn convert_short4_rtp(half4);
+short4 __ovld __cnfn convert_short4_rtn(half4);
+short4 __ovld __cnfn convert_short4_rtz(half4);
+short4 __ovld __cnfn convert_short4_sat(half4);
+short4 __ovld __cnfn convert_short4_sat_rte(half4);
+short4 __ovld __cnfn convert_short4_sat_rtp(half4);
+short4 __ovld __cnfn convert_short4_sat_rtn(half4);
+short4 __ovld __cnfn convert_short4_sat_rtz(half4);
+short8 __ovld __cnfn convert_short8(half8);
+short8 __ovld __cnfn convert_short8_rte(half8);
+short8 __ovld __cnfn convert_short8_rtp(half8);
+short8 __ovld __cnfn convert_short8_rtn(half8);
+short8 __ovld __cnfn convert_short8_rtz(half8);
+short8 __ovld __cnfn convert_short8_sat(half8);
+short8 __ovld __cnfn convert_short8_sat_rte(half8);
+short8 __ovld __cnfn convert_short8_sat_rtp(half8);
+short8 __ovld __cnfn convert_short8_sat_rtn(half8);
+short8 __ovld __cnfn convert_short8_sat_rtz(half8);
+short16 __ovld __cnfn convert_short16(half16);
+short16 __ovld __cnfn convert_short16_rte(half16);
+short16 __ovld __cnfn convert_short16_rtp(half16);
+short16 __ovld __cnfn convert_short16_rtn(half16);
+short16 __ovld __cnfn convert_short16_rtz(half16);
+short16 __ovld __cnfn convert_short16_sat(half16);
+short16 __ovld __cnfn convert_short16_sat_rte(half16);
+short16 __ovld __cnfn convert_short16_sat_rtp(half16);
+short16 __ovld __cnfn convert_short16_sat_rtn(half16);
+short16 __ovld __cnfn convert_short16_sat_rtz(half16);
+int __ovld __cnfn convert_int(half);
+int __ovld __cnfn convert_int_rte(half);
+int __ovld __cnfn convert_int_rtp(half);
+int __ovld __cnfn convert_int_rtn(half);
+int __ovld __cnfn convert_int_rtz(half);
+int __ovld __cnfn convert_int_sat(half);
+int __ovld __cnfn convert_int_sat_rte(half);
+int __ovld __cnfn convert_int_sat_rtp(half);
+int __ovld __cnfn convert_int_sat_rtn(half);
+int __ovld __cnfn convert_int_sat_rtz(half);
+int2 __ovld __cnfn convert_int2(half2);
+int2 __ovld __cnfn convert_int2_rte(half2);
+int2 __ovld __cnfn convert_int2_rtp(half2);
+int2 __ovld __cnfn convert_int2_rtn(half2);
+int2 __ovld __cnfn convert_int2_rtz(half2);
+int2 __ovld __cnfn convert_int2_sat(half2);
+int2 __ovld __cnfn convert_int2_sat_rte(half2);
+int2 __ovld __cnfn convert_int2_sat_rtp(half2);
+int2 __ovld __cnfn convert_int2_sat_rtn(half2);
+int2 __ovld __cnfn convert_int2_sat_rtz(half2);
+int3 __ovld __cnfn convert_int3(half3);
+int3 __ovld __cnfn convert_int3_rte(half3);
+int3 __ovld __cnfn convert_int3_rtp(half3);
+int3 __ovld __cnfn convert_int3_rtn(half3);
+int3 __ovld __cnfn convert_int3_rtz(half3);
+int3 __ovld __cnfn convert_int3_sat(half3);
+int3 __ovld __cnfn convert_int3_sat_rte(half3);
+int3 __ovld __cnfn convert_int3_sat_rtp(half3);
+int3 __ovld __cnfn convert_int3_sat_rtn(half3);
+int3 __ovld __cnfn convert_int3_sat_rtz(half3);
+int4 __ovld __cnfn convert_int4(half4);
+int4 __ovld __cnfn convert_int4_rte(half4);
+int4 __ovld __cnfn convert_int4_rtp(half4);
+int4 __ovld __cnfn convert_int4_rtn(half4);
+int4 __ovld __cnfn convert_int4_rtz(half4);
+int4 __ovld __cnfn convert_int4_sat(half4);
+int4 __ovld __cnfn convert_int4_sat_rte(half4);
+int4 __ovld __cnfn convert_int4_sat_rtp(half4);
+int4 __ovld __cnfn convert_int4_sat_rtn(half4);
+int4 __ovld __cnfn convert_int4_sat_rtz(half4);
+int8 __ovld __cnfn convert_int8(half8);
+int8 __ovld __cnfn convert_int8_rte(half8);
+int8 __ovld __cnfn convert_int8_rtp(half8);
+int8 __ovld __cnfn convert_int8_rtn(half8);
+int8 __ovld __cnfn convert_int8_rtz(half8);
+int8 __ovld __cnfn convert_int8_sat(half8);
+int8 __ovld __cnfn convert_int8_sat_rte(half8);
+int8 __ovld __cnfn convert_int8_sat_rtp(half8);
+int8 __ovld __cnfn convert_int8_sat_rtn(half8);
+int8 __ovld __cnfn convert_int8_sat_rtz(half8);
+int16 __ovld __cnfn convert_int16(half16);
+int16 __ovld __cnfn convert_int16_rte(half16);
+int16 __ovld __cnfn convert_int16_rtp(half16);
+int16 __ovld __cnfn convert_int16_rtn(half16);
+int16 __ovld __cnfn convert_int16_rtz(half16);
+int16 __ovld __cnfn convert_int16_sat(half16);
+int16 __ovld __cnfn convert_int16_sat_rte(half16);
+int16 __ovld __cnfn convert_int16_sat_rtp(half16);
+int16 __ovld __cnfn convert_int16_sat_rtn(half16);
+int16 __ovld __cnfn convert_int16_sat_rtz(half16);
+long __ovld __cnfn convert_long(half);
+long __ovld __cnfn convert_long_rte(half);
+long __ovld __cnfn convert_long_rtp(half);
+long __ovld __cnfn convert_long_rtn(half);
+long __ovld __cnfn convert_long_rtz(half);
+long __ovld __cnfn convert_long_sat(half);
+long __ovld __cnfn convert_long_sat_rte(half);
+long __ovld __cnfn convert_long_sat_rtp(half);
+long __ovld __cnfn convert_long_sat_rtn(half);
+long __ovld __cnfn convert_long_sat_rtz(half);
+long2 __ovld __cnfn convert_long2(half2);
+long2 __ovld __cnfn convert_long2_rte(half2);
+long2 __ovld __cnfn convert_long2_rtp(half2);
+long2 __ovld __cnfn convert_long2_rtn(half2);
+long2 __ovld __cnfn convert_long2_rtz(half2);
+long2 __ovld __cnfn convert_long2_sat(half2);
+long2 __ovld __cnfn convert_long2_sat_rte(half2);
+long2 __ovld __cnfn convert_long2_sat_rtp(half2);
+long2 __ovld __cnfn convert_long2_sat_rtn(half2);
+long2 __ovld __cnfn convert_long2_sat_rtz(half2);
+long3 __ovld __cnfn convert_long3(half3);
+long3 __ovld __cnfn convert_long3_rte(half3);
+long3 __ovld __cnfn convert_long3_rtp(half3);
+long3 __ovld __cnfn convert_long3_rtn(half3);
+long3 __ovld __cnfn convert_long3_rtz(half3);
+long3 __ovld __cnfn convert_long3_sat(half3);
+long3 __ovld __cnfn convert_long3_sat_rte(half3);
+long3 __ovld __cnfn convert_long3_sat_rtp(half3);
+long3 __ovld __cnfn convert_long3_sat_rtn(half3);
+long3 __ovld __cnfn convert_long3_sat_rtz(half3);
+long4 __ovld __cnfn convert_long4(half4);
+long4 __ovld __cnfn convert_long4_rte(half4);
+long4 __ovld __cnfn convert_long4_rtp(half4);
+long4 __ovld __cnfn convert_long4_rtn(half4);
+long4 __ovld __cnfn convert_long4_rtz(half4);
+long4 __ovld __cnfn convert_long4_sat(half4);
+long4 __ovld __cnfn convert_long4_sat_rte(half4);
+long4 __ovld __cnfn convert_long4_sat_rtp(half4);
+long4 __ovld __cnfn convert_long4_sat_rtn(half4);
+long4 __ovld __cnfn convert_long4_sat_rtz(half4);
+long8 __ovld __cnfn convert_long8(half8);
+long8 __ovld __cnfn convert_long8_rte(half8);
+long8 __ovld __cnfn convert_long8_rtp(half8);
+long8 __ovld __cnfn convert_long8_rtn(half8);
+long8 __ovld __cnfn convert_long8_rtz(half8);
+long8 __ovld __cnfn convert_long8_sat(half8);
+long8 __ovld __cnfn convert_long8_sat_rte(half8);
+long8 __ovld __cnfn convert_long8_sat_rtp(half8);
+long8 __ovld __cnfn convert_long8_sat_rtn(half8);
+long8 __ovld __cnfn convert_long8_sat_rtz(half8);
+long16 __ovld __cnfn convert_long16(half16);
+long16 __ovld __cnfn convert_long16_rte(half16);
+long16 __ovld __cnfn convert_long16_rtp(half16);
+long16 __ovld __cnfn convert_long16_rtn(half16);
+long16 __ovld __cnfn convert_long16_rtz(half16);
+long16 __ovld __cnfn convert_long16_sat(half16);
+long16 __ovld __cnfn convert_long16_sat_rte(half16);
+long16 __ovld __cnfn convert_long16_sat_rtp(half16);
+long16 __ovld __cnfn convert_long16_sat_rtn(half16);
+long16 __ovld __cnfn convert_long16_sat_rtz(half16);
+float __ovld __cnfn convert_float(half);
+float __ovld __cnfn convert_float_rte(half);
+float __ovld __cnfn convert_float_rtp(half);
+float __ovld __cnfn convert_float_rtn(half);
+float __ovld __cnfn convert_float_rtz(half);
+float2 __ovld __cnfn convert_float2(half2);
+float2 __ovld __cnfn convert_float2_rte(half2);
+float2 __ovld __cnfn convert_float2_rtp(half2);
+float2 __ovld __cnfn convert_float2_rtn(half2);
+float2 __ovld __cnfn convert_float2_rtz(half2);
+float3 __ovld __cnfn convert_float3(half3);
+float3 __ovld __cnfn convert_float3_rte(half3);
+float3 __ovld __cnfn convert_float3_rtp(half3);
+float3 __ovld __cnfn convert_float3_rtn(half3);
+float3 __ovld __cnfn convert_float3_rtz(half3);
+float4 __ovld __cnfn convert_float4(half4);
+float4 __ovld __cnfn convert_float4_rte(half4);
+float4 __ovld __cnfn convert_float4_rtp(half4);
+float4 __ovld __cnfn convert_float4_rtn(half4);
+float4 __ovld __cnfn convert_float4_rtz(half4);
+float8 __ovld __cnfn convert_float8(half8);
+float8 __ovld __cnfn convert_float8_rte(half8);
+float8 __ovld __cnfn convert_float8_rtp(half8);
+float8 __ovld __cnfn convert_float8_rtn(half8);
+float8 __ovld __cnfn convert_float8_rtz(half8);
+float16 __ovld __cnfn convert_float16(half16);
+float16 __ovld __cnfn convert_float16_rte(half16);
+float16 __ovld __cnfn convert_float16_rtp(half16);
+float16 __ovld __cnfn convert_float16_rtn(half16);
+float16 __ovld __cnfn convert_float16_rtz(half16);
+
+// Convert non-double types to half types.
+half __ovld __cnfn convert_half(uchar);
+half __ovld __cnfn convert_half(ushort);
+half __ovld __cnfn convert_half(uint);
+half __ovld __cnfn convert_half(ulong);
+half __ovld __cnfn convert_half(char);
+half __ovld __cnfn convert_half(short);
+half __ovld __cnfn convert_half(int);
+half __ovld __cnfn convert_half(long);
+half __ovld __cnfn convert_half(float);
+half __ovld __cnfn convert_half(half);
+half __ovld __cnfn convert_half_rte(uchar);
+half __ovld __cnfn convert_half_rte(ushort);
+half __ovld __cnfn convert_half_rte(uint);
+half __ovld __cnfn convert_half_rte(ulong);
+half __ovld __cnfn convert_half_rte(char);
+half __ovld __cnfn convert_half_rte(short);
+half __ovld __cnfn convert_half_rte(int);
+half __ovld __cnfn convert_half_rte(long);
+half __ovld __cnfn convert_half_rte(float);
+half __ovld __cnfn convert_half_rte(half);
+half __ovld __cnfn convert_half_rtp(uchar);
+half __ovld __cnfn convert_half_rtp(ushort);
+half __ovld __cnfn convert_half_rtp(uint);
+half __ovld __cnfn convert_half_rtp(ulong);
+half __ovld __cnfn convert_half_rtp(char);
+half __ovld __cnfn convert_half_rtp(short);
+half __ovld __cnfn convert_half_rtp(int);
+half __ovld __cnfn convert_half_rtp(long);
+half __ovld __cnfn convert_half_rtp(float);
+half __ovld __cnfn convert_half_rtp(half);
+half __ovld __cnfn convert_half_rtn(uchar);
+half __ovld __cnfn convert_half_rtn(ushort);
+half __ovld __cnfn convert_half_rtn(uint);
+half __ovld __cnfn convert_half_rtn(ulong);
+half __ovld __cnfn convert_half_rtn(char);
+half __ovld __cnfn convert_half_rtn(short);
+half __ovld __cnfn convert_half_rtn(int);
+half __ovld __cnfn convert_half_rtn(long);
+half __ovld __cnfn convert_half_rtn(float);
+half __ovld __cnfn convert_half_rtn(half);
+half __ovld __cnfn convert_half_rtz(uchar);
+half __ovld __cnfn convert_half_rtz(ushort);
+half __ovld __cnfn convert_half_rtz(uint);
+half __ovld __cnfn convert_half_rtz(ulong);
+half __ovld __cnfn convert_half_rtz(char);
+half __ovld __cnfn convert_half_rtz(short);
+half __ovld __cnfn convert_half_rtz(int);
+half __ovld __cnfn convert_half_rtz(long);
+half __ovld __cnfn convert_half_rtz(float);
+half __ovld __cnfn convert_half_rtz(half);
+half2 __ovld __cnfn convert_half2(char2);
+half2 __ovld __cnfn convert_half2(uchar2);
+half2 __ovld __cnfn convert_half2(short2);
+half2 __ovld __cnfn convert_half2(ushort2);
+half2 __ovld __cnfn convert_half2(int2);
+half2 __ovld __cnfn convert_half2(uint2);
+half2 __ovld __cnfn convert_half2(long2);
+half2 __ovld __cnfn convert_half2(ulong2);
+half2 __ovld __cnfn convert_half2(float2);
+half2 __ovld __cnfn convert_half2(half2);
+half2 __ovld __cnfn convert_half2_rte(char2);
+half2 __ovld __cnfn convert_half2_rte(uchar2);
+half2 __ovld __cnfn convert_half2_rte(short2);
+half2 __ovld __cnfn convert_half2_rte(ushort2);
+half2 __ovld __cnfn convert_half2_rte(int2);
+half2 __ovld __cnfn convert_half2_rte(uint2);
+half2 __ovld __cnfn convert_half2_rte(long2);
+half2 __ovld __cnfn convert_half2_rte(ulong2);
+half2 __ovld __cnfn convert_half2_rte(float2);
+half2 __ovld __cnfn convert_half2_rte(half2);
+half2 __ovld __cnfn convert_half2_rtp(char2);
+half2 __ovld __cnfn convert_half2_rtp(uchar2);
+half2 __ovld __cnfn convert_half2_rtp(short2);
+half2 __ovld __cnfn convert_half2_rtp(ushort2);
+half2 __ovld __cnfn convert_half2_rtp(int2);
+half2 __ovld __cnfn convert_half2_rtp(uint2);
+half2 __ovld __cnfn convert_half2_rtp(long2);
+half2 __ovld __cnfn convert_half2_rtp(ulong2);
+half2 __ovld __cnfn convert_half2_rtp(float2);
+half2 __ovld __cnfn convert_half2_rtp(half2);
+half2 __ovld __cnfn convert_half2_rtn(char2);
+half2 __ovld __cnfn convert_half2_rtn(uchar2);
+half2 __ovld __cnfn convert_half2_rtn(short2);
+half2 __ovld __cnfn convert_half2_rtn(ushort2);
+half2 __ovld __cnfn convert_half2_rtn(int2);
+half2 __ovld __cnfn convert_half2_rtn(uint2);
+half2 __ovld __cnfn convert_half2_rtn(long2);
+half2 __ovld __cnfn convert_half2_rtn(ulong2);
+half2 __ovld __cnfn convert_half2_rtn(float2);
+half2 __ovld __cnfn convert_half2_rtn(half2);
+half2 __ovld __cnfn convert_half2_rtz(char2);
+half2 __ovld __cnfn convert_half2_rtz(uchar2);
+half2 __ovld __cnfn convert_half2_rtz(short2);
+half2 __ovld __cnfn convert_half2_rtz(ushort2);
+half2 __ovld __cnfn convert_half2_rtz(int2);
+half2 __ovld __cnfn convert_half2_rtz(uint2);
+half2 __ovld __cnfn convert_half2_rtz(long2);
+half2 __ovld __cnfn convert_half2_rtz(ulong2);
+half2 __ovld __cnfn convert_half2_rtz(float2);
+half2 __ovld __cnfn convert_half2_rtz(half2);
+half3 __ovld __cnfn convert_half3(char3);
+half3 __ovld __cnfn convert_half3(uchar3);
+half3 __ovld __cnfn convert_half3(short3);
+half3 __ovld __cnfn convert_half3(ushort3);
+half3 __ovld __cnfn convert_half3(int3);
+half3 __ovld __cnfn convert_half3(uint3);
+half3 __ovld __cnfn convert_half3(long3);
+half3 __ovld __cnfn convert_half3(ulong3);
+half3 __ovld __cnfn convert_half3(float3);
+half3 __ovld __cnfn convert_half3(half3);
+half3 __ovld __cnfn convert_half3_rte(char3);
+half3 __ovld __cnfn convert_half3_rte(uchar3);
+half3 __ovld __cnfn convert_half3_rte(short3);
+half3 __ovld __cnfn convert_half3_rte(ushort3);
+half3 __ovld __cnfn convert_half3_rte(int3);
+half3 __ovld __cnfn convert_half3_rte(uint3);
+half3 __ovld __cnfn convert_half3_rte(long3);
+half3 __ovld __cnfn convert_half3_rte(ulong3);
+half3 __ovld __cnfn convert_half3_rte(float3);
+half3 __ovld __cnfn convert_half3_rte(half3);
+half3 __ovld __cnfn convert_half3_rtp(char3);
+half3 __ovld __cnfn convert_half3_rtp(uchar3);
+half3 __ovld __cnfn convert_half3_rtp(short3);
+half3 __ovld __cnfn convert_half3_rtp(ushort3);
+half3 __ovld __cnfn convert_half3_rtp(int3);
+half3 __ovld __cnfn convert_half3_rtp(uint3);
+half3 __ovld __cnfn convert_half3_rtp(long3);
+half3 __ovld __cnfn convert_half3_rtp(ulong3);
+half3 __ovld __cnfn convert_half3_rtp(float3);
+half3 __ovld __cnfn convert_half3_rtp(half3);
+half3 __ovld __cnfn convert_half3_rtn(char3);
+half3 __ovld __cnfn convert_half3_rtn(uchar3);
+half3 __ovld __cnfn convert_half3_rtn(short3);
+half3 __ovld __cnfn convert_half3_rtn(ushort3);
+half3 __ovld __cnfn convert_half3_rtn(int3);
+half3 __ovld __cnfn convert_half3_rtn(uint3);
+half3 __ovld __cnfn convert_half3_rtn(long3);
+half3 __ovld __cnfn convert_half3_rtn(ulong3);
+half3 __ovld __cnfn convert_half3_rtn(float3);
+half3 __ovld __cnfn convert_half3_rtn(half3);
+half3 __ovld __cnfn convert_half3_rtz(char3);
+half3 __ovld __cnfn convert_half3_rtz(uchar3);
+half3 __ovld __cnfn convert_half3_rtz(short3);
+half3 __ovld __cnfn convert_half3_rtz(ushort3);
+half3 __ovld __cnfn convert_half3_rtz(int3);
+half3 __ovld __cnfn convert_half3_rtz(uint3);
+half3 __ovld __cnfn convert_half3_rtz(long3);
+half3 __ovld __cnfn convert_half3_rtz(ulong3);
+half3 __ovld __cnfn convert_half3_rtz(float3);
+half3 __ovld __cnfn convert_half3_rtz(half3);
+half4 __ovld __cnfn convert_half4(char4);
+half4 __ovld __cnfn convert_half4(uchar4);
+half4 __ovld __cnfn convert_half4(short4);
+half4 __ovld __cnfn convert_half4(ushort4);
+half4 __ovld __cnfn convert_half4(int4);
+half4 __ovld __cnfn convert_half4(uint4);
+half4 __ovld __cnfn convert_half4(long4);
+half4 __ovld __cnfn convert_half4(ulong4);
+half4 __ovld __cnfn convert_half4(float4);
+half4 __ovld __cnfn convert_half4(half4);
+half4 __ovld __cnfn convert_half4_rte(char4);
+half4 __ovld __cnfn convert_half4_rte(uchar4);
+half4 __ovld __cnfn convert_half4_rte(short4);
+half4 __ovld __cnfn convert_half4_rte(ushort4);
+half4 __ovld __cnfn convert_half4_rte(int4);
+half4 __ovld __cnfn convert_half4_rte(uint4);
+half4 __ovld __cnfn convert_half4_rte(long4);
+half4 __ovld __cnfn convert_half4_rte(ulong4);
+half4 __ovld __cnfn convert_half4_rte(float4);
+half4 __ovld __cnfn convert_half4_rte(half4);
+half4 __ovld __cnfn convert_half4_rtp(char4);
+half4 __ovld __cnfn convert_half4_rtp(uchar4);
+half4 __ovld __cnfn convert_half4_rtp(short4);
+half4 __ovld __cnfn convert_half4_rtp(ushort4);
+half4 __ovld __cnfn convert_half4_rtp(int4);
+half4 __ovld __cnfn convert_half4_rtp(uint4);
+half4 __ovld __cnfn convert_half4_rtp(long4);
+half4 __ovld __cnfn convert_half4_rtp(ulong4);
+half4 __ovld __cnfn convert_half4_rtp(float4);
+half4 __ovld __cnfn convert_half4_rtp(half4);
+half4 __ovld __cnfn convert_half4_rtn(char4);
+half4 __ovld __cnfn convert_half4_rtn(uchar4);
+half4 __ovld __cnfn convert_half4_rtn(short4);
+half4 __ovld __cnfn convert_half4_rtn(ushort4);
+half4 __ovld __cnfn convert_half4_rtn(int4);
+half4 __ovld __cnfn convert_half4_rtn(uint4);
+half4 __ovld __cnfn convert_half4_rtn(long4);
+half4 __ovld __cnfn convert_half4_rtn(ulong4);
+half4 __ovld __cnfn convert_half4_rtn(float4);
+half4 __ovld __cnfn convert_half4_rtn(half4);
+half4 __ovld __cnfn convert_half4_rtz(char4);
+half4 __ovld __cnfn convert_half4_rtz(uchar4);
+half4 __ovld __cnfn convert_half4_rtz(short4);
+half4 __ovld __cnfn convert_half4_rtz(ushort4);
+half4 __ovld __cnfn convert_half4_rtz(int4);
+half4 __ovld __cnfn convert_half4_rtz(uint4);
+half4 __ovld __cnfn convert_half4_rtz(long4);
+half4 __ovld __cnfn convert_half4_rtz(ulong4);
+half4 __ovld __cnfn convert_half4_rtz(float4);
+half4 __ovld __cnfn convert_half4_rtz(half4);
+half8 __ovld __cnfn convert_half8(char8);
+half8 __ovld __cnfn convert_half8(uchar8);
+half8 __ovld __cnfn convert_half8(short8);
+half8 __ovld __cnfn convert_half8(ushort8);
+half8 __ovld __cnfn convert_half8(int8);
+half8 __ovld __cnfn convert_half8(uint8);
+half8 __ovld __cnfn convert_half8(long8);
+half8 __ovld __cnfn convert_half8(ulong8);
+half8 __ovld __cnfn convert_half8(float8);
+half8 __ovld __cnfn convert_half8(half8);
+half8 __ovld __cnfn convert_half8_rte(char8);
+half8 __ovld __cnfn convert_half8_rte(uchar8);
+half8 __ovld __cnfn convert_half8_rte(short8);
+half8 __ovld __cnfn convert_half8_rte(ushort8);
+half8 __ovld __cnfn convert_half8_rte(int8);
+half8 __ovld __cnfn convert_half8_rte(uint8);
+half8 __ovld __cnfn convert_half8_rte(long8);
+half8 __ovld __cnfn convert_half8_rte(ulong8);
+half8 __ovld __cnfn convert_half8_rte(float8);
+half8 __ovld __cnfn convert_half8_rte(half8);
+half8 __ovld __cnfn convert_half8_rtp(char8);
+half8 __ovld __cnfn convert_half8_rtp(uchar8);
+half8 __ovld __cnfn convert_half8_rtp(short8);
+half8 __ovld __cnfn convert_half8_rtp(ushort8);
+half8 __ovld __cnfn convert_half8_rtp(int8);
+half8 __ovld __cnfn convert_half8_rtp(uint8);
+half8 __ovld __cnfn convert_half8_rtp(long8);
+half8 __ovld __cnfn convert_half8_rtp(ulong8);
+half8 __ovld __cnfn convert_half8_rtp(float8);
+half8 __ovld __cnfn convert_half8_rtp(half8);
+half8 __ovld __cnfn convert_half8_rtn(char8);
+half8 __ovld __cnfn convert_half8_rtn(uchar8);
+half8 __ovld __cnfn convert_half8_rtn(short8);
+half8 __ovld __cnfn convert_half8_rtn(ushort8);
+half8 __ovld __cnfn convert_half8_rtn(int8);
+half8 __ovld __cnfn convert_half8_rtn(uint8);
+half8 __ovld __cnfn convert_half8_rtn(long8);
+half8 __ovld __cnfn convert_half8_rtn(ulong8);
+half8 __ovld __cnfn convert_half8_rtn(float8);
+half8 __ovld __cnfn convert_half8_rtn(half8);
+half8 __ovld __cnfn convert_half8_rtz(char8);
+half8 __ovld __cnfn convert_half8_rtz(uchar8);
+half8 __ovld __cnfn convert_half8_rtz(short8);
+half8 __ovld __cnfn convert_half8_rtz(ushort8);
+half8 __ovld __cnfn convert_half8_rtz(int8);
+half8 __ovld __cnfn convert_half8_rtz(uint8);
+half8 __ovld __cnfn convert_half8_rtz(long8);
+half8 __ovld __cnfn convert_half8_rtz(ulong8);
+half8 __ovld __cnfn convert_half8_rtz(float8);
+half8 __ovld __cnfn convert_half8_rtz(half8);
+half16 __ovld __cnfn convert_half16(char16);
+half16 __ovld __cnfn convert_half16(uchar16);
+half16 __ovld __cnfn convert_half16(short16);
+half16 __ovld __cnfn convert_half16(ushort16);
+half16 __ovld __cnfn convert_half16(int16);
+half16 __ovld __cnfn convert_half16(uint16);
+half16 __ovld __cnfn convert_half16(long16);
+half16 __ovld __cnfn convert_half16(ulong16);
+half16 __ovld __cnfn convert_half16(float16);
+half16 __ovld __cnfn convert_half16(half16);
+half16 __ovld __cnfn convert_half16_rte(char16);
+half16 __ovld __cnfn convert_half16_rte(uchar16);
+half16 __ovld __cnfn convert_half16_rte(short16);
+half16 __ovld __cnfn convert_half16_rte(ushort16);
+half16 __ovld __cnfn convert_half16_rte(int16);
+half16 __ovld __cnfn convert_half16_rte(uint16);
+half16 __ovld __cnfn convert_half16_rte(long16);
+half16 __ovld __cnfn convert_half16_rte(ulong16);
+half16 __ovld __cnfn convert_half16_rte(float16);
+half16 __ovld __cnfn convert_half16_rte(half16);
+half16 __ovld __cnfn convert_half16_rtp(char16);
+half16 __ovld __cnfn convert_half16_rtp(uchar16);
+half16 __ovld __cnfn convert_half16_rtp(short16);
+half16 __ovld __cnfn convert_half16_rtp(ushort16);
+half16 __ovld __cnfn convert_half16_rtp(int16);
+half16 __ovld __cnfn convert_half16_rtp(uint16);
+half16 __ovld __cnfn convert_half16_rtp(long16);
+half16 __ovld __cnfn convert_half16_rtp(ulong16);
+half16 __ovld __cnfn convert_half16_rtp(float16);
+half16 __ovld __cnfn convert_half16_rtp(half16);
+half16 __ovld __cnfn convert_half16_rtn(char16);
+half16 __ovld __cnfn convert_half16_rtn(uchar16);
+half16 __ovld __cnfn convert_half16_rtn(short16);
+half16 __ovld __cnfn convert_half16_rtn(ushort16);
+half16 __ovld __cnfn convert_half16_rtn(int16);
+half16 __ovld __cnfn convert_half16_rtn(uint16);
+half16 __ovld __cnfn convert_half16_rtn(long16);
+half16 __ovld __cnfn convert_half16_rtn(ulong16);
+half16 __ovld __cnfn convert_half16_rtn(float16);
+half16 __ovld __cnfn convert_half16_rtn(half16);
+half16 __ovld __cnfn convert_half16_rtz(char16);
+half16 __ovld __cnfn convert_half16_rtz(uchar16);
+half16 __ovld __cnfn convert_half16_rtz(short16);
+half16 __ovld __cnfn convert_half16_rtz(ushort16);
+half16 __ovld __cnfn convert_half16_rtz(int16);
+half16 __ovld __cnfn convert_half16_rtz(uint16);
+half16 __ovld __cnfn convert_half16_rtz(long16);
+half16 __ovld __cnfn convert_half16_rtz(ulong16);
+half16 __ovld __cnfn convert_half16_rtz(float16);
+half16 __ovld __cnfn convert_half16_rtz(half16);
+
+// Convert half types to double types.
+#ifdef cl_khr_fp64
+double __ovld __cnfn convert_double(half);
+double __ovld __cnfn convert_double_rte(half);
+double __ovld __cnfn convert_double_rtp(half);
+double __ovld __cnfn convert_double_rtn(half);
+double __ovld __cnfn convert_double_rtz(half);
+double2 __ovld __cnfn convert_double2(half2);
+double2 __ovld __cnfn convert_double2_rte(half2);
+double2 __ovld __cnfn convert_double2_rtp(half2);
+double2 __ovld __cnfn convert_double2_rtn(half2);
+double2 __ovld __cnfn convert_double2_rtz(half2);
+double3 __ovld __cnfn convert_double3(half3);
+double3 __ovld __cnfn convert_double3_rte(half3);
+double3 __ovld __cnfn convert_double3_rtp(half3);
+double3 __ovld __cnfn convert_double3_rtn(half3);
+double3 __ovld __cnfn convert_double3_rtz(half3);
+double4 __ovld __cnfn convert_double4(half4);
+double4 __ovld __cnfn convert_double4_rte(half4);
+double4 __ovld __cnfn convert_double4_rtp(half4);
+double4 __ovld __cnfn convert_double4_rtn(half4);
+double4 __ovld __cnfn convert_double4_rtz(half4);
+double8 __ovld __cnfn convert_double8(half8);
+double8 __ovld __cnfn convert_double8_rte(half8);
+double8 __ovld __cnfn convert_double8_rtp(half8);
+double8 __ovld __cnfn convert_double8_rtn(half8);
+double8 __ovld __cnfn convert_double8_rtz(half8);
+double16 __ovld __cnfn convert_double16(half16);
+double16 __ovld __cnfn convert_double16_rte(half16);
+double16 __ovld __cnfn convert_double16_rtp(half16);
+double16 __ovld __cnfn convert_double16_rtn(half16);
+double16 __ovld __cnfn convert_double16_rtz(half16);
+
+// Convert double types to half types.
+half __ovld __cnfn convert_half(double);
+half __ovld __cnfn convert_half_rte(double);
+half __ovld __cnfn convert_half_rtp(double);
+half __ovld __cnfn convert_half_rtn(double);
+half __ovld __cnfn convert_half_rtz(double);
+half2 __ovld __cnfn convert_half2(double2);
+half2 __ovld __cnfn convert_half2_rte(double2);
+half2 __ovld __cnfn convert_half2_rtp(double2);
+half2 __ovld __cnfn convert_half2_rtn(double2);
+half2 __ovld __cnfn convert_half2_rtz(double2);
+half3 __ovld __cnfn convert_half3(double3);
+half3 __ovld __cnfn convert_half3_rte(double3);
+half3 __ovld __cnfn convert_half3_rtp(double3);
+half3 __ovld __cnfn convert_half3_rtn(double3);
+half3 __ovld __cnfn convert_half3_rtz(double3);
+half4 __ovld __cnfn convert_half4(double4);
+half4 __ovld __cnfn convert_half4_rte(double4);
+half4 __ovld __cnfn convert_half4_rtp(double4);
+half4 __ovld __cnfn convert_half4_rtn(double4);
+half4 __ovld __cnfn convert_half4_rtz(double4);
+half8 __ovld __cnfn convert_half8(double8);
+half8 __ovld __cnfn convert_half8_rte(double8);
+half8 __ovld __cnfn convert_half8_rtp(double8);
+half8 __ovld __cnfn convert_half8_rtn(double8);
+half8 __ovld __cnfn convert_half8_rtz(double8);
+half16 __ovld __cnfn convert_half16(double16);
+half16 __ovld __cnfn convert_half16_rte(double16);
+half16 __ovld __cnfn convert_half16_rtp(double16);
+half16 __ovld __cnfn convert_half16_rtn(double16);
+half16 __ovld __cnfn convert_half16_rtz(double16);
+#endif //cl_khr_fp64
+
+#endif // cl_khr_fp16
+
+/**
+ * OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
+ * Reinterprets a data type as another data type of the same size
+ */
+char __ovld __cnfn as_char(char);
+char __ovld __cnfn as_char(uchar);
+
+char2 __ovld __cnfn as_char2(char2);
+char2 __ovld __cnfn as_char2(uchar2);
+char2 __ovld __cnfn as_char2(short);
+char2 __ovld __cnfn as_char2(ushort);
+
+char3 __ovld __cnfn as_char3(char3);
+char3 __ovld __cnfn as_char3(char4);
+char3 __ovld __cnfn as_char3(uchar3);
+char3 __ovld __cnfn as_char3(uchar4);
+char3 __ovld __cnfn as_char3(short2);
+char3 __ovld __cnfn as_char3(ushort2);
+char3 __ovld __cnfn as_char3(int);
+char3 __ovld __cnfn as_char3(uint);
+char3 __ovld __cnfn as_char3(float);
+
+char4 __ovld __cnfn as_char4(char3);
+char4 __ovld __cnfn as_char4(char4);
+char4 __ovld __cnfn as_char4(uchar3);
+char4 __ovld __cnfn as_char4(uchar4);
+char4 __ovld __cnfn as_char4(short2);
+char4 __ovld __cnfn as_char4(ushort2);
+char4 __ovld __cnfn as_char4(int);
+char4 __ovld __cnfn as_char4(uint);
+char4 __ovld __cnfn as_char4(float);
+
+char8 __ovld __cnfn as_char8(char8);
+char8 __ovld __cnfn as_char8(uchar8);
+char8 __ovld __cnfn as_char8(short3);
+char8 __ovld __cnfn as_char8(short4);
+char8 __ovld __cnfn as_char8(ushort3);
+char8 __ovld __cnfn as_char8(ushort4);
+char8 __ovld __cnfn as_char8(int2);
+char8 __ovld __cnfn as_char8(uint2);
+char8 __ovld __cnfn as_char8(long);
+char8 __ovld __cnfn as_char8(ulong);
+char8 __ovld __cnfn as_char8(float2);
+
+char16 __ovld __cnfn as_char16(char16);
+char16 __ovld __cnfn as_char16(uchar16);
+char16 __ovld __cnfn as_char16(short8);
+char16 __ovld __cnfn as_char16(ushort8);
+char16 __ovld __cnfn as_char16(int3);
+char16 __ovld __cnfn as_char16(int4);
+char16 __ovld __cnfn as_char16(uint3);
+char16 __ovld __cnfn as_char16(uint4);
+char16 __ovld __cnfn as_char16(long2);
+char16 __ovld __cnfn as_char16(ulong2);
+char16 __ovld __cnfn as_char16(float3);
+char16 __ovld __cnfn as_char16(float4);
+
+uchar __ovld __cnfn as_uchar(char);
+uchar __ovld __cnfn as_uchar(uchar);
+
+uchar2 __ovld __cnfn as_uchar2(char2);
+uchar2 __ovld __cnfn as_uchar2(uchar2);
+uchar2 __ovld __cnfn as_uchar2(short);
+uchar2 __ovld __cnfn as_uchar2(ushort);
+
+uchar3 __ovld __cnfn as_uchar3(char3);
+uchar3 __ovld __cnfn as_uchar3(char4);
+uchar3 __ovld __cnfn as_uchar3(uchar3);
+uchar3 __ovld __cnfn as_uchar3(uchar4);
+uchar3 __ovld __cnfn as_uchar3(short2);
+uchar3 __ovld __cnfn as_uchar3(ushort2);
+uchar3 __ovld __cnfn as_uchar3(int);
+uchar3 __ovld __cnfn as_uchar3(uint);
+uchar3 __ovld __cnfn as_uchar3(float);
+
+uchar4 __ovld __cnfn as_uchar4(char3);
+uchar4 __ovld __cnfn as_uchar4(char4);
+uchar4 __ovld __cnfn as_uchar4(uchar3);
+uchar4 __ovld __cnfn as_uchar4(uchar4);
+uchar4 __ovld __cnfn as_uchar4(short2);
+uchar4 __ovld __cnfn as_uchar4(ushort2);
+uchar4 __ovld __cnfn as_uchar4(int);
+uchar4 __ovld __cnfn as_uchar4(uint);
+uchar4 __ovld __cnfn as_uchar4(float);
+
+uchar8 __ovld __cnfn as_uchar8(char8);
+uchar8 __ovld __cnfn as_uchar8(uchar8);
+uchar8 __ovld __cnfn as_uchar8(short3);
+uchar8 __ovld __cnfn as_uchar8(short4);
+uchar8 __ovld __cnfn as_uchar8(ushort3);
+uchar8 __ovld __cnfn as_uchar8(ushort4);
+uchar8 __ovld __cnfn as_uchar8(int2);
+uchar8 __ovld __cnfn as_uchar8(uint2);
+uchar8 __ovld __cnfn as_uchar8(long);
+uchar8 __ovld __cnfn as_uchar8(ulong);
+uchar8 __ovld __cnfn as_uchar8(float2);
+
+uchar16 __ovld __cnfn as_uchar16(char16);
+uchar16 __ovld __cnfn as_uchar16(uchar16);
+uchar16 __ovld __cnfn as_uchar16(short8);
+uchar16 __ovld __cnfn as_uchar16(ushort8);
+uchar16 __ovld __cnfn as_uchar16(int3);
+uchar16 __ovld __cnfn as_uchar16(int4);
+uchar16 __ovld __cnfn as_uchar16(uint3);
+uchar16 __ovld __cnfn as_uchar16(uint4);
+uchar16 __ovld __cnfn as_uchar16(long2);
+uchar16 __ovld __cnfn as_uchar16(ulong2);
+uchar16 __ovld __cnfn as_uchar16(float3);
+uchar16 __ovld __cnfn as_uchar16(float4);
+
+short __ovld __cnfn as_short(char2);
+short __ovld __cnfn as_short(uchar2);
+short __ovld __cnfn as_short(short);
+short __ovld __cnfn as_short(ushort);
+
+short2 __ovld __cnfn as_short2(char3);
+short2 __ovld __cnfn as_short2(char4);
+short2 __ovld __cnfn as_short2(uchar3);
+short2 __ovld __cnfn as_short2(uchar4);
+short2 __ovld __cnfn as_short2(short2);
+short2 __ovld __cnfn as_short2(ushort2);
+short2 __ovld __cnfn as_short2(int);
+short2 __ovld __cnfn as_short2(uint);
+short2 __ovld __cnfn as_short2(float);
+
+short3 __ovld __cnfn as_short3(char8);
+short3 __ovld __cnfn as_short3(uchar8);
+short3 __ovld __cnfn as_short3(short3);
+short3 __ovld __cnfn as_short3(short4);
+short3 __ovld __cnfn as_short3(ushort3);
+short3 __ovld __cnfn as_short3(ushort4);
+short3 __ovld __cnfn as_short3(int2);
+short3 __ovld __cnfn as_short3(uint2);
+short3 __ovld __cnfn as_short3(long);
+short3 __ovld __cnfn as_short3(ulong);
+short3 __ovld __cnfn as_short3(float2);
+
+short4 __ovld __cnfn as_short4(char8);
+short4 __ovld __cnfn as_short4(uchar8);
+short4 __ovld __cnfn as_short4(short3);
+short4 __ovld __cnfn as_short4(short4);
+short4 __ovld __cnfn as_short4(ushort3);
+short4 __ovld __cnfn as_short4(ushort4);
+short4 __ovld __cnfn as_short4(int2);
+short4 __ovld __cnfn as_short4(uint2);
+short4 __ovld __cnfn as_short4(long);
+short4 __ovld __cnfn as_short4(ulong);
+short4 __ovld __cnfn as_short4(float2);
+
+short8 __ovld __cnfn as_short8(char16);
+short8 __ovld __cnfn as_short8(uchar16);
+short8 __ovld __cnfn as_short8(short8);
+short8 __ovld __cnfn as_short8(ushort8);
+short8 __ovld __cnfn as_short8(int3);
+short8 __ovld __cnfn as_short8(int4);
+short8 __ovld __cnfn as_short8(uint3);
+short8 __ovld __cnfn as_short8(uint4);
+short8 __ovld __cnfn as_short8(long2);
+short8 __ovld __cnfn as_short8(ulong2);
+short8 __ovld __cnfn as_short8(float3);
+short8 __ovld __cnfn as_short8(float4);
+
+short16 __ovld __cnfn as_short16(short16);
+short16 __ovld __cnfn as_short16(ushort16);
+short16 __ovld __cnfn as_short16(int8);
+short16 __ovld __cnfn as_short16(uint8);
+short16 __ovld __cnfn as_short16(long3);
+short16 __ovld __cnfn as_short16(long4);
+short16 __ovld __cnfn as_short16(ulong3);
+short16 __ovld __cnfn as_short16(ulong4);
+short16 __ovld __cnfn as_short16(float8);
+
+ushort __ovld __cnfn as_ushort(char2);
+ushort __ovld __cnfn as_ushort(uchar2);
+ushort __ovld __cnfn as_ushort(short);
+ushort __ovld __cnfn as_ushort(ushort);
+
+ushort2 __ovld __cnfn as_ushort2(char3);
+ushort2 __ovld __cnfn as_ushort2(char4);
+ushort2 __ovld __cnfn as_ushort2(uchar3);
+ushort2 __ovld __cnfn as_ushort2(uchar4);
+ushort2 __ovld __cnfn as_ushort2(short2);
+ushort2 __ovld __cnfn as_ushort2(ushort2);
+ushort2 __ovld __cnfn as_ushort2(int);
+ushort2 __ovld __cnfn as_ushort2(uint);
+ushort2 __ovld __cnfn as_ushort2(float);
+
+ushort3 __ovld __cnfn as_ushort3(char8);
+ushort3 __ovld __cnfn as_ushort3(uchar8);
+ushort3 __ovld __cnfn as_ushort3(short3);
+ushort3 __ovld __cnfn as_ushort3(short4);
+ushort3 __ovld __cnfn as_ushort3(ushort3);
+ushort3 __ovld __cnfn as_ushort3(ushort4);
+ushort3 __ovld __cnfn as_ushort3(int2);
+ushort3 __ovld __cnfn as_ushort3(uint2);
+ushort3 __ovld __cnfn as_ushort3(long);
+ushort3 __ovld __cnfn as_ushort3(ulong);
+ushort3 __ovld __cnfn as_ushort3(float2);
+
+ushort4 __ovld __cnfn as_ushort4(char8);
+ushort4 __ovld __cnfn as_ushort4(uchar8);
+ushort4 __ovld __cnfn as_ushort4(short3);
+ushort4 __ovld __cnfn as_ushort4(short4);
+ushort4 __ovld __cnfn as_ushort4(ushort3);
+ushort4 __ovld __cnfn as_ushort4(ushort4);
+ushort4 __ovld __cnfn as_ushort4(int2);
+ushort4 __ovld __cnfn as_ushort4(uint2);
+ushort4 __ovld __cnfn as_ushort4(long);
+ushort4 __ovld __cnfn as_ushort4(ulong);
+ushort4 __ovld __cnfn as_ushort4(float2);
+
+ushort8 __ovld __cnfn as_ushort8(char16);
+ushort8 __ovld __cnfn as_ushort8(uchar16);
+ushort8 __ovld __cnfn as_ushort8(short8);
+ushort8 __ovld __cnfn as_ushort8(ushort8);
+ushort8 __ovld __cnfn as_ushort8(int3);
+ushort8 __ovld __cnfn as_ushort8(int4);
+ushort8 __ovld __cnfn as_ushort8(uint3);
+ushort8 __ovld __cnfn as_ushort8(uint4);
+ushort8 __ovld __cnfn as_ushort8(long2);
+ushort8 __ovld __cnfn as_ushort8(ulong2);
+ushort8 __ovld __cnfn as_ushort8(float3);
+ushort8 __ovld __cnfn as_ushort8(float4);
+
+ushort16 __ovld __cnfn as_ushort16(short16);
+ushort16 __ovld __cnfn as_ushort16(ushort16);
+ushort16 __ovld __cnfn as_ushort16(int8);
+ushort16 __ovld __cnfn as_ushort16(uint8);
+ushort16 __ovld __cnfn as_ushort16(long3);
+ushort16 __ovld __cnfn as_ushort16(long4);
+ushort16 __ovld __cnfn as_ushort16(ulong3);
+ushort16 __ovld __cnfn as_ushort16(ulong4);
+ushort16 __ovld __cnfn as_ushort16(float8);
+
+int __ovld __cnfn as_int(char3);
+int __ovld __cnfn as_int(char4);
+int __ovld __cnfn as_int(uchar3);
+int __ovld __cnfn as_int(uchar4);
+int __ovld __cnfn as_int(short2);
+int __ovld __cnfn as_int(ushort2);
+int __ovld __cnfn as_int(int);
+int __ovld __cnfn as_int(uint);
+int __ovld __cnfn as_int(float);
+
+int2 __ovld __cnfn as_int2(char8);
+int2 __ovld __cnfn as_int2(uchar8);
+int2 __ovld __cnfn as_int2(short3);
+int2 __ovld __cnfn as_int2(short4);
+int2 __ovld __cnfn as_int2(ushort3);
+int2 __ovld __cnfn as_int2(ushort4);
+int2 __ovld __cnfn as_int2(int2);
+int2 __ovld __cnfn as_int2(uint2);
+int2 __ovld __cnfn as_int2(long);
+int2 __ovld __cnfn as_int2(ulong);
+int2 __ovld __cnfn as_int2(float2);
+
+int3 __ovld __cnfn as_int3(char16);
+int3 __ovld __cnfn as_int3(uchar16);
+int3 __ovld __cnfn as_int3(short8);
+int3 __ovld __cnfn as_int3(ushort8);
+int3 __ovld __cnfn as_int3(int3);
+int3 __ovld __cnfn as_int3(int4);
+int3 __ovld __cnfn as_int3(uint3);
+int3 __ovld __cnfn as_int3(uint4);
+int3 __ovld __cnfn as_int3(long2);
+int3 __ovld __cnfn as_int3(ulong2);
+int3 __ovld __cnfn as_int3(float3);
+int3 __ovld __cnfn as_int3(float4);
+
+int4 __ovld __cnfn as_int4(char16);
+int4 __ovld __cnfn as_int4(uchar16);
+int4 __ovld __cnfn as_int4(short8);
+int4 __ovld __cnfn as_int4(ushort8);
+int4 __ovld __cnfn as_int4(int3);
+int4 __ovld __cnfn as_int4(int4);
+int4 __ovld __cnfn as_int4(uint3);
+int4 __ovld __cnfn as_int4(uint4);
+int4 __ovld __cnfn as_int4(long2);
+int4 __ovld __cnfn as_int4(ulong2);
+int4 __ovld __cnfn as_int4(float3);
+int4 __ovld __cnfn as_int4(float4);
+
+int8 __ovld __cnfn as_int8(short16);
+int8 __ovld __cnfn as_int8(ushort16);
+int8 __ovld __cnfn as_int8(int8);
+int8 __ovld __cnfn as_int8(uint8);
+int8 __ovld __cnfn as_int8(long3);
+int8 __ovld __cnfn as_int8(long4);
+int8 __ovld __cnfn as_int8(ulong3);
+int8 __ovld __cnfn as_int8(ulong4);
+int8 __ovld __cnfn as_int8(float8);
+
+int16 __ovld __cnfn as_int16(int16);
+int16 __ovld __cnfn as_int16(uint16);
+int16 __ovld __cnfn as_int16(long8);
+int16 __ovld __cnfn as_int16(ulong8);
+int16 __ovld __cnfn as_int16(float16);
+
+uint __ovld __cnfn as_uint(char3);
+uint __ovld __cnfn as_uint(char4);
+uint __ovld __cnfn as_uint(uchar3);
+uint __ovld __cnfn as_uint(uchar4);
+uint __ovld __cnfn as_uint(short2);
+uint __ovld __cnfn as_uint(ushort2);
+uint __ovld __cnfn as_uint(int);
+uint __ovld __cnfn as_uint(uint);
+uint __ovld __cnfn as_uint(float);
+
+uint2 __ovld __cnfn as_uint2(char8);
+uint2 __ovld __cnfn as_uint2(uchar8);
+uint2 __ovld __cnfn as_uint2(short3);
+uint2 __ovld __cnfn as_uint2(short4);
+uint2 __ovld __cnfn as_uint2(ushort3);
+uint2 __ovld __cnfn as_uint2(ushort4);
+uint2 __ovld __cnfn as_uint2(int2);
+uint2 __ovld __cnfn as_uint2(uint2);
+uint2 __ovld __cnfn as_uint2(long);
+uint2 __ovld __cnfn as_uint2(ulong);
+uint2 __ovld __cnfn as_uint2(float2);
+
+uint3 __ovld __cnfn as_uint3(char16);
+uint3 __ovld __cnfn as_uint3(uchar16);
+uint3 __ovld __cnfn as_uint3(short8);
+uint3 __ovld __cnfn as_uint3(ushort8);
+uint3 __ovld __cnfn as_uint3(int3);
+uint3 __ovld __cnfn as_uint3(int4);
+uint3 __ovld __cnfn as_uint3(uint3);
+uint3 __ovld __cnfn as_uint3(uint4);
+uint3 __ovld __cnfn as_uint3(long2);
+uint3 __ovld __cnfn as_uint3(ulong2);
+uint3 __ovld __cnfn as_uint3(float3);
+uint3 __ovld __cnfn as_uint3(float4);
+
+uint4 __ovld __cnfn as_uint4(char16);
+uint4 __ovld __cnfn as_uint4(uchar16);
+uint4 __ovld __cnfn as_uint4(short8);
+uint4 __ovld __cnfn as_uint4(ushort8);
+uint4 __ovld __cnfn as_uint4(int3);
+uint4 __ovld __cnfn as_uint4(int4);
+uint4 __ovld __cnfn as_uint4(uint3);
+uint4 __ovld __cnfn as_uint4(uint4);
+uint4 __ovld __cnfn as_uint4(long2);
+uint4 __ovld __cnfn as_uint4(ulong2);
+uint4 __ovld __cnfn as_uint4(float3);
+uint4 __ovld __cnfn as_uint4(float4);
+
+uint8 __ovld __cnfn as_uint8(short16);
+uint8 __ovld __cnfn as_uint8(ushort16);
+uint8 __ovld __cnfn as_uint8(int8);
+uint8 __ovld __cnfn as_uint8(uint8);
+uint8 __ovld __cnfn as_uint8(long3);
+uint8 __ovld __cnfn as_uint8(long4);
+uint8 __ovld __cnfn as_uint8(ulong3);
+uint8 __ovld __cnfn as_uint8(ulong4);
+uint8 __ovld __cnfn as_uint8(float8);
+
+uint16 __ovld __cnfn as_uint16(int16);
+uint16 __ovld __cnfn as_uint16(uint16);
+uint16 __ovld __cnfn as_uint16(long8);
+uint16 __ovld __cnfn as_uint16(ulong8);
+uint16 __ovld __cnfn as_uint16(float16);
+
+long __ovld __cnfn as_long(char8);
+long __ovld __cnfn as_long(uchar8);
+long __ovld __cnfn as_long(short3);
+long __ovld __cnfn as_long(short4);
+long __ovld __cnfn as_long(ushort3);
+long __ovld __cnfn as_long(ushort4);
+long __ovld __cnfn as_long(int2);
+long __ovld __cnfn as_long(uint2);
+long __ovld __cnfn as_long(long);
+long __ovld __cnfn as_long(ulong);
+long __ovld __cnfn as_long(float2);
+
+long2 __ovld __cnfn as_long2(char16);
+long2 __ovld __cnfn as_long2(uchar16);
+long2 __ovld __cnfn as_long2(short8);
+long2 __ovld __cnfn as_long2(ushort8);
+long2 __ovld __cnfn as_long2(int3);
+long2 __ovld __cnfn as_long2(int4);
+long2 __ovld __cnfn as_long2(uint3);
+long2 __ovld __cnfn as_long2(uint4);
+long2 __ovld __cnfn as_long2(long2);
+long2 __ovld __cnfn as_long2(ulong2);
+long2 __ovld __cnfn as_long2(float3);
+long2 __ovld __cnfn as_long2(float4);
+
+long3 __ovld __cnfn as_long3(short16);
+long3 __ovld __cnfn as_long3(ushort16);
+long3 __ovld __cnfn as_long3(int8);
+long3 __ovld __cnfn as_long3(uint8);
+long3 __ovld __cnfn as_long3(long3);
+long3 __ovld __cnfn as_long3(long4);
+long3 __ovld __cnfn as_long3(ulong3);
+long3 __ovld __cnfn as_long3(ulong4);
+long3 __ovld __cnfn as_long3(float8);
+
+long4 __ovld __cnfn as_long4(short16);
+long4 __ovld __cnfn as_long4(ushort16);
+long4 __ovld __cnfn as_long4(int8);
+long4 __ovld __cnfn as_long4(uint8);
+long4 __ovld __cnfn as_long4(long3);
+long4 __ovld __cnfn as_long4(long4);
+long4 __ovld __cnfn as_long4(ulong3);
+long4 __ovld __cnfn as_long4(ulong4);
+long4 __ovld __cnfn as_long4(float8);
+
+long8 __ovld __cnfn as_long8(int16);
+long8 __ovld __cnfn as_long8(uint16);
+long8 __ovld __cnfn as_long8(long8);
+long8 __ovld __cnfn as_long8(ulong8);
+long8 __ovld __cnfn as_long8(float16);
+
+long16 __ovld __cnfn as_long16(long16);
+long16 __ovld __cnfn as_long16(ulong16);
+
+ulong __ovld __cnfn as_ulong(char8);
+ulong __ovld __cnfn as_ulong(uchar8);
+ulong __ovld __cnfn as_ulong(short3);
+ulong __ovld __cnfn as_ulong(short4);
+ulong __ovld __cnfn as_ulong(ushort3);
+ulong __ovld __cnfn as_ulong(ushort4);
+ulong __ovld __cnfn as_ulong(int2);
+ulong __ovld __cnfn as_ulong(uint2);
+ulong __ovld __cnfn as_ulong(long);
+ulong __ovld __cnfn as_ulong(ulong);
+ulong __ovld __cnfn as_ulong(float2);
+
+ulong2 __ovld __cnfn as_ulong2(char16);
+ulong2 __ovld __cnfn as_ulong2(uchar16);
+ulong2 __ovld __cnfn as_ulong2(short8);
+ulong2 __ovld __cnfn as_ulong2(ushort8);
+ulong2 __ovld __cnfn as_ulong2(int3);
+ulong2 __ovld __cnfn as_ulong2(int4);
+ulong2 __ovld __cnfn as_ulong2(uint3);
+ulong2 __ovld __cnfn as_ulong2(uint4);
+ulong2 __ovld __cnfn as_ulong2(long2);
+ulong2 __ovld __cnfn as_ulong2(ulong2);
+ulong2 __ovld __cnfn as_ulong2(float3);
+ulong2 __ovld __cnfn as_ulong2(float4);
+
+ulong3 __ovld __cnfn as_ulong3(short16);
+ulong3 __ovld __cnfn as_ulong3(ushort16);
+ulong3 __ovld __cnfn as_ulong3(int8);
+ulong3 __ovld __cnfn as_ulong3(uint8);
+ulong3 __ovld __cnfn as_ulong3(long3);
+ulong3 __ovld __cnfn as_ulong3(long4);
+ulong3 __ovld __cnfn as_ulong3(ulong3);
+ulong3 __ovld __cnfn as_ulong3(ulong4);
+ulong3 __ovld __cnfn as_ulong3(float8);
+
+ulong4 __ovld __cnfn as_ulong4(short16);
+ulong4 __ovld __cnfn as_ulong4(ushort16);
+ulong4 __ovld __cnfn as_ulong4(int8);
+ulong4 __ovld __cnfn as_ulong4(uint8);
+ulong4 __ovld __cnfn as_ulong4(long3);
+ulong4 __ovld __cnfn as_ulong4(long4);
+ulong4 __ovld __cnfn as_ulong4(ulong3);
+ulong4 __ovld __cnfn as_ulong4(ulong4);
+ulong4 __ovld __cnfn as_ulong4(float8);
+
+ulong8 __ovld __cnfn as_ulong8(int16);
+ulong8 __ovld __cnfn as_ulong8(uint16);
+ulong8 __ovld __cnfn as_ulong8(long8);
+ulong8 __ovld __cnfn as_ulong8(ulong8);
+ulong8 __ovld __cnfn as_ulong8(float16);
+
+ulong16 __ovld __cnfn as_ulong16(long16);
+ulong16 __ovld __cnfn as_ulong16(ulong16);
+
+float __ovld __cnfn as_float(char3);
+float __ovld __cnfn as_float(char4);
+float __ovld __cnfn as_float(uchar3);
+float __ovld __cnfn as_float(uchar4);
+float __ovld __cnfn as_float(short2);
+float __ovld __cnfn as_float(ushort2);
+float __ovld __cnfn as_float(int);
+float __ovld __cnfn as_float(uint);
+float __ovld __cnfn as_float(float);
+
+float2 __ovld __cnfn as_float2(char8);
+float2 __ovld __cnfn as_float2(uchar8);
+float2 __ovld __cnfn as_float2(short3);
+float2 __ovld __cnfn as_float2(short4);
+float2 __ovld __cnfn as_float2(ushort3);
+float2 __ovld __cnfn as_float2(ushort4);
+float2 __ovld __cnfn as_float2(int2);
+float2 __ovld __cnfn as_float2(uint2);
+float2 __ovld __cnfn as_float2(long);
+float2 __ovld __cnfn as_float2(ulong);
+float2 __ovld __cnfn as_float2(float2);
+
+float3 __ovld __cnfn as_float3(char16);
+float3 __ovld __cnfn as_float3(uchar16);
+float3 __ovld __cnfn as_float3(short8);
+float3 __ovld __cnfn as_float3(ushort8);
+float3 __ovld __cnfn as_float3(int3);
+float3 __ovld __cnfn as_float3(int4);
+float3 __ovld __cnfn as_float3(uint3);
+float3 __ovld __cnfn as_float3(uint4);
+float3 __ovld __cnfn as_float3(long2);
+float3 __ovld __cnfn as_float3(ulong2);
+float3 __ovld __cnfn as_float3(float3);
+float3 __ovld __cnfn as_float3(float4);
+
+float4 __ovld __cnfn as_float4(char16);
+float4 __ovld __cnfn as_float4(uchar16);
+float4 __ovld __cnfn as_float4(short8);
+float4 __ovld __cnfn as_float4(ushort8);
+float4 __ovld __cnfn as_float4(int3);
+float4 __ovld __cnfn as_float4(int4);
+float4 __ovld __cnfn as_float4(uint3);
+float4 __ovld __cnfn as_float4(uint4);
+float4 __ovld __cnfn as_float4(long2);
+float4 __ovld __cnfn as_float4(ulong2);
+float4 __ovld __cnfn as_float4(float3);
+float4 __ovld __cnfn as_float4(float4);
+
+float8 __ovld __cnfn as_float8(short16);
+float8 __ovld __cnfn as_float8(ushort16);
+float8 __ovld __cnfn as_float8(int8);
+float8 __ovld __cnfn as_float8(uint8);
+float8 __ovld __cnfn as_float8(long3);
+float8 __ovld __cnfn as_float8(long4);
+float8 __ovld __cnfn as_float8(ulong3);
+float8 __ovld __cnfn as_float8(ulong4);
+float8 __ovld __cnfn as_float8(float8);
+
+float16 __ovld __cnfn as_float16(int16);
+float16 __ovld __cnfn as_float16(uint16);
+float16 __ovld __cnfn as_float16(long8);
+float16 __ovld __cnfn as_float16(ulong8);
+float16 __ovld __cnfn as_float16(float16);
+
+#ifdef cl_khr_fp64
+char8 __ovld __cnfn as_char8(double);
+char16 __ovld __cnfn as_char16(double2);
+uchar8 __ovld __cnfn as_uchar8(double);
+uchar16 __ovld __cnfn as_uchar16(double2);
+short3 __ovld __cnfn as_short3(double);
+short4 __ovld __cnfn as_short4(double);
+short8 __ovld __cnfn as_short8(double2);
+short16 __ovld __cnfn as_short16(double3);
+short16 __ovld __cnfn as_short16(double4);
+ushort3 __ovld __cnfn as_ushort3(double);
+ushort4 __ovld __cnfn as_ushort4(double);
+ushort8 __ovld __cnfn as_ushort8(double2);
+ushort16 __ovld __cnfn as_ushort16(double3);
+ushort16 __ovld __cnfn as_ushort16(double4);
+int2 __ovld __cnfn as_int2(double);
+int3 __ovld __cnfn as_int3(double2);
+int4 __ovld __cnfn as_int4(double2);
+int8 __ovld __cnfn as_int8(double3);
+int8 __ovld __cnfn as_int8(double4);
+int16 __ovld __cnfn as_int16(double8);
+uint2 __ovld __cnfn as_uint2(double);
+uint3 __ovld __cnfn as_uint3(double2);
+uint4 __ovld __cnfn as_uint4(double2);
+uint8 __ovld __cnfn as_uint8(double3);
+uint8 __ovld __cnfn as_uint8(double4);
+uint16 __ovld __cnfn as_uint16(double8);
+long __ovld __cnfn as_long(double);
+long2 __ovld __cnfn as_long2(double2);
+long3 __ovld __cnfn as_long3(double3);
+long3 __ovld __cnfn as_long3(double4);
+long4 __ovld __cnfn as_long4(double3);
+long4 __ovld __cnfn as_long4(double4);
+long8 __ovld __cnfn as_long8(double8);
+long16 __ovld __cnfn as_long16(double16);
+ulong __ovld __cnfn as_ulong(double);
+ulong2 __ovld __cnfn as_ulong2(double2);
+ulong3 __ovld __cnfn as_ulong3(double3);
+ulong3 __ovld __cnfn as_ulong3(double4);
+ulong4 __ovld __cnfn as_ulong4(double3);
+ulong4 __ovld __cnfn as_ulong4(double4);
+ulong8 __ovld __cnfn as_ulong8(double8);
+ulong16 __ovld __cnfn as_ulong16(double16);
+float2 __ovld __cnfn as_float2(double);
+float3 __ovld __cnfn as_float3(double2);
+float4 __ovld __cnfn as_float4(double2);
+float8 __ovld __cnfn as_float8(double3);
+float8 __ovld __cnfn as_float8(double4);
+float16 __ovld __cnfn as_float16(double8);
+double __ovld __cnfn as_double(char8);
+double __ovld __cnfn as_double(uchar8);
+double __ovld __cnfn as_double(short3);
+double __ovld __cnfn as_double(short4);
+double __ovld __cnfn as_double(ushort3);
+double __ovld __cnfn as_double(ushort4);
+double __ovld __cnfn as_double(int2);
+double __ovld __cnfn as_double(uint2);
+double __ovld __cnfn as_double(long);
+double __ovld __cnfn as_double(ulong);
+double __ovld __cnfn as_double(float2);
+double __ovld __cnfn as_double(double);
+double2 __ovld __cnfn as_double2(char16);
+double2 __ovld __cnfn as_double2(uchar16);
+double2 __ovld __cnfn as_double2(short8);
+double2 __ovld __cnfn as_double2(ushort8);
+double2 __ovld __cnfn as_double2(int3);
+double2 __ovld __cnfn as_double2(int4);
+double2 __ovld __cnfn as_double2(uint3);
+double2 __ovld __cnfn as_double2(uint4);
+double2 __ovld __cnfn as_double2(long2);
+double2 __ovld __cnfn as_double2(ulong2);
+double2 __ovld __cnfn as_double2(float3);
+double2 __ovld __cnfn as_double2(float4);
+double2 __ovld __cnfn as_double2(double2);
+double3 __ovld __cnfn as_double3(short16);
+double3 __ovld __cnfn as_double3(ushort16);
+double3 __ovld __cnfn as_double3(int8);
+double3 __ovld __cnfn as_double3(uint8);
+double3 __ovld __cnfn as_double3(long3);
+double3 __ovld __cnfn as_double3(long4);
+double3 __ovld __cnfn as_double3(ulong3);
+double3 __ovld __cnfn as_double3(ulong4);
+double3 __ovld __cnfn as_double3(float8);
+double3 __ovld __cnfn as_double3(double3);
+double3 __ovld __cnfn as_double3(double4);
+double4 __ovld __cnfn as_double4(short16);
+double4 __ovld __cnfn as_double4(ushort16);
+double4 __ovld __cnfn as_double4(int8);
+double4 __ovld __cnfn as_double4(uint8);
+double4 __ovld __cnfn as_double4(long3);
+double4 __ovld __cnfn as_double4(long4);
+double4 __ovld __cnfn as_double4(ulong3);
+double4 __ovld __cnfn as_double4(ulong4);
+double4 __ovld __cnfn as_double4(float8);
+double4 __ovld __cnfn as_double4(double3);
+double4 __ovld __cnfn as_double4(double4);
+double8 __ovld __cnfn as_double8(int16);
+double8 __ovld __cnfn as_double8(uint16);
+double8 __ovld __cnfn as_double8(long8);
+double8 __ovld __cnfn as_double8(ulong8);
+double8 __ovld __cnfn as_double8(float16);
+double8 __ovld __cnfn as_double8(double8);
+double16 __ovld __cnfn as_double16(long16);
+double16 __ovld __cnfn as_double16(ulong16);
+double16 __ovld __cnfn as_double16(double16);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+char2 __ovld __cnfn as_char2(half);
+char3 __ovld __cnfn as_char3(half2);
+char4 __ovld __cnfn as_char4(half2);
+char8 __ovld __cnfn as_char8(half3);
+char8 __ovld __cnfn as_char8(half4);
+char16 __ovld __cnfn as_char16(half8);
+uchar2 __ovld __cnfn as_uchar2(half);
+uchar3 __ovld __cnfn as_uchar3(half2);
+uchar4 __ovld __cnfn as_uchar4(half2);
+uchar8 __ovld __cnfn as_uchar8(half3);
+uchar8 __ovld __cnfn as_uchar8(half4);
+uchar16 __ovld __cnfn as_uchar16(half8);
+short __ovld __cnfn as_short(half);
+short2 __ovld __cnfn as_short2(half2);
+short3 __ovld __cnfn as_short3(half3);
+short3 __ovld __cnfn as_short3(half4);
+short4 __ovld __cnfn as_short4(half3);
+short4 __ovld __cnfn as_short4(half4);
+short8 __ovld __cnfn as_short8(half8);
+short16 __ovld __cnfn as_short16(half16);
+ushort __ovld __cnfn as_ushort(half);
+ushort2 __ovld __cnfn as_ushort2(half2);
+ushort3 __ovld __cnfn as_ushort3(half3);
+ushort3 __ovld __cnfn as_ushort3(half4);
+ushort4 __ovld __cnfn as_ushort4(half3);
+ushort4 __ovld __cnfn as_ushort4(half4);
+ushort8 __ovld __cnfn as_ushort8(half8);
+ushort16 __ovld __cnfn as_ushort16(half16);
+int __ovld __cnfn as_int(half2);
+int2 __ovld __cnfn as_int2(half3);
+int2 __ovld __cnfn as_int2(half4);
+int3 __ovld __cnfn as_int3(half8);
+int4 __ovld __cnfn as_int4(half8);
+int8 __ovld __cnfn as_int8(half16);
+uint __ovld __cnfn as_uint(half2);
+uint2 __ovld __cnfn as_uint2(half3);
+uint2 __ovld __cnfn as_uint2(half4);
+uint3 __ovld __cnfn as_uint3(half8);
+uint4 __ovld __cnfn as_uint4(half8);
+uint8 __ovld __cnfn as_uint8(half16);
+long __ovld __cnfn as_long(half3);
+long __ovld __cnfn as_long(half4);
+long2 __ovld __cnfn as_long2(half8);
+long3 __ovld __cnfn as_long3(half16);
+long4 __ovld __cnfn as_long4(half16);
+ulong __ovld __cnfn as_ulong(half3);
+ulong __ovld __cnfn as_ulong(half4);
+ulong2 __ovld __cnfn as_ulong2(half8);
+ulong3 __ovld __cnfn as_ulong3(half16);
+ulong4 __ovld __cnfn as_ulong4(half16);
+half __ovld __cnfn as_half(char2);
+half __ovld __cnfn as_half(uchar2);
+half __ovld __cnfn as_half(short);
+half __ovld __cnfn as_half(ushort);
+half __ovld __cnfn as_half(half);
+half2 __ovld __cnfn as_half2(char3);
+half2 __ovld __cnfn as_half2(char4);
+half2 __ovld __cnfn as_half2(uchar3);
+half2 __ovld __cnfn as_half2(uchar4);
+half2 __ovld __cnfn as_half2(short2);
+half2 __ovld __cnfn as_half2(ushort2);
+half2 __ovld __cnfn as_half2(int);
+half2 __ovld __cnfn as_half2(uint);
+half2 __ovld __cnfn as_half2(half2);
+half2 __ovld __cnfn as_half2(float);
+half3 __ovld __cnfn as_half3(char8);
+half3 __ovld __cnfn as_half3(uchar8);
+half3 __ovld __cnfn as_half3(short3);
+half3 __ovld __cnfn as_half3(short4);
+half3 __ovld __cnfn as_half3(ushort3);
+half3 __ovld __cnfn as_half3(ushort4);
+half3 __ovld __cnfn as_half3(int2);
+half3 __ovld __cnfn as_half3(uint2);
+half3 __ovld __cnfn as_half3(long);
+half3 __ovld __cnfn as_half3(ulong);
+half3 __ovld __cnfn as_half3(half3);
+half3 __ovld __cnfn as_half3(half4);
+half3 __ovld __cnfn as_half3(float2);
+half4 __ovld __cnfn as_half4(char8);
+half4 __ovld __cnfn as_half4(uchar8);
+half4 __ovld __cnfn as_half4(short3);
+half4 __ovld __cnfn as_half4(short4);
+half4 __ovld __cnfn as_half4(ushort3);
+half4 __ovld __cnfn as_half4(ushort4);
+half4 __ovld __cnfn as_half4(int2);
+half4 __ovld __cnfn as_half4(uint2);
+half4 __ovld __cnfn as_half4(long);
+half4 __ovld __cnfn as_half4(ulong);
+half4 __ovld __cnfn as_half4(half3);
+half4 __ovld __cnfn as_half4(half4);
+half4 __ovld __cnfn as_half4(float2);
+half8 __ovld __cnfn as_half8(char16);
+half8 __ovld __cnfn as_half8(uchar16);
+half8 __ovld __cnfn as_half8(short8);
+half8 __ovld __cnfn as_half8(ushort8);
+half8 __ovld __cnfn as_half8(int3);
+half8 __ovld __cnfn as_half8(int4);
+half8 __ovld __cnfn as_half8(uint3);
+half8 __ovld __cnfn as_half8(uint4);
+half8 __ovld __cnfn as_half8(long2);
+half8 __ovld __cnfn as_half8(ulong2);
+half8 __ovld __cnfn as_half8(half8);
+half8 __ovld __cnfn as_half8(float3);
+half8 __ovld __cnfn as_half8(float4);
+half16 __ovld __cnfn as_half16(short16);
+half16 __ovld __cnfn as_half16(ushort16);
+half16 __ovld __cnfn as_half16(int8);
+half16 __ovld __cnfn as_half16(uint8);
+half16 __ovld __cnfn as_half16(long3);
+half16 __ovld __cnfn as_half16(long4);
+half16 __ovld __cnfn as_half16(ulong3);
+half16 __ovld __cnfn as_half16(ulong4);
+half16 __ovld __cnfn as_half16(half16);
+half16 __ovld __cnfn as_half16(float8);
+float __ovld __cnfn as_float(half2);
+float2 __ovld __cnfn as_float2(half3);
+float2 __ovld __cnfn as_float2(half4);
+float3 __ovld __cnfn as_float3(half8);
+float4 __ovld __cnfn as_float4(half8);
+float8 __ovld __cnfn as_float8(half16);
+
+#ifdef cl_khr_fp64
+half3 __ovld __cnfn as_half3(double);
+half4 __ovld __cnfn as_half4(double);
+half8 __ovld __cnfn as_half8(double2);
+half16 __ovld __cnfn as_half16(double3);
+half16 __ovld __cnfn as_half16(double4);
+double __ovld __cnfn as_double(half3);
+double __ovld __cnfn as_double(half4);
+double2 __ovld __cnfn as_double2(half8);
+double3 __ovld __cnfn as_double3(half16);
+double4 __ovld __cnfn as_double4(half16);
+#endif //cl_khr_fp64
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
+
+#define __kernel_exec(X, typen) __kernel \
+	__attribute__((work_group_size_hint(X, 1, 1))) \
+	__attribute__((vec_type_hint(typen)))
+
+#define kernel_exec(X, typen) __kernel \
+	__attribute__((work_group_size_hint(X, 1, 1))) \
+	__attribute__((vec_type_hint(typen)))
+
+// OpenCL v1.1 s6.11.1, v1.2 s6.12.1, v2.0 s6.13.1 - Work-item Functions
+
+/**
+ * Returns the number of dimensions in use. This is the
+ * value given to the work_dim argument specified in
+ * clEnqueueNDRangeKernel.
+ * For clEnqueueTask, this returns 1.
+ */
+uint __ovld __cnfn get_work_dim(void);
+
+/**
+ * Returns the number of global work-items specified for
+ * dimension identified by dimindx. This value is given by
+ * the global_work_size argument to
+ * clEnqueueNDRangeKernel. Valid values of dimindx
+ * are 0 to get_work_dim() - 1. For other values of
+ * dimindx, get_global_size() returns 1.
+ * For clEnqueueTask, this always returns 1.
+ */
+size_t __ovld __cnfn get_global_size(uint dimindx);
+
+/**
+ * Returns the unique global work-item ID value for
+ * dimension identified by dimindx. The global work-item
+ * ID specifies the work-item ID based on the number of
+ * global work-items specified to execute the kernel. Valid
+ * values of dimindx are 0 to get_work_dim() - 1. For
+ * other values of dimindx, get_global_id() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_global_id(uint dimindx);
+
+/**
+ * Returns the number of local work-items specified in
+ * dimension identified by dimindx. This value is given by
+ * the local_work_size argument to
+ * clEnqueueNDRangeKernel if local_work_size is not
+ * NULL; otherwise the OpenCL implementation chooses
+ * an appropriate local_work_size value which is returned
+ * by this function. Valid values of dimindx are 0 to
+ * get_work_dim() - 1. For other values of dimindx,
+ * get_local_size() returns 1.
+ * For clEnqueueTask, this always returns 1.
+ */
+size_t __ovld __cnfn get_local_size(uint dimindx);
+
+/**
+ * Returns the unique local work-item ID i.e. a work-item
+ * within a specific work-group for dimension identified by
+ * dimindx. Valid values of dimindx are 0 to
+ * get_work_dim() - 1. For other values of dimindx,
+ * get_local_id() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_local_id(uint dimindx);
+
+/**
+ * Returns the number of work-groups that will execute a
+ * kernel for dimension identified by dimindx.
+ * Valid values of dimindx are 0 to get_work_dim() - 1.
+ * For other values of dimindx, get_num_groups () returns
+ * 1.
+ * For clEnqueueTask, this always returns 1.
+ */
+size_t __ovld __cnfn get_num_groups(uint dimindx);
+
+/**
+ * get_group_id returns the work-group ID which is a
+ * number from 0 .. get_num_groups(dimindx) - 1.
+ * Valid values of dimindx are 0 to get_work_dim() - 1.
+ * For other values, get_group_id() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_group_id(uint dimindx);
+
+/**
+ * get_global_offset returns the offset values specified in
+ * global_work_offset argument to
+ * clEnqueueNDRangeKernel.
+ * Valid values of dimindx are 0 to get_work_dim() - 1.
+ * For other values, get_global_offset() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_global_offset(uint dimindx);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+size_t __ovld get_enqueued_local_size(uint dimindx);
+size_t __ovld get_global_linear_id(void);
+size_t __ovld get_local_linear_id(void);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.2, v1.2 s6.12.2, v2.0 s6.13.2 - Math functions
+
+/**
+ * Arc cosine function.
+ */
+float __ovld __cnfn acos(float);
+float2 __ovld __cnfn acos(float2);
+float3 __ovld __cnfn acos(float3);
+float4 __ovld __cnfn acos(float4);
+float8 __ovld __cnfn acos(float8);
+float16 __ovld __cnfn acos(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn acos(double);
+double2 __ovld __cnfn acos(double2);
+double3 __ovld __cnfn acos(double3);
+double4 __ovld __cnfn acos(double4);
+double8 __ovld __cnfn acos(double8);
+double16 __ovld __cnfn acos(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn acos(half);
+half2 __ovld __cnfn acos(half2);
+half3 __ovld __cnfn acos(half3);
+half4 __ovld __cnfn acos(half4);
+half8 __ovld __cnfn acos(half8);
+half16 __ovld __cnfn acos(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Inverse hyperbolic cosine.
+ */
+float __ovld __cnfn acosh(float);
+float2 __ovld __cnfn acosh(float2);
+float3 __ovld __cnfn acosh(float3);
+float4 __ovld __cnfn acosh(float4);
+float8 __ovld __cnfn acosh(float8);
+float16 __ovld __cnfn acosh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn acosh(double);
+double2 __ovld __cnfn acosh(double2);
+double3 __ovld __cnfn acosh(double3);
+double4 __ovld __cnfn acosh(double4);
+double8 __ovld __cnfn acosh(double8);
+double16 __ovld __cnfn acosh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn acosh(half);
+half2 __ovld __cnfn acosh(half2);
+half3 __ovld __cnfn acosh(half3);
+half4 __ovld __cnfn acosh(half4);
+half8 __ovld __cnfn acosh(half8);
+half16 __ovld __cnfn acosh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute acos (x) / PI.
+ */
+float __ovld __cnfn acospi(float x);
+float2 __ovld __cnfn acospi(float2 x);
+float3 __ovld __cnfn acospi(float3 x);
+float4 __ovld __cnfn acospi(float4 x);
+float8 __ovld __cnfn acospi(float8 x);
+float16 __ovld __cnfn acospi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn acospi(double x);
+double2 __ovld __cnfn acospi(double2 x);
+double3 __ovld __cnfn acospi(double3 x);
+double4 __ovld __cnfn acospi(double4 x);
+double8 __ovld __cnfn acospi(double8 x);
+double16 __ovld __cnfn acospi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn acospi(half x);
+half2 __ovld __cnfn acospi(half2 x);
+half3 __ovld __cnfn acospi(half3 x);
+half4 __ovld __cnfn acospi(half4 x);
+half8 __ovld __cnfn acospi(half8 x);
+half16 __ovld __cnfn acospi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Arc sine function.
+ */
+float __ovld __cnfn asin(float);
+float2 __ovld __cnfn asin(float2);
+float3 __ovld __cnfn asin(float3);
+float4 __ovld __cnfn asin(float4);
+float8 __ovld __cnfn asin(float8);
+float16 __ovld __cnfn asin(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn asin(double);
+double2 __ovld __cnfn asin(double2);
+double3 __ovld __cnfn asin(double3);
+double4 __ovld __cnfn asin(double4);
+double8 __ovld __cnfn asin(double8);
+double16 __ovld __cnfn asin(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn asin(half);
+half2 __ovld __cnfn asin(half2);
+half3 __ovld __cnfn asin(half3);
+half4 __ovld __cnfn asin(half4);
+half8 __ovld __cnfn asin(half8);
+half16 __ovld __cnfn asin(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Inverse hyperbolic sine.
+ */
+float __ovld __cnfn asinh(float);
+float2 __ovld __cnfn asinh(float2);
+float3 __ovld __cnfn asinh(float3);
+float4 __ovld __cnfn asinh(float4);
+float8 __ovld __cnfn asinh(float8);
+float16 __ovld __cnfn asinh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn asinh(double);
+double2 __ovld __cnfn asinh(double2);
+double3 __ovld __cnfn asinh(double3);
+double4 __ovld __cnfn asinh(double4);
+double8 __ovld __cnfn asinh(double8);
+double16 __ovld __cnfn asinh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn asinh(half);
+half2 __ovld __cnfn asinh(half2);
+half3 __ovld __cnfn asinh(half3);
+half4 __ovld __cnfn asinh(half4);
+half8 __ovld __cnfn asinh(half8);
+half16 __ovld __cnfn asinh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute asin (x) / PI.
+ */
+float __ovld __cnfn asinpi(float x);
+float2 __ovld __cnfn asinpi(float2 x);
+float3 __ovld __cnfn asinpi(float3 x);
+float4 __ovld __cnfn asinpi(float4 x);
+float8 __ovld __cnfn asinpi(float8 x);
+float16 __ovld __cnfn asinpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn asinpi(double x);
+double2 __ovld __cnfn asinpi(double2 x);
+double3 __ovld __cnfn asinpi(double3 x);
+double4 __ovld __cnfn asinpi(double4 x);
+double8 __ovld __cnfn asinpi(double8 x);
+double16 __ovld __cnfn asinpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn asinpi(half x);
+half2 __ovld __cnfn asinpi(half2 x);
+half3 __ovld __cnfn asinpi(half3 x);
+half4 __ovld __cnfn asinpi(half4 x);
+half8 __ovld __cnfn asinpi(half8 x);
+half16 __ovld __cnfn asinpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Arc tangent function.
+ */
+float __ovld __cnfn atan(float y_over_x);
+float2 __ovld __cnfn atan(float2 y_over_x);
+float3 __ovld __cnfn atan(float3 y_over_x);
+float4 __ovld __cnfn atan(float4 y_over_x);
+float8 __ovld __cnfn atan(float8 y_over_x);
+float16 __ovld __cnfn atan(float16 y_over_x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atan(double y_over_x);
+double2 __ovld __cnfn atan(double2 y_over_x);
+double3 __ovld __cnfn atan(double3 y_over_x);
+double4 __ovld __cnfn atan(double4 y_over_x);
+double8 __ovld __cnfn atan(double8 y_over_x);
+double16 __ovld __cnfn atan(double16 y_over_x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atan(half y_over_x);
+half2 __ovld __cnfn atan(half2 y_over_x);
+half3 __ovld __cnfn atan(half3 y_over_x);
+half4 __ovld __cnfn atan(half4 y_over_x);
+half8 __ovld __cnfn atan(half8 y_over_x);
+half16 __ovld __cnfn atan(half16 y_over_x);
+#endif //cl_khr_fp16
+
+/**
+ * Arc tangent of y / x.
+ */
+float __ovld __cnfn atan2(float y, float x);
+float2 __ovld __cnfn atan2(float2 y, float2 x);
+float3 __ovld __cnfn atan2(float3 y, float3 x);
+float4 __ovld __cnfn atan2(float4 y, float4 x);
+float8 __ovld __cnfn atan2(float8 y, float8 x);
+float16 __ovld __cnfn atan2(float16 y, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atan2(double y, double x);
+double2 __ovld __cnfn atan2(double2 y, double2 x);
+double3 __ovld __cnfn atan2(double3 y, double3 x);
+double4 __ovld __cnfn atan2(double4 y, double4 x);
+double8 __ovld __cnfn atan2(double8 y, double8 x);
+double16 __ovld __cnfn atan2(double16 y, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atan2(half y, half x);
+half2 __ovld __cnfn atan2(half2 y, half2 x);
+half3 __ovld __cnfn atan2(half3 y, half3 x);
+half4 __ovld __cnfn atan2(half4 y, half4 x);
+half8 __ovld __cnfn atan2(half8 y, half8 x);
+half16 __ovld __cnfn atan2(half16 y, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Hyperbolic arc tangent.
+ */
+float __ovld __cnfn atanh(float);
+float2 __ovld __cnfn atanh(float2);
+float3 __ovld __cnfn atanh(float3);
+float4 __ovld __cnfn atanh(float4);
+float8 __ovld __cnfn atanh(float8);
+float16 __ovld __cnfn atanh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atanh(double);
+double2 __ovld __cnfn atanh(double2);
+double3 __ovld __cnfn atanh(double3);
+double4 __ovld __cnfn atanh(double4);
+double8 __ovld __cnfn atanh(double8);
+double16 __ovld __cnfn atanh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atanh(half);
+half2 __ovld __cnfn atanh(half2);
+half3 __ovld __cnfn atanh(half3);
+half4 __ovld __cnfn atanh(half4);
+half8 __ovld __cnfn atanh(half8);
+half16 __ovld __cnfn atanh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute atan (x) / PI.
+ */
+float __ovld __cnfn atanpi(float x);
+float2 __ovld __cnfn atanpi(float2 x);
+float3 __ovld __cnfn atanpi(float3 x);
+float4 __ovld __cnfn atanpi(float4 x);
+float8 __ovld __cnfn atanpi(float8 x);
+float16 __ovld __cnfn atanpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atanpi(double x);
+double2 __ovld __cnfn atanpi(double2 x);
+double3 __ovld __cnfn atanpi(double3 x);
+double4 __ovld __cnfn atanpi(double4 x);
+double8 __ovld __cnfn atanpi(double8 x);
+double16 __ovld __cnfn atanpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atanpi(half x);
+half2 __ovld __cnfn atanpi(half2 x);
+half3 __ovld __cnfn atanpi(half3 x);
+half4 __ovld __cnfn atanpi(half4 x);
+half8 __ovld __cnfn atanpi(half8 x);
+half16 __ovld __cnfn atanpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute atan2 (y, x) / PI.
+ */
+float __ovld __cnfn atan2pi(float y, float x);
+float2 __ovld __cnfn atan2pi(float2 y, float2 x);
+float3 __ovld __cnfn atan2pi(float3 y, float3 x);
+float4 __ovld __cnfn atan2pi(float4 y, float4 x);
+float8 __ovld __cnfn atan2pi(float8 y, float8 x);
+float16 __ovld __cnfn atan2pi(float16 y, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atan2pi(double y, double x);
+double2 __ovld __cnfn atan2pi(double2 y, double2 x);
+double3 __ovld __cnfn atan2pi(double3 y, double3 x);
+double4 __ovld __cnfn atan2pi(double4 y, double4 x);
+double8 __ovld __cnfn atan2pi(double8 y, double8 x);
+double16 __ovld __cnfn atan2pi(double16 y, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atan2pi(half y, half x);
+half2 __ovld __cnfn atan2pi(half2 y, half2 x);
+half3 __ovld __cnfn atan2pi(half3 y, half3 x);
+half4 __ovld __cnfn atan2pi(half4 y, half4 x);
+half8 __ovld __cnfn atan2pi(half8 y, half8 x);
+half16 __ovld __cnfn atan2pi(half16 y, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cube-root.
+ */
+float __ovld __cnfn cbrt(float);
+float2 __ovld __cnfn cbrt(float2);
+float3 __ovld __cnfn cbrt(float3);
+float4 __ovld __cnfn cbrt(float4);
+float8 __ovld __cnfn cbrt(float8);
+float16 __ovld __cnfn cbrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cbrt(double);
+double2 __ovld __cnfn cbrt(double2);
+double3 __ovld __cnfn cbrt(double3);
+double4 __ovld __cnfn cbrt(double4);
+double8 __ovld __cnfn cbrt(double8);
+double16 __ovld __cnfn cbrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cbrt(half);
+half2 __ovld __cnfn cbrt(half2);
+half3 __ovld __cnfn cbrt(half3);
+half4 __ovld __cnfn cbrt(half4);
+half8 __ovld __cnfn cbrt(half8);
+half16 __ovld __cnfn cbrt(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Round to integral value using the round to positive
+ * infinity rounding mode.
+ */
+float __ovld __cnfn ceil(float);
+float2 __ovld __cnfn ceil(float2);
+float3 __ovld __cnfn ceil(float3);
+float4 __ovld __cnfn ceil(float4);
+float8 __ovld __cnfn ceil(float8);
+float16 __ovld __cnfn ceil(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn ceil(double);
+double2 __ovld __cnfn ceil(double2);
+double3 __ovld __cnfn ceil(double3);
+double4 __ovld __cnfn ceil(double4);
+double8 __ovld __cnfn ceil(double8);
+double16 __ovld __cnfn ceil(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn ceil(half);
+half2 __ovld __cnfn ceil(half2);
+half3 __ovld __cnfn ceil(half3);
+half4 __ovld __cnfn ceil(half4);
+half8 __ovld __cnfn ceil(half8);
+half16 __ovld __cnfn ceil(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Returns x with its sign changed to match the sign of y.
+ */
+float __ovld __cnfn copysign(float x, float y);
+float2 __ovld __cnfn copysign(float2 x, float2 y);
+float3 __ovld __cnfn copysign(float3 x, float3 y);
+float4 __ovld __cnfn copysign(float4 x, float4 y);
+float8 __ovld __cnfn copysign(float8 x, float8 y);
+float16 __ovld __cnfn copysign(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn copysign(double x, double y);
+double2 __ovld __cnfn copysign(double2 x, double2 y);
+double3 __ovld __cnfn copysign(double3 x, double3 y);
+double4 __ovld __cnfn copysign(double4 x, double4 y);
+double8 __ovld __cnfn copysign(double8 x, double8 y);
+double16 __ovld __cnfn copysign(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn copysign(half x, half y);
+half2 __ovld __cnfn copysign(half2 x, half2 y);
+half3 __ovld __cnfn copysign(half3 x, half3 y);
+half4 __ovld __cnfn copysign(half4 x, half4 y);
+half8 __ovld __cnfn copysign(half8 x, half8 y);
+half16 __ovld __cnfn copysign(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cosine.
+ */
+float __ovld __cnfn cos(float);
+float2 __ovld __cnfn cos(float2);
+float3 __ovld __cnfn cos(float3);
+float4 __ovld __cnfn cos(float4);
+float8 __ovld __cnfn cos(float8);
+float16 __ovld __cnfn cos(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cos(double);
+double2 __ovld __cnfn cos(double2);
+double3 __ovld __cnfn cos(double3);
+double4 __ovld __cnfn cos(double4);
+double8 __ovld __cnfn cos(double8);
+double16 __ovld __cnfn cos(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cos(half);
+half2 __ovld __cnfn cos(half2);
+half3 __ovld __cnfn cos(half3);
+half4 __ovld __cnfn cos(half4);
+half8 __ovld __cnfn cos(half8);
+half16 __ovld __cnfn cos(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute hyperbolic cosine.
+ */
+float __ovld __cnfn cosh(float);
+float2 __ovld __cnfn cosh(float2);
+float3 __ovld __cnfn cosh(float3);
+float4 __ovld __cnfn cosh(float4);
+float8 __ovld __cnfn cosh(float8);
+float16 __ovld __cnfn cosh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cosh(double);
+double2 __ovld __cnfn cosh(double2);
+double3 __ovld __cnfn cosh(double3);
+double4 __ovld __cnfn cosh(double4);
+double8 __ovld __cnfn cosh(double8);
+double16 __ovld __cnfn cosh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cosh(half);
+half2 __ovld __cnfn cosh(half2);
+half3 __ovld __cnfn cosh(half3);
+half4 __ovld __cnfn cosh(half4);
+half8 __ovld __cnfn cosh(half8);
+half16 __ovld __cnfn cosh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cos (PI * x).
+ */
+float __ovld __cnfn cospi(float x);
+float2 __ovld __cnfn cospi(float2 x);
+float3 __ovld __cnfn cospi(float3 x);
+float4 __ovld __cnfn cospi(float4 x);
+float8 __ovld __cnfn cospi(float8 x);
+float16 __ovld __cnfn cospi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cospi(double x);
+double2 __ovld __cnfn cospi(double2 x);
+double3 __ovld __cnfn cospi(double3 x);
+double4 __ovld __cnfn cospi(double4 x);
+double8 __ovld __cnfn cospi(double8 x);
+double16 __ovld __cnfn cospi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cospi(half x);
+half2 __ovld __cnfn cospi(half2 x);
+half3 __ovld __cnfn cospi(half3 x);
+half4 __ovld __cnfn cospi(half4 x);
+half8 __ovld __cnfn cospi(half8 x);
+half16 __ovld __cnfn cospi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Complementary error function.
+ */
+float __ovld __cnfn erfc(float);
+float2 __ovld __cnfn erfc(float2);
+float3 __ovld __cnfn erfc(float3);
+float4 __ovld __cnfn erfc(float4);
+float8 __ovld __cnfn erfc(float8);
+float16 __ovld __cnfn erfc(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn erfc(double);
+double2 __ovld __cnfn erfc(double2);
+double3 __ovld __cnfn erfc(double3);
+double4 __ovld __cnfn erfc(double4);
+double8 __ovld __cnfn erfc(double8);
+double16 __ovld __cnfn erfc(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn erfc(half);
+half2 __ovld __cnfn erfc(half2);
+half3 __ovld __cnfn erfc(half3);
+half4 __ovld __cnfn erfc(half4);
+half8 __ovld __cnfn erfc(half8);
+half16 __ovld __cnfn erfc(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Error function encountered in integrating the
+ * normal distribution.
+ */
+float __ovld __cnfn erf(float);
+float2 __ovld __cnfn erf(float2);
+float3 __ovld __cnfn erf(float3);
+float4 __ovld __cnfn erf(float4);
+float8 __ovld __cnfn erf(float8);
+float16 __ovld __cnfn erf(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn erf(double);
+double2 __ovld __cnfn erf(double2);
+double3 __ovld __cnfn erf(double3);
+double4 __ovld __cnfn erf(double4);
+double8 __ovld __cnfn erf(double8);
+double16 __ovld __cnfn erf(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn erf(half);
+half2 __ovld __cnfn erf(half2);
+half3 __ovld __cnfn erf(half3);
+half4 __ovld __cnfn erf(half4);
+half8 __ovld __cnfn erf(half8);
+half16 __ovld __cnfn erf(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the base e exponential function of x.
+ */
+float __ovld __cnfn exp(float x);
+float2 __ovld __cnfn exp(float2 x);
+float3 __ovld __cnfn exp(float3 x);
+float4 __ovld __cnfn exp(float4 x);
+float8 __ovld __cnfn exp(float8 x);
+float16 __ovld __cnfn exp(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn exp(double x);
+double2 __ovld __cnfn exp(double2 x);
+double3 __ovld __cnfn exp(double3 x);
+double4 __ovld __cnfn exp(double4 x);
+double8 __ovld __cnfn exp(double8 x);
+double16 __ovld __cnfn exp(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn exp(half x);
+half2 __ovld __cnfn exp(half2 x);
+half3 __ovld __cnfn exp(half3 x);
+half4 __ovld __cnfn exp(half4 x);
+half8 __ovld __cnfn exp(half8 x);
+half16 __ovld __cnfn exp(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Exponential base 2 function.
+ */
+float __ovld __cnfn exp2(float);
+float2 __ovld __cnfn exp2(float2);
+float3 __ovld __cnfn exp2(float3);
+float4 __ovld __cnfn exp2(float4);
+float8 __ovld __cnfn exp2(float8);
+float16 __ovld __cnfn exp2(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn exp2(double);
+double2 __ovld __cnfn exp2(double2);
+double3 __ovld __cnfn exp2(double3);
+double4 __ovld __cnfn exp2(double4);
+double8 __ovld __cnfn exp2(double8);
+double16 __ovld __cnfn exp2(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn exp2(half);
+half2 __ovld __cnfn exp2(half2);
+half3 __ovld __cnfn exp2(half3);
+half4 __ovld __cnfn exp2(half4);
+half8 __ovld __cnfn exp2(half8);
+half16 __ovld __cnfn exp2(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Exponential base 10 function.
+ */
+float __ovld __cnfn exp10(float);
+float2 __ovld __cnfn exp10(float2);
+float3 __ovld __cnfn exp10(float3);
+float4 __ovld __cnfn exp10(float4);
+float8 __ovld __cnfn exp10(float8);
+float16 __ovld __cnfn exp10(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn exp10(double);
+double2 __ovld __cnfn exp10(double2);
+double3 __ovld __cnfn exp10(double3);
+double4 __ovld __cnfn exp10(double4);
+double8 __ovld __cnfn exp10(double8);
+double16 __ovld __cnfn exp10(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn exp10(half);
+half2 __ovld __cnfn exp10(half2);
+half3 __ovld __cnfn exp10(half3);
+half4 __ovld __cnfn exp10(half4);
+half8 __ovld __cnfn exp10(half8);
+half16 __ovld __cnfn exp10(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute e^x- 1.0.
+ */
+float __ovld __cnfn expm1(float x);
+float2 __ovld __cnfn expm1(float2 x);
+float3 __ovld __cnfn expm1(float3 x);
+float4 __ovld __cnfn expm1(float4 x);
+float8 __ovld __cnfn expm1(float8 x);
+float16 __ovld __cnfn expm1(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn expm1(double x);
+double2 __ovld __cnfn expm1(double2 x);
+double3 __ovld __cnfn expm1(double3 x);
+double4 __ovld __cnfn expm1(double4 x);
+double8 __ovld __cnfn expm1(double8 x);
+double16 __ovld __cnfn expm1(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn expm1(half x);
+half2 __ovld __cnfn expm1(half2 x);
+half3 __ovld __cnfn expm1(half3 x);
+half4 __ovld __cnfn expm1(half4 x);
+half8 __ovld __cnfn expm1(half8 x);
+half16 __ovld __cnfn expm1(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute absolute value of a floating-point number.
+ */
+float __ovld __cnfn fabs(float);
+float2 __ovld __cnfn fabs(float2);
+float3 __ovld __cnfn fabs(float3);
+float4 __ovld __cnfn fabs(float4);
+float8 __ovld __cnfn fabs(float8);
+float16 __ovld __cnfn fabs(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fabs(double);
+double2 __ovld __cnfn fabs(double2);
+double3 __ovld __cnfn fabs(double3);
+double4 __ovld __cnfn fabs(double4);
+double8 __ovld __cnfn fabs(double8);
+double16 __ovld __cnfn fabs(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fabs(half);
+half2 __ovld __cnfn fabs(half2);
+half3 __ovld __cnfn fabs(half3);
+half4 __ovld __cnfn fabs(half4);
+half8 __ovld __cnfn fabs(half8);
+half16 __ovld __cnfn fabs(half16);
+#endif //cl_khr_fp16
+
+/**
+ * x - y if x > y, +0 if x is less than or equal to y.
+ */
+float __ovld __cnfn fdim(float x, float y);
+float2 __ovld __cnfn fdim(float2 x, float2 y);
+float3 __ovld __cnfn fdim(float3 x, float3 y);
+float4 __ovld __cnfn fdim(float4 x, float4 y);
+float8 __ovld __cnfn fdim(float8 x, float8 y);
+float16 __ovld __cnfn fdim(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fdim(double x, double y);
+double2 __ovld __cnfn fdim(double2 x, double2 y);
+double3 __ovld __cnfn fdim(double3 x, double3 y);
+double4 __ovld __cnfn fdim(double4 x, double4 y);
+double8 __ovld __cnfn fdim(double8 x, double8 y);
+double16 __ovld __cnfn fdim(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fdim(half x, half y);
+half2 __ovld __cnfn fdim(half2 x, half2 y);
+half3 __ovld __cnfn fdim(half3 x, half3 y);
+half4 __ovld __cnfn fdim(half4 x, half4 y);
+half8 __ovld __cnfn fdim(half8 x, half8 y);
+half16 __ovld __cnfn fdim(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Round to integral value using the round to -ve
+ * infinity rounding mode.
+ */
+float __ovld __cnfn floor(float);
+float2 __ovld __cnfn floor(float2);
+float3 __ovld __cnfn floor(float3);
+float4 __ovld __cnfn floor(float4);
+float8 __ovld __cnfn floor(float8);
+float16 __ovld __cnfn floor(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn floor(double);
+double2 __ovld __cnfn floor(double2);
+double3 __ovld __cnfn floor(double3);
+double4 __ovld __cnfn floor(double4);
+double8 __ovld __cnfn floor(double8);
+double16 __ovld __cnfn floor(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn floor(half);
+half2 __ovld __cnfn floor(half2);
+half3 __ovld __cnfn floor(half3);
+half4 __ovld __cnfn floor(half4);
+half8 __ovld __cnfn floor(half8);
+half16 __ovld __cnfn floor(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the correctly rounded floating-point
+ * representation of the sum of c with the infinitely
+ * precise product of a and b. Rounding of
+ * intermediate products shall not occur. Edge case
+ * behavior is per the IEEE 754-2008 standard.
+ */
+float __ovld __cnfn fma(float a, float b, float c);
+float2 __ovld __cnfn fma(float2 a, float2 b, float2 c);
+float3 __ovld __cnfn fma(float3 a, float3 b, float3 c);
+float4 __ovld __cnfn fma(float4 a, float4 b, float4 c);
+float8 __ovld __cnfn fma(float8 a, float8 b, float8 c);
+float16 __ovld __cnfn fma(float16 a, float16 b, float16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fma(double a, double b, double c);
+double2 __ovld __cnfn fma(double2 a, double2 b, double2 c);
+double3 __ovld __cnfn fma(double3 a, double3 b, double3 c);
+double4 __ovld __cnfn fma(double4 a, double4 b, double4 c);
+double8 __ovld __cnfn fma(double8 a, double8 b, double8 c);
+double16 __ovld __cnfn fma(double16 a, double16 b, double16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fma(half a, half b, half c);
+half2 __ovld __cnfn fma(half2 a, half2 b, half2 c);
+half3 __ovld __cnfn fma(half3 a, half3 b, half3 c);
+half4 __ovld __cnfn fma(half4 a, half4 b, half4 c);
+half8 __ovld __cnfn fma(half8 a, half8 b, half8 c);
+half16 __ovld __cnfn fma(half16 a, half16 b, half16 c);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if x < y, otherwise it returns x. If one
+ * argument is a NaN, fmax() returns the other
+ * argument. If both arguments are NaNs, fmax()
+ * returns a NaN.
+ */
+float __ovld __cnfn fmax(float x, float y);
+float2 __ovld __cnfn fmax(float2 x, float2 y);
+float3 __ovld __cnfn fmax(float3 x, float3 y);
+float4 __ovld __cnfn fmax(float4 x, float4 y);
+float8 __ovld __cnfn fmax(float8 x, float8 y);
+float16 __ovld __cnfn fmax(float16 x, float16 y);
+float2 __ovld __cnfn fmax(float2 x, float y);
+float3 __ovld __cnfn fmax(float3 x, float y);
+float4 __ovld __cnfn fmax(float4 x, float y);
+float8 __ovld __cnfn fmax(float8 x, float y);
+float16 __ovld __cnfn fmax(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fmax(double x, double y);
+double2 __ovld __cnfn fmax(double2 x, double2 y);
+double3 __ovld __cnfn fmax(double3 x, double3 y);
+double4 __ovld __cnfn fmax(double4 x, double4 y);
+double8 __ovld __cnfn fmax(double8 x, double8 y);
+double16 __ovld __cnfn fmax(double16 x, double16 y);
+double2 __ovld __cnfn fmax(double2 x, double y);
+double3 __ovld __cnfn fmax(double3 x, double y);
+double4 __ovld __cnfn fmax(double4 x, double y);
+double8 __ovld __cnfn fmax(double8 x, double y);
+double16 __ovld __cnfn fmax(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fmax(half x, half y);
+half2 __ovld __cnfn fmax(half2 x, half2 y);
+half3 __ovld __cnfn fmax(half3 x, half3 y);
+half4 __ovld __cnfn fmax(half4 x, half4 y);
+half8 __ovld __cnfn fmax(half8 x, half8 y);
+half16 __ovld __cnfn fmax(half16 x, half16 y);
+half2 __ovld __cnfn fmax(half2 x, half y);
+half3 __ovld __cnfn fmax(half3 x, half y);
+half4 __ovld __cnfn fmax(half4 x, half y);
+half8 __ovld __cnfn fmax(half8 x, half y);
+half16 __ovld __cnfn fmax(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if y < x, otherwise it returns x. If one
+ * argument is a NaN, fmin() returns the other
+ * argument. If both arguments are NaNs, fmin()
+ * returns a NaN.
+ */
+float __ovld __cnfn fmin(float x, float y);
+float2 __ovld __cnfn fmin(float2 x, float2 y);
+float3 __ovld __cnfn fmin(float3 x, float3 y);
+float4 __ovld __cnfn fmin(float4 x, float4 y);
+float8 __ovld __cnfn fmin(float8 x, float8 y);
+float16 __ovld __cnfn fmin(float16 x, float16 y);
+float2 __ovld __cnfn fmin(float2 x, float y);
+float3 __ovld __cnfn fmin(float3 x, float y);
+float4 __ovld __cnfn fmin(float4 x, float y);
+float8 __ovld __cnfn fmin(float8 x, float y);
+float16 __ovld __cnfn fmin(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fmin(double x, double y);
+double2 __ovld __cnfn fmin(double2 x, double2 y);
+double3 __ovld __cnfn fmin(double3 x, double3 y);
+double4 __ovld __cnfn fmin(double4 x, double4 y);
+double8 __ovld __cnfn fmin(double8 x, double8 y);
+double16 __ovld __cnfn fmin(double16 x, double16 y);
+double2 __ovld __cnfn fmin(double2 x, double y);
+double3 __ovld __cnfn fmin(double3 x, double y);
+double4 __ovld __cnfn fmin(double4 x, double y);
+double8 __ovld __cnfn fmin(double8 x, double y);
+double16 __ovld __cnfn fmin(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fmin(half x, half y);
+half2 __ovld __cnfn fmin(half2 x, half2 y);
+half3 __ovld __cnfn fmin(half3 x, half3 y);
+half4 __ovld __cnfn fmin(half4 x, half4 y);
+half8 __ovld __cnfn fmin(half8 x, half8 y);
+half16 __ovld __cnfn fmin(half16 x, half16 y);
+half2 __ovld __cnfn fmin(half2 x, half y);
+half3 __ovld __cnfn fmin(half3 x, half y);
+half4 __ovld __cnfn fmin(half4 x, half y);
+half8 __ovld __cnfn fmin(half8 x, half y);
+half16 __ovld __cnfn fmin(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Modulus. Returns x - y * trunc (x/y).
+ */
+float __ovld __cnfn fmod(float x, float y);
+float2 __ovld __cnfn fmod(float2 x, float2 y);
+float3 __ovld __cnfn fmod(float3 x, float3 y);
+float4 __ovld __cnfn fmod(float4 x, float4 y);
+float8 __ovld __cnfn fmod(float8 x, float8 y);
+float16 __ovld __cnfn fmod(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fmod(double x, double y);
+double2 __ovld __cnfn fmod(double2 x, double2 y);
+double3 __ovld __cnfn fmod(double3 x, double3 y);
+double4 __ovld __cnfn fmod(double4 x, double4 y);
+double8 __ovld __cnfn fmod(double8 x, double8 y);
+double16 __ovld __cnfn fmod(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fmod(half x, half y);
+half2 __ovld __cnfn fmod(half2 x, half2 y);
+half3 __ovld __cnfn fmod(half3 x, half3 y);
+half4 __ovld __cnfn fmod(half4 x, half4 y);
+half8 __ovld __cnfn fmod(half8 x, half8 y);
+half16 __ovld __cnfn fmod(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns fmin(x - floor (x), 0x1.fffffep-1f ).
+ * floor(x) is returned in iptr.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld fract(float x, float *iptr);
+float2 __ovld fract(float2 x, float2 *iptr);
+float3 __ovld fract(float3 x, float3 *iptr);
+float4 __ovld fract(float4 x, float4 *iptr);
+float8 __ovld fract(float8 x, float8 *iptr);
+float16 __ovld fract(float16 x, float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld fract(double x, double *iptr);
+double2 __ovld fract(double2 x, double2 *iptr);
+double3 __ovld fract(double3 x, double3 *iptr);
+double4 __ovld fract(double4 x, double4 *iptr);
+double8 __ovld fract(double8 x, double8 *iptr);
+double16 __ovld fract(double16 x, double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld fract(half x, half *iptr);
+half2 __ovld fract(half2 x, half2 *iptr);
+half3 __ovld fract(half3 x, half3 *iptr);
+half4 __ovld fract(half4 x, half4 *iptr);
+half8 __ovld fract(half8 x, half8 *iptr);
+half16 __ovld fract(half16 x, half16 *iptr);
+#endif //cl_khr_fp16
+#else
+float __ovld fract(float x, __global float *iptr);
+float2 __ovld fract(float2 x, __global float2 *iptr);
+float3 __ovld fract(float3 x, __global float3 *iptr);
+float4 __ovld fract(float4 x, __global float4 *iptr);
+float8 __ovld fract(float8 x, __global float8 *iptr);
+float16 __ovld fract(float16 x, __global float16 *iptr);
+float __ovld fract(float x, __local float *iptr);
+float2 __ovld fract(float2 x, __local float2 *iptr);
+float3 __ovld fract(float3 x, __local float3 *iptr);
+float4 __ovld fract(float4 x, __local float4 *iptr);
+float8 __ovld fract(float8 x, __local float8 *iptr);
+float16 __ovld fract(float16 x, __local float16 *iptr);
+float __ovld fract(float x, __private float *iptr);
+float2 __ovld fract(float2 x, __private float2 *iptr);
+float3 __ovld fract(float3 x, __private float3 *iptr);
+float4 __ovld fract(float4 x, __private float4 *iptr);
+float8 __ovld fract(float8 x, __private float8 *iptr);
+float16 __ovld fract(float16 x, __private float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld fract(double x, __global double *iptr);
+double2 __ovld fract(double2 x, __global double2 *iptr);
+double3 __ovld fract(double3 x, __global double3 *iptr);
+double4 __ovld fract(double4 x, __global double4 *iptr);
+double8 __ovld fract(double8 x, __global double8 *iptr);
+double16 __ovld fract(double16 x, __global double16 *iptr);
+double __ovld fract(double x, __local double *iptr);
+double2 __ovld fract(double2 x, __local double2 *iptr);
+double3 __ovld fract(double3 x, __local double3 *iptr);
+double4 __ovld fract(double4 x, __local double4 *iptr);
+double8 __ovld fract(double8 x, __local double8 *iptr);
+double16 __ovld fract(double16 x, __local double16 *iptr);
+double __ovld fract(double x, __private double *iptr);
+double2 __ovld fract(double2 x, __private double2 *iptr);
+double3 __ovld fract(double3 x, __private double3 *iptr);
+double4 __ovld fract(double4 x, __private double4 *iptr);
+double8 __ovld fract(double8 x, __private double8 *iptr);
+double16 __ovld fract(double16 x, __private double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld fract(half x, __global half *iptr);
+half2 __ovld fract(half2 x, __global half2 *iptr);
+half3 __ovld fract(half3 x, __global half3 *iptr);
+half4 __ovld fract(half4 x, __global half4 *iptr);
+half8 __ovld fract(half8 x, __global half8 *iptr);
+half16 __ovld fract(half16 x, __global half16 *iptr);
+half __ovld fract(half x, __local half *iptr);
+half2 __ovld fract(half2 x, __local half2 *iptr);
+half3 __ovld fract(half3 x, __local half3 *iptr);
+half4 __ovld fract(half4 x, __local half4 *iptr);
+half8 __ovld fract(half8 x, __local half8 *iptr);
+half16 __ovld fract(half16 x, __local half16 *iptr);
+half __ovld fract(half x, __private half *iptr);
+half2 __ovld fract(half2 x, __private half2 *iptr);
+half3 __ovld fract(half3 x, __private half3 *iptr);
+half4 __ovld fract(half4 x, __private half4 *iptr);
+half8 __ovld fract(half8 x, __private half8 *iptr);
+half16 __ovld fract(half16 x, __private half16 *iptr);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Extract mantissa and exponent from x. For each
+ * component the mantissa returned is a float with
+ * magnitude in the interval [1/2, 1) or 0. Each
+ * component of x equals mantissa returned * 2^exp.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld frexp(float x, int *exp);
+float2 __ovld frexp(float2 x, int2 *exp);
+float3 __ovld frexp(float3 x, int3 *exp);
+float4 __ovld frexp(float4 x, int4 *exp);
+float8 __ovld frexp(float8 x, int8 *exp);
+float16 __ovld frexp(float16 x, int16 *exp);
+#ifdef cl_khr_fp64
+double __ovld frexp(double x, int *exp);
+double2 __ovld frexp(double2 x, int2 *exp);
+double3 __ovld frexp(double3 x, int3 *exp);
+double4 __ovld frexp(double4 x, int4 *exp);
+double8 __ovld frexp(double8 x, int8 *exp);
+double16 __ovld frexp(double16 x, int16 *exp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld frexp(half x, int *exp);
+half2 __ovld frexp(half2 x, int2 *exp);
+half3 __ovld frexp(half3 x, int3 *exp);
+half4 __ovld frexp(half4 x, int4 *exp);
+half8 __ovld frexp(half8 x, int8 *exp);
+half16 __ovld frexp(half16 x, int16 *exp);
+#endif //cl_khr_fp16
+#else
+float __ovld frexp(float x, __global int *exp);
+float2 __ovld frexp(float2 x, __global int2 *exp);
+float3 __ovld frexp(float3 x, __global int3 *exp);
+float4 __ovld frexp(float4 x, __global int4 *exp);
+float8 __ovld frexp(float8 x, __global int8 *exp);
+float16 __ovld frexp(float16 x, __global int16 *exp);
+float __ovld frexp(float x, __local int *exp);
+float2 __ovld frexp(float2 x, __local int2 *exp);
+float3 __ovld frexp(float3 x, __local int3 *exp);
+float4 __ovld frexp(float4 x, __local int4 *exp);
+float8 __ovld frexp(float8 x, __local int8 *exp);
+float16 __ovld frexp(float16 x, __local int16 *exp);
+float __ovld frexp(float x, __private int *exp);
+float2 __ovld frexp(float2 x, __private int2 *exp);
+float3 __ovld frexp(float3 x, __private int3 *exp);
+float4 __ovld frexp(float4 x, __private int4 *exp);
+float8 __ovld frexp(float8 x, __private int8 *exp);
+float16 __ovld frexp(float16 x, __private int16 *exp);
+#ifdef cl_khr_fp64
+double __ovld frexp(double x, __global int *exp);
+double2 __ovld frexp(double2 x, __global int2 *exp);
+double3 __ovld frexp(double3 x, __global int3 *exp);
+double4 __ovld frexp(double4 x, __global int4 *exp);
+double8 __ovld frexp(double8 x, __global int8 *exp);
+double16 __ovld frexp(double16 x, __global int16 *exp);
+double __ovld frexp(double x, __local int *exp);
+double2 __ovld frexp(double2 x, __local int2 *exp);
+double3 __ovld frexp(double3 x, __local int3 *exp);
+double4 __ovld frexp(double4 x, __local int4 *exp);
+double8 __ovld frexp(double8 x, __local int8 *exp);
+double16 __ovld frexp(double16 x, __local int16 *exp);
+double __ovld frexp(double x, __private int *exp);
+double2 __ovld frexp(double2 x, __private int2 *exp);
+double3 __ovld frexp(double3 x, __private int3 *exp);
+double4 __ovld frexp(double4 x, __private int4 *exp);
+double8 __ovld frexp(double8 x, __private int8 *exp);
+double16 __ovld frexp(double16 x, __private int16 *exp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld frexp(half x, __global int *exp);
+half2 __ovld frexp(half2 x, __global int2 *exp);
+half3 __ovld frexp(half3 x, __global int3 *exp);
+half4 __ovld frexp(half4 x, __global int4 *exp);
+half8 __ovld frexp(half8 x, __global int8 *exp);
+half16 __ovld frexp(half16 x, __global int16 *exp);
+half __ovld frexp(half x, __local int *exp);
+half2 __ovld frexp(half2 x, __local int2 *exp);
+half3 __ovld frexp(half3 x, __local int3 *exp);
+half4 __ovld frexp(half4 x, __local int4 *exp);
+half8 __ovld frexp(half8 x, __local int8 *exp);
+half16 __ovld frexp(half16 x, __local int16 *exp);
+half __ovld frexp(half x, __private int *exp);
+half2 __ovld frexp(half2 x, __private int2 *exp);
+half3 __ovld frexp(half3 x, __private int3 *exp);
+half4 __ovld frexp(half4 x, __private int4 *exp);
+half8 __ovld frexp(half8 x, __private int8 *exp);
+half16 __ovld frexp(half16 x, __private int16 *exp);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Compute the value of the square root of x^2 + y^2
+ * without undue overflow or underflow.
+ */
+float __ovld __cnfn hypot(float x, float y);
+float2 __ovld __cnfn hypot(float2 x, float2 y);
+float3 __ovld __cnfn hypot(float3 x, float3 y);
+float4 __ovld __cnfn hypot(float4 x, float4 y);
+float8 __ovld __cnfn hypot(float8 x, float8 y);
+float16 __ovld __cnfn hypot(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn hypot(double x, double y);
+double2 __ovld __cnfn hypot(double2 x, double2 y);
+double3 __ovld __cnfn hypot(double3 x, double3 y);
+double4 __ovld __cnfn hypot(double4 x, double4 y);
+double8 __ovld __cnfn hypot(double8 x, double8 y);
+double16 __ovld __cnfn hypot(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn hypot(half x, half y);
+half2 __ovld __cnfn hypot(half2 x, half2 y);
+half3 __ovld __cnfn hypot(half3 x, half3 y);
+half4 __ovld __cnfn hypot(half4 x, half4 y);
+half8 __ovld __cnfn hypot(half8 x, half8 y);
+half16 __ovld __cnfn hypot(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Return the exponent as an integer value.
+ */
+int __ovld __cnfn ilogb(float x);
+int2 __ovld __cnfn ilogb(float2 x);
+int3 __ovld __cnfn ilogb(float3 x);
+int4 __ovld __cnfn ilogb(float4 x);
+int8 __ovld __cnfn ilogb(float8 x);
+int16 __ovld __cnfn ilogb(float16 x);
+#ifdef cl_khr_fp64
+int __ovld __cnfn ilogb(double x);
+int2 __ovld __cnfn ilogb(double2 x);
+int3 __ovld __cnfn ilogb(double3 x);
+int4 __ovld __cnfn ilogb(double4 x);
+int8 __ovld __cnfn ilogb(double8 x);
+int16 __ovld __cnfn ilogb(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn ilogb(half x);
+int2 __ovld __cnfn ilogb(half2 x);
+int3 __ovld __cnfn ilogb(half3 x);
+int4 __ovld __cnfn ilogb(half4 x);
+int8 __ovld __cnfn ilogb(half8 x);
+int16 __ovld __cnfn ilogb(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Multiply x by 2 to the power n.
+ */
+float __ovld __cnfn ldexp(float x, int n);
+float2 __ovld __cnfn ldexp(float2 x, int2 n);
+float3 __ovld __cnfn ldexp(float3 x, int3 n);
+float4 __ovld __cnfn ldexp(float4 x, int4 n);
+float8 __ovld __cnfn ldexp(float8 x, int8 n);
+float16 __ovld __cnfn ldexp(float16 x, int16 n);
+float2 __ovld __cnfn ldexp(float2 x, int n);
+float3 __ovld __cnfn ldexp(float3 x, int n);
+float4 __ovld __cnfn ldexp(float4 x, int n);
+float8 __ovld __cnfn ldexp(float8 x, int n);
+float16 __ovld __cnfn ldexp(float16 x, int n);
+#ifdef cl_khr_fp64
+double __ovld __cnfn ldexp(double x, int n);
+double2 __ovld __cnfn ldexp(double2 x, int2 n);
+double3 __ovld __cnfn ldexp(double3 x, int3 n);
+double4 __ovld __cnfn ldexp(double4 x, int4 n);
+double8 __ovld __cnfn ldexp(double8 x, int8 n);
+double16 __ovld __cnfn ldexp(double16 x, int16 n);
+double2 __ovld __cnfn ldexp(double2 x, int n);
+double3 __ovld __cnfn ldexp(double3 x, int n);
+double4 __ovld __cnfn ldexp(double4 x, int n);
+double8 __ovld __cnfn ldexp(double8 x, int n);
+double16 __ovld __cnfn ldexp(double16 x, int n);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn ldexp(half x, int n);
+half2 __ovld __cnfn ldexp(half2 x, int2 n);
+half3 __ovld __cnfn ldexp(half3 x, int3 n);
+half4 __ovld __cnfn ldexp(half4 x, int4 n);
+half8 __ovld __cnfn ldexp(half8 x, int8 n);
+half16 __ovld __cnfn ldexp(half16 x, int16 n);
+half2 __ovld __cnfn ldexp(half2 x, int n);
+half3 __ovld __cnfn ldexp(half3 x, int n);
+half4 __ovld __cnfn ldexp(half4 x, int n);
+half8 __ovld __cnfn ldexp(half8 x, int n);
+half16 __ovld __cnfn ldexp(half16 x, int n);
+#endif //cl_khr_fp16
+
+/**
+ * Log gamma function. Returns the natural
+ * logarithm of the absolute value of the gamma
+ * function. The sign of the gamma function is
+ * returned in the signp argument of lgamma_r.
+ */
+float __ovld __cnfn lgamma(float x);
+float2 __ovld __cnfn lgamma(float2 x);
+float3 __ovld __cnfn lgamma(float3 x);
+float4 __ovld __cnfn lgamma(float4 x);
+float8 __ovld __cnfn lgamma(float8 x);
+float16 __ovld __cnfn lgamma(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn lgamma(double x);
+double2 __ovld __cnfn lgamma(double2 x);
+double3 __ovld __cnfn lgamma(double3 x);
+double4 __ovld __cnfn lgamma(double4 x);
+double8 __ovld __cnfn lgamma(double8 x);
+double16 __ovld __cnfn lgamma(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn lgamma(half x);
+half2 __ovld __cnfn lgamma(half2 x);
+half3 __ovld __cnfn lgamma(half3 x);
+half4 __ovld __cnfn lgamma(half4 x);
+half8 __ovld __cnfn lgamma(half8 x);
+half16 __ovld __cnfn lgamma(half16 x);
+#endif //cl_khr_fp16
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld lgamma_r(float x, int *signp);
+float2 __ovld lgamma_r(float2 x, int2 *signp);
+float3 __ovld lgamma_r(float3 x, int3 *signp);
+float4 __ovld lgamma_r(float4 x, int4 *signp);
+float8 __ovld lgamma_r(float8 x, int8 *signp);
+float16 __ovld lgamma_r(float16 x, int16 *signp);
+#ifdef cl_khr_fp64
+double __ovld lgamma_r(double x, int *signp);
+double2 __ovld lgamma_r(double2 x, int2 *signp);
+double3 __ovld lgamma_r(double3 x, int3 *signp);
+double4 __ovld lgamma_r(double4 x, int4 *signp);
+double8 __ovld lgamma_r(double8 x, int8 *signp);
+double16 __ovld lgamma_r(double16 x, int16 *signp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld lgamma_r(half x, int *signp);
+half2 __ovld lgamma_r(half2 x, int2 *signp);
+half3 __ovld lgamma_r(half3 x, int3 *signp);
+half4 __ovld lgamma_r(half4 x, int4 *signp);
+half8 __ovld lgamma_r(half8 x, int8 *signp);
+half16 __ovld lgamma_r(half16 x, int16 *signp);
+#endif //cl_khr_fp16
+#else
+float __ovld lgamma_r(float x, __global int *signp);
+float2 __ovld lgamma_r(float2 x, __global int2 *signp);
+float3 __ovld lgamma_r(float3 x, __global int3 *signp);
+float4 __ovld lgamma_r(float4 x, __global int4 *signp);
+float8 __ovld lgamma_r(float8 x, __global int8 *signp);
+float16 __ovld lgamma_r(float16 x, __global int16 *signp);
+float __ovld lgamma_r(float x, __local int *signp);
+float2 __ovld lgamma_r(float2 x, __local int2 *signp);
+float3 __ovld lgamma_r(float3 x, __local int3 *signp);
+float4 __ovld lgamma_r(float4 x, __local int4 *signp);
+float8 __ovld lgamma_r(float8 x, __local int8 *signp);
+float16 __ovld lgamma_r(float16 x, __local int16 *signp);
+float __ovld lgamma_r(float x, __private int *signp);
+float2 __ovld lgamma_r(float2 x, __private int2 *signp);
+float3 __ovld lgamma_r(float3 x, __private int3 *signp);
+float4 __ovld lgamma_r(float4 x, __private int4 *signp);
+float8 __ovld lgamma_r(float8 x, __private int8 *signp);
+float16 __ovld lgamma_r(float16 x, __private int16 *signp);
+#ifdef cl_khr_fp64
+double __ovld lgamma_r(double x, __global int *signp);
+double2 __ovld lgamma_r(double2 x, __global int2 *signp);
+double3 __ovld lgamma_r(double3 x, __global int3 *signp);
+double4 __ovld lgamma_r(double4 x, __global int4 *signp);
+double8 __ovld lgamma_r(double8 x, __global int8 *signp);
+double16 __ovld lgamma_r(double16 x, __global int16 *signp);
+double __ovld lgamma_r(double x, __local int *signp);
+double2 __ovld lgamma_r(double2 x, __local int2 *signp);
+double3 __ovld lgamma_r(double3 x, __local int3 *signp);
+double4 __ovld lgamma_r(double4 x, __local int4 *signp);
+double8 __ovld lgamma_r(double8 x, __local int8 *signp);
+double16 __ovld lgamma_r(double16 x, __local int16 *signp);
+double __ovld lgamma_r(double x, __private int *signp);
+double2 __ovld lgamma_r(double2 x, __private int2 *signp);
+double3 __ovld lgamma_r(double3 x, __private int3 *signp);
+double4 __ovld lgamma_r(double4 x, __private int4 *signp);
+double8 __ovld lgamma_r(double8 x, __private int8 *signp);
+double16 __ovld lgamma_r(double16 x, __private int16 *signp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld lgamma_r(half x, __global int *signp);
+half2 __ovld lgamma_r(half2 x, __global int2 *signp);
+half3 __ovld lgamma_r(half3 x, __global int3 *signp);
+half4 __ovld lgamma_r(half4 x, __global int4 *signp);
+half8 __ovld lgamma_r(half8 x, __global int8 *signp);
+half16 __ovld lgamma_r(half16 x, __global int16 *signp);
+half __ovld lgamma_r(half x, __local int *signp);
+half2 __ovld lgamma_r(half2 x, __local int2 *signp);
+half3 __ovld lgamma_r(half3 x, __local int3 *signp);
+half4 __ovld lgamma_r(half4 x, __local int4 *signp);
+half8 __ovld lgamma_r(half8 x, __local int8 *signp);
+half16 __ovld lgamma_r(half16 x, __local int16 *signp);
+half __ovld lgamma_r(half x, __private int *signp);
+half2 __ovld lgamma_r(half2 x, __private int2 *signp);
+half3 __ovld lgamma_r(half3 x, __private int3 *signp);
+half4 __ovld lgamma_r(half4 x, __private int4 *signp);
+half8 __ovld lgamma_r(half8 x, __private int8 *signp);
+half16 __ovld lgamma_r(half16 x, __private int16 *signp);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Compute natural logarithm.
+ */
+float __ovld __cnfn log(float);
+float2 __ovld __cnfn log(float2);
+float3 __ovld __cnfn log(float3);
+float4 __ovld __cnfn log(float4);
+float8 __ovld __cnfn log(float8);
+float16 __ovld __cnfn log(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log(double);
+double2 __ovld __cnfn log(double2);
+double3 __ovld __cnfn log(double3);
+double4 __ovld __cnfn log(double4);
+double8 __ovld __cnfn log(double8);
+double16 __ovld __cnfn log(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log(half);
+half2 __ovld __cnfn log(half2);
+half3 __ovld __cnfn log(half3);
+half4 __ovld __cnfn log(half4);
+half8 __ovld __cnfn log(half8);
+half16 __ovld __cnfn log(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute a base 2 logarithm.
+ */
+float __ovld __cnfn log2(float);
+float2 __ovld __cnfn log2(float2);
+float3 __ovld __cnfn log2(float3);
+float4 __ovld __cnfn log2(float4);
+float8 __ovld __cnfn log2(float8);
+float16 __ovld __cnfn log2(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log2(double);
+double2 __ovld __cnfn log2(double2);
+double3 __ovld __cnfn log2(double3);
+double4 __ovld __cnfn log2(double4);
+double8 __ovld __cnfn log2(double8);
+double16 __ovld __cnfn log2(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log2(half);
+half2 __ovld __cnfn log2(half2);
+half3 __ovld __cnfn log2(half3);
+half4 __ovld __cnfn log2(half4);
+half8 __ovld __cnfn log2(half8);
+half16 __ovld __cnfn log2(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute a base 10 logarithm.
+ */
+float __ovld __cnfn log10(float);
+float2 __ovld __cnfn log10(float2);
+float3 __ovld __cnfn log10(float3);
+float4 __ovld __cnfn log10(float4);
+float8 __ovld __cnfn log10(float8);
+float16 __ovld __cnfn log10(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log10(double);
+double2 __ovld __cnfn log10(double2);
+double3 __ovld __cnfn log10(double3);
+double4 __ovld __cnfn log10(double4);
+double8 __ovld __cnfn log10(double8);
+double16 __ovld __cnfn log10(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log10(half);
+half2 __ovld __cnfn log10(half2);
+half3 __ovld __cnfn log10(half3);
+half4 __ovld __cnfn log10(half4);
+half8 __ovld __cnfn log10(half8);
+half16 __ovld __cnfn log10(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute a base e logarithm of (1.0 + x).
+ */
+float __ovld __cnfn log1p(float x);
+float2 __ovld __cnfn log1p(float2 x);
+float3 __ovld __cnfn log1p(float3 x);
+float4 __ovld __cnfn log1p(float4 x);
+float8 __ovld __cnfn log1p(float8 x);
+float16 __ovld __cnfn log1p(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log1p(double x);
+double2 __ovld __cnfn log1p(double2 x);
+double3 __ovld __cnfn log1p(double3 x);
+double4 __ovld __cnfn log1p(double4 x);
+double8 __ovld __cnfn log1p(double8 x);
+double16 __ovld __cnfn log1p(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log1p(half x);
+half2 __ovld __cnfn log1p(half2 x);
+half3 __ovld __cnfn log1p(half3 x);
+half4 __ovld __cnfn log1p(half4 x);
+half8 __ovld __cnfn log1p(half8 x);
+half16 __ovld __cnfn log1p(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the exponent of x, which is the integral
+ * part of logr | x |.
+ */
+float __ovld __cnfn logb(float x);
+float2 __ovld __cnfn logb(float2 x);
+float3 __ovld __cnfn logb(float3 x);
+float4 __ovld __cnfn logb(float4 x);
+float8 __ovld __cnfn logb(float8 x);
+float16 __ovld __cnfn logb(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn logb(double x);
+double2 __ovld __cnfn logb(double2 x);
+double3 __ovld __cnfn logb(double3 x);
+double4 __ovld __cnfn logb(double4 x);
+double8 __ovld __cnfn logb(double8 x);
+double16 __ovld __cnfn logb(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn logb(half x);
+half2 __ovld __cnfn logb(half2 x);
+half3 __ovld __cnfn logb(half3 x);
+half4 __ovld __cnfn logb(half4 x);
+half8 __ovld __cnfn logb(half8 x);
+half16 __ovld __cnfn logb(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * mad approximates a * b + c. Whether or how the
+ * product of a * b is rounded and how supernormal or
+ * subnormal intermediate products are handled is not
+ * defined. mad is intended to be used where speed is
+ * preferred over accuracy.
+ */
+float __ovld __cnfn mad(float a, float b, float c);
+float2 __ovld __cnfn mad(float2 a, float2 b, float2 c);
+float3 __ovld __cnfn mad(float3 a, float3 b, float3 c);
+float4 __ovld __cnfn mad(float4 a, float4 b, float4 c);
+float8 __ovld __cnfn mad(float8 a, float8 b, float8 c);
+float16 __ovld __cnfn mad(float16 a, float16 b, float16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn mad(double a, double b, double c);
+double2 __ovld __cnfn mad(double2 a, double2 b, double2 c);
+double3 __ovld __cnfn mad(double3 a, double3 b, double3 c);
+double4 __ovld __cnfn mad(double4 a, double4 b, double4 c);
+double8 __ovld __cnfn mad(double8 a, double8 b, double8 c);
+double16 __ovld __cnfn mad(double16 a, double16 b, double16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn mad(half a, half b, half c);
+half2 __ovld __cnfn mad(half2 a, half2 b, half2 c);
+half3 __ovld __cnfn mad(half3 a, half3 b, half3 c);
+half4 __ovld __cnfn mad(half4 a, half4 b, half4 c);
+half8 __ovld __cnfn mad(half8 a, half8 b, half8 c);
+half16 __ovld __cnfn mad(half16 a, half16 b, half16 c);
+#endif //cl_khr_fp16
+
+/**
+ * Returns x if | x | > | y |, y if | y | > | x |, otherwise
+ * fmax(x, y).
+ */
+float __ovld __cnfn maxmag(float x, float y);
+float2 __ovld __cnfn maxmag(float2 x, float2 y);
+float3 __ovld __cnfn maxmag(float3 x, float3 y);
+float4 __ovld __cnfn maxmag(float4 x, float4 y);
+float8 __ovld __cnfn maxmag(float8 x, float8 y);
+float16 __ovld __cnfn maxmag(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn maxmag(double x, double y);
+double2 __ovld __cnfn maxmag(double2 x, double2 y);
+double3 __ovld __cnfn maxmag(double3 x, double3 y);
+double4 __ovld __cnfn maxmag(double4 x, double4 y);
+double8 __ovld __cnfn maxmag(double8 x, double8 y);
+double16 __ovld __cnfn maxmag(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn maxmag(half x, half y);
+half2 __ovld __cnfn maxmag(half2 x, half2 y);
+half3 __ovld __cnfn maxmag(half3 x, half3 y);
+half4 __ovld __cnfn maxmag(half4 x, half4 y);
+half8 __ovld __cnfn maxmag(half8 x, half8 y);
+half16 __ovld __cnfn maxmag(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns x if | x | < | y |, y if | y | < | x |, otherwise
+ * fmin(x, y).
+ */
+float __ovld __cnfn minmag(float x, float y);
+float2 __ovld __cnfn minmag(float2 x, float2 y);
+float3 __ovld __cnfn minmag(float3 x, float3 y);
+float4 __ovld __cnfn minmag(float4 x, float4 y);
+float8 __ovld __cnfn minmag(float8 x, float8 y);
+float16 __ovld __cnfn minmag(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn minmag(double x, double y);
+double2 __ovld __cnfn minmag(double2 x, double2 y);
+double3 __ovld __cnfn minmag(double3 x, double3 y);
+double4 __ovld __cnfn minmag(double4 x, double4 y);
+double8 __ovld __cnfn minmag(double8 x, double8 y);
+double16 __ovld __cnfn minmag(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn minmag(half x, half y);
+half2 __ovld __cnfn minmag(half2 x, half2 y);
+half3 __ovld __cnfn minmag(half3 x, half3 y);
+half4 __ovld __cnfn minmag(half4 x, half4 y);
+half8 __ovld __cnfn minmag(half8 x, half8 y);
+half16 __ovld __cnfn minmag(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Decompose a floating-point number. The modf
+ * function breaks the argument x into integral and
+ * fractional parts, each of which has the same sign as
+ * the argument. It stores the integral part in the object
+ * pointed to by iptr.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld modf(float x, float *iptr);
+float2 __ovld modf(float2 x, float2 *iptr);
+float3 __ovld modf(float3 x, float3 *iptr);
+float4 __ovld modf(float4 x, float4 *iptr);
+float8 __ovld modf(float8 x, float8 *iptr);
+float16 __ovld modf(float16 x, float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld modf(double x, double *iptr);
+double2 __ovld modf(double2 x, double2 *iptr);
+double3 __ovld modf(double3 x, double3 *iptr);
+double4 __ovld modf(double4 x, double4 *iptr);
+double8 __ovld modf(double8 x, double8 *iptr);
+double16 __ovld modf(double16 x, double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld modf(half x, half *iptr);
+half2 __ovld modf(half2 x, half2 *iptr);
+half3 __ovld modf(half3 x, half3 *iptr);
+half4 __ovld modf(half4 x, half4 *iptr);
+half8 __ovld modf(half8 x, half8 *iptr);
+half16 __ovld modf(half16 x, half16 *iptr);
+#endif //cl_khr_fp16
+#else
+float __ovld modf(float x, __global float *iptr);
+float2 __ovld modf(float2 x, __global float2 *iptr);
+float3 __ovld modf(float3 x, __global float3 *iptr);
+float4 __ovld modf(float4 x, __global float4 *iptr);
+float8 __ovld modf(float8 x, __global float8 *iptr);
+float16 __ovld modf(float16 x, __global float16 *iptr);
+float __ovld modf(float x, __local float *iptr);
+float2 __ovld modf(float2 x, __local float2 *iptr);
+float3 __ovld modf(float3 x, __local float3 *iptr);
+float4 __ovld modf(float4 x, __local float4 *iptr);
+float8 __ovld modf(float8 x, __local float8 *iptr);
+float16 __ovld modf(float16 x, __local float16 *iptr);
+float __ovld modf(float x, __private float *iptr);
+float2 __ovld modf(float2 x, __private float2 *iptr);
+float3 __ovld modf(float3 x, __private float3 *iptr);
+float4 __ovld modf(float4 x, __private float4 *iptr);
+float8 __ovld modf(float8 x, __private float8 *iptr);
+float16 __ovld modf(float16 x, __private float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld modf(double x, __global double *iptr);
+double2 __ovld modf(double2 x, __global double2 *iptr);
+double3 __ovld modf(double3 x, __global double3 *iptr);
+double4 __ovld modf(double4 x, __global double4 *iptr);
+double8 __ovld modf(double8 x, __global double8 *iptr);
+double16 __ovld modf(double16 x, __global double16 *iptr);
+double __ovld modf(double x, __local double *iptr);
+double2 __ovld modf(double2 x, __local double2 *iptr);
+double3 __ovld modf(double3 x, __local double3 *iptr);
+double4 __ovld modf(double4 x, __local double4 *iptr);
+double8 __ovld modf(double8 x, __local double8 *iptr);
+double16 __ovld modf(double16 x, __local double16 *iptr);
+double __ovld modf(double x, __private double *iptr);
+double2 __ovld modf(double2 x, __private double2 *iptr);
+double3 __ovld modf(double3 x, __private double3 *iptr);
+double4 __ovld modf(double4 x, __private double4 *iptr);
+double8 __ovld modf(double8 x, __private double8 *iptr);
+double16 __ovld modf(double16 x, __private double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld modf(half x, __global half *iptr);
+half2 __ovld modf(half2 x, __global half2 *iptr);
+half3 __ovld modf(half3 x, __global half3 *iptr);
+half4 __ovld modf(half4 x, __global half4 *iptr);
+half8 __ovld modf(half8 x, __global half8 *iptr);
+half16 __ovld modf(half16 x, __global half16 *iptr);
+half __ovld modf(half x, __local half *iptr);
+half2 __ovld modf(half2 x, __local half2 *iptr);
+half3 __ovld modf(half3 x, __local half3 *iptr);
+half4 __ovld modf(half4 x, __local half4 *iptr);
+half8 __ovld modf(half8 x, __local half8 *iptr);
+half16 __ovld modf(half16 x, __local half16 *iptr);
+half __ovld modf(half x, __private half *iptr);
+half2 __ovld modf(half2 x, __private half2 *iptr);
+half3 __ovld modf(half3 x, __private half3 *iptr);
+half4 __ovld modf(half4 x, __private half4 *iptr);
+half8 __ovld modf(half8 x, __private half8 *iptr);
+half16 __ovld modf(half16 x, __private half16 *iptr);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Returns a quiet NaN. The nancode may be placed
+ * in the significand of the resulting NaN.
+ */
+float __ovld __cnfn nan(uint nancode);
+float2 __ovld __cnfn nan(uint2 nancode);
+float3 __ovld __cnfn nan(uint3 nancode);
+float4 __ovld __cnfn nan(uint4 nancode);
+float8 __ovld __cnfn nan(uint8 nancode);
+float16 __ovld __cnfn nan(uint16 nancode);
+#ifdef cl_khr_fp64
+double __ovld __cnfn nan(ulong nancode);
+double2 __ovld __cnfn nan(ulong2 nancode);
+double3 __ovld __cnfn nan(ulong3 nancode);
+double4 __ovld __cnfn nan(ulong4 nancode);
+double8 __ovld __cnfn nan(ulong8 nancode);
+double16 __ovld __cnfn nan(ulong16 nancode);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn nan(ushort nancode);
+half2 __ovld __cnfn nan(ushort2 nancode);
+half3 __ovld __cnfn nan(ushort3 nancode);
+half4 __ovld __cnfn nan(ushort4 nancode);
+half8 __ovld __cnfn nan(ushort8 nancode);
+half16 __ovld __cnfn nan(ushort16 nancode);
+#endif //cl_khr_fp16
+
+/**
+ * Computes the next representable single-precision
+ * floating-point value following x in the direction of
+ * y. Thus, if y is less than x, nextafter() returns the
+ * largest representable floating-point number less
+ * than x.
+ */
+float __ovld __cnfn nextafter(float x, float y);
+float2 __ovld __cnfn nextafter(float2 x, float2 y);
+float3 __ovld __cnfn nextafter(float3 x, float3 y);
+float4 __ovld __cnfn nextafter(float4 x, float4 y);
+float8 __ovld __cnfn nextafter(float8 x, float8 y);
+float16 __ovld __cnfn nextafter(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn nextafter(double x, double y);
+double2 __ovld __cnfn nextafter(double2 x, double2 y);
+double3 __ovld __cnfn nextafter(double3 x, double3 y);
+double4 __ovld __cnfn nextafter(double4 x, double4 y);
+double8 __ovld __cnfn nextafter(double8 x, double8 y);
+double16 __ovld __cnfn nextafter(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn nextafter(half x, half y);
+half2 __ovld __cnfn nextafter(half2 x, half2 y);
+half3 __ovld __cnfn nextafter(half3 x, half3 y);
+half4 __ovld __cnfn nextafter(half4 x, half4 y);
+half8 __ovld __cnfn nextafter(half8 x, half8 y);
+half16 __ovld __cnfn nextafter(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power y.
+ */
+float __ovld __cnfn pow(float x, float y);
+float2 __ovld __cnfn pow(float2 x, float2 y);
+float3 __ovld __cnfn pow(float3 x, float3 y);
+float4 __ovld __cnfn pow(float4 x, float4 y);
+float8 __ovld __cnfn pow(float8 x, float8 y);
+float16 __ovld __cnfn pow(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn pow(double x, double y);
+double2 __ovld __cnfn pow(double2 x, double2 y);
+double3 __ovld __cnfn pow(double3 x, double3 y);
+double4 __ovld __cnfn pow(double4 x, double4 y);
+double8 __ovld __cnfn pow(double8 x, double8 y);
+double16 __ovld __cnfn pow(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn pow(half x, half y);
+half2 __ovld __cnfn pow(half2 x, half2 y);
+half3 __ovld __cnfn pow(half3 x, half3 y);
+half4 __ovld __cnfn pow(half4 x, half4 y);
+half8 __ovld __cnfn pow(half8 x, half8 y);
+half16 __ovld __cnfn pow(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power y, where y is an integer.
+ */
+float __ovld __cnfn pown(float x, int y);
+float2 __ovld __cnfn pown(float2 x, int2 y);
+float3 __ovld __cnfn pown(float3 x, int3 y);
+float4 __ovld __cnfn pown(float4 x, int4 y);
+float8 __ovld __cnfn pown(float8 x, int8 y);
+float16 __ovld __cnfn pown(float16 x, int16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn pown(double x, int y);
+double2 __ovld __cnfn pown(double2 x, int2 y);
+double3 __ovld __cnfn pown(double3 x, int3 y);
+double4 __ovld __cnfn pown(double4 x, int4 y);
+double8 __ovld __cnfn pown(double8 x, int8 y);
+double16 __ovld __cnfn pown(double16 x, int16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn pown(half x, int y);
+half2 __ovld __cnfn pown(half2 x, int2 y);
+half3 __ovld __cnfn pown(half3 x, int3 y);
+half4 __ovld __cnfn pown(half4 x, int4 y);
+half8 __ovld __cnfn pown(half8 x, int8 y);
+half16 __ovld __cnfn pown(half16 x, int16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power y, where x is >= 0.
+ */
+float __ovld __cnfn powr(float x, float y);
+float2 __ovld __cnfn powr(float2 x, float2 y);
+float3 __ovld __cnfn powr(float3 x, float3 y);
+float4 __ovld __cnfn powr(float4 x, float4 y);
+float8 __ovld __cnfn powr(float8 x, float8 y);
+float16 __ovld __cnfn powr(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn powr(double x, double y);
+double2 __ovld __cnfn powr(double2 x, double2 y);
+double3 __ovld __cnfn powr(double3 x, double3 y);
+double4 __ovld __cnfn powr(double4 x, double4 y);
+double8 __ovld __cnfn powr(double8 x, double8 y);
+double16 __ovld __cnfn powr(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn powr(half x, half y);
+half2 __ovld __cnfn powr(half2 x, half2 y);
+half3 __ovld __cnfn powr(half3 x, half3 y);
+half4 __ovld __cnfn powr(half4 x, half4 y);
+half8 __ovld __cnfn powr(half8 x, half8 y);
+half16 __ovld __cnfn powr(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the value r such that r = x - n*y, where n
+ * is the integer nearest the exact value of x/y. If there
+ * are two integers closest to x/y, n shall be the even
+ * one. If r is zero, it is given the same sign as x.
+ */
+float __ovld __cnfn remainder(float x, float y);
+float2 __ovld __cnfn remainder(float2 x, float2 y);
+float3 __ovld __cnfn remainder(float3 x, float3 y);
+float4 __ovld __cnfn remainder(float4 x, float4 y);
+float8 __ovld __cnfn remainder(float8 x, float8 y);
+float16 __ovld __cnfn remainder(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn remainder(double x, double y);
+double2 __ovld __cnfn remainder(double2 x, double2 y);
+double3 __ovld __cnfn remainder(double3 x, double3 y);
+double4 __ovld __cnfn remainder(double4 x, double4 y);
+double8 __ovld __cnfn remainder(double8 x, double8 y);
+double16 __ovld __cnfn remainder(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn remainder(half x, half y);
+half2 __ovld __cnfn remainder(half2 x, half2 y);
+half3 __ovld __cnfn remainder(half3 x, half3 y);
+half4 __ovld __cnfn remainder(half4 x, half4 y);
+half8 __ovld __cnfn remainder(half8 x, half8 y);
+half16 __ovld __cnfn remainder(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * The remquo function computes the value r such
+ * that r = x - n*y, where n is the integer nearest the
+ * exact value of x/y. If there are two integers closest
+ * to x/y, n shall be the even one. If r is zero, it is
+ * given the same sign as x. This is the same value
+ * that is returned by the remainder function.
+ * remquo also calculates the lower seven bits of the
+ * integral quotient x/y, and gives that value the same
+ * sign as x/y. It stores this signed value in the object
+ * pointed to by quo.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld remquo(float x, float y, int *quo);
+float2 __ovld remquo(float2 x, float2 y, int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, int16 *quo);
+#ifdef cl_khr_fp64
+double __ovld remquo(double x, double y, int *quo);
+double2 __ovld remquo(double2 x, double2 y, int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, int16 *quo);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld remquo(half x, half y, int *quo);
+half2 __ovld remquo(half2 x, half2 y, int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, int16 *quo);
+
+#endif //cl_khr_fp16
+#else
+float __ovld remquo(float x, float y, __global int *quo);
+float2 __ovld remquo(float2 x, float2 y, __global int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, __global int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, __global int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, __global int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, __global int16 *quo);
+float __ovld remquo(float x, float y, __local int *quo);
+float2 __ovld remquo(float2 x, float2 y, __local int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, __local int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, __local int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, __local int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, __local int16 *quo);
+float __ovld remquo(float x, float y, __private int *quo);
+float2 __ovld remquo(float2 x, float2 y, __private int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, __private int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, __private int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, __private int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, __private int16 *quo);
+#ifdef cl_khr_fp64
+double __ovld remquo(double x, double y, __global int *quo);
+double2 __ovld remquo(double2 x, double2 y, __global int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, __global int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, __global int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, __global int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, __global int16 *quo);
+double __ovld remquo(double x, double y, __local int *quo);
+double2 __ovld remquo(double2 x, double2 y, __local int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, __local int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, __local int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, __local int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, __local int16 *quo);
+double __ovld remquo(double x, double y, __private int *quo);
+double2 __ovld remquo(double2 x, double2 y, __private int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, __private int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, __private int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, __private int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, __private int16 *quo);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld remquo(half x, half y, __global int *quo);
+half2 __ovld remquo(half2 x, half2 y, __global int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, __global int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, __global int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, __global int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, __global int16 *quo);
+half __ovld remquo(half x, half y, __local int *quo);
+half2 __ovld remquo(half2 x, half2 y, __local int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, __local int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, __local int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, __local int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, __local int16 *quo);
+half __ovld remquo(half x, half y, __private int *quo);
+half2 __ovld remquo(half2 x, half2 y, __private int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, __private int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, __private int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, __private int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, __private int16 *quo);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+/**
+ * Round to integral value (using round to nearest
+ * even rounding mode) in floating-point format.
+ * Refer to section 7.1 for description of rounding
+ * modes.
+ */
+float __ovld __cnfn rint(float);
+float2 __ovld __cnfn rint(float2);
+float3 __ovld __cnfn rint(float3);
+float4 __ovld __cnfn rint(float4);
+float8 __ovld __cnfn rint(float8);
+float16 __ovld __cnfn rint(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn rint(double);
+double2 __ovld __cnfn rint(double2);
+double3 __ovld __cnfn rint(double3);
+double4 __ovld __cnfn rint(double4);
+double8 __ovld __cnfn rint(double8);
+double16 __ovld __cnfn rint(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn rint(half);
+half2 __ovld __cnfn rint(half2);
+half3 __ovld __cnfn rint(half3);
+half4 __ovld __cnfn rint(half4);
+half8 __ovld __cnfn rint(half8);
+half16 __ovld __cnfn rint(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power 1/y.
+ */
+float __ovld __cnfn rootn(float x, int y);
+float2 __ovld __cnfn rootn(float2 x, int2 y);
+float3 __ovld __cnfn rootn(float3 x, int3 y);
+float4 __ovld __cnfn rootn(float4 x, int4 y);
+float8 __ovld __cnfn rootn(float8 x, int8 y);
+float16 __ovld __cnfn rootn(float16 x, int16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn rootn(double x, int y);
+double2 __ovld __cnfn rootn(double2 x, int2 y);
+double3 __ovld __cnfn rootn(double3 x, int3 y);
+double4 __ovld __cnfn rootn(double4 x, int4 y);
+double8 __ovld __cnfn rootn(double8 x, int8 y);
+double16 __ovld __cnfn rootn(double16 x, int16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn rootn(half x, int y);
+half2 __ovld __cnfn rootn(half2 x, int2 y);
+half3 __ovld __cnfn rootn(half3 x, int3 y);
+half4 __ovld __cnfn rootn(half4 x, int4 y);
+half8 __ovld __cnfn rootn(half8 x, int8 y);
+half16 __ovld __cnfn rootn(half16 x, int16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Return the integral value nearest to x rounding
+ * halfway cases away from zero, regardless of the
+ * current rounding direction.
+ */
+float __ovld __cnfn round(float x);
+float2 __ovld __cnfn round(float2 x);
+float3 __ovld __cnfn round(float3 x);
+float4 __ovld __cnfn round(float4 x);
+float8 __ovld __cnfn round(float8 x);
+float16 __ovld __cnfn round(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn round(double x);
+double2 __ovld __cnfn round(double2 x);
+double3 __ovld __cnfn round(double3 x);
+double4 __ovld __cnfn round(double4 x);
+double8 __ovld __cnfn round(double8 x);
+double16 __ovld __cnfn round(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn round(half x);
+half2 __ovld __cnfn round(half2 x);
+half3 __ovld __cnfn round(half3 x);
+half4 __ovld __cnfn round(half4 x);
+half8 __ovld __cnfn round(half8 x);
+half16 __ovld __cnfn round(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute inverse square root.
+ */
+float __ovld __cnfn rsqrt(float);
+float2 __ovld __cnfn rsqrt(float2);
+float3 __ovld __cnfn rsqrt(float3);
+float4 __ovld __cnfn rsqrt(float4);
+float8 __ovld __cnfn rsqrt(float8);
+float16 __ovld __cnfn rsqrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn rsqrt(double);
+double2 __ovld __cnfn rsqrt(double2);
+double3 __ovld __cnfn rsqrt(double3);
+double4 __ovld __cnfn rsqrt(double4);
+double8 __ovld __cnfn rsqrt(double8);
+double16 __ovld __cnfn rsqrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn rsqrt(half);
+half2 __ovld __cnfn rsqrt(half2);
+half3 __ovld __cnfn rsqrt(half3);
+half4 __ovld __cnfn rsqrt(half4);
+half8 __ovld __cnfn rsqrt(half8);
+half16 __ovld __cnfn rsqrt(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute sine.
+ */
+float __ovld __cnfn sin(float);
+float2 __ovld __cnfn sin(float2);
+float3 __ovld __cnfn sin(float3);
+float4 __ovld __cnfn sin(float4);
+float8 __ovld __cnfn sin(float8);
+float16 __ovld __cnfn sin(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sin(double);
+double2 __ovld __cnfn sin(double2);
+double3 __ovld __cnfn sin(double3);
+double4 __ovld __cnfn sin(double4);
+double8 __ovld __cnfn sin(double8);
+double16 __ovld __cnfn sin(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sin(half);
+half2 __ovld __cnfn sin(half2);
+half3 __ovld __cnfn sin(half3);
+half4 __ovld __cnfn sin(half4);
+half8 __ovld __cnfn sin(half8);
+half16 __ovld __cnfn sin(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute sine and cosine of x. The computed sine
+ * is the return value and computed cosine is returned
+ * in cosval.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld sincos(float x, float *cosval);
+float2 __ovld sincos(float2 x, float2 *cosval);
+float3 __ovld sincos(float3 x, float3 *cosval);
+float4 __ovld sincos(float4 x, float4 *cosval);
+float8 __ovld sincos(float8 x, float8 *cosval);
+float16 __ovld sincos(float16 x, float16 *cosval);
+#ifdef cl_khr_fp64
+double __ovld sincos(double x, double *cosval);
+double2 __ovld sincos(double2 x, double2 *cosval);
+double3 __ovld sincos(double3 x, double3 *cosval);
+double4 __ovld sincos(double4 x, double4 *cosval);
+double8 __ovld sincos(double8 x, double8 *cosval);
+double16 __ovld sincos(double16 x, double16 *cosval);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld sincos(half x, half *cosval);
+half2 __ovld sincos(half2 x, half2 *cosval);
+half3 __ovld sincos(half3 x, half3 *cosval);
+half4 __ovld sincos(half4 x, half4 *cosval);
+half8 __ovld sincos(half8 x, half8 *cosval);
+half16 __ovld sincos(half16 x, half16 *cosval);
+#endif //cl_khr_fp16
+#else
+float __ovld sincos(float x, __global float *cosval);
+float2 __ovld sincos(float2 x, __global float2 *cosval);
+float3 __ovld sincos(float3 x, __global float3 *cosval);
+float4 __ovld sincos(float4 x, __global float4 *cosval);
+float8 __ovld sincos(float8 x, __global float8 *cosval);
+float16 __ovld sincos(float16 x, __global float16 *cosval);
+float __ovld sincos(float x, __local float *cosval);
+float2 __ovld sincos(float2 x, __local float2 *cosval);
+float3 __ovld sincos(float3 x, __local float3 *cosval);
+float4 __ovld sincos(float4 x, __local float4 *cosval);
+float8 __ovld sincos(float8 x, __local float8 *cosval);
+float16 __ovld sincos(float16 x, __local float16 *cosval);
+float __ovld sincos(float x, __private float *cosval);
+float2 __ovld sincos(float2 x, __private float2 *cosval);
+float3 __ovld sincos(float3 x, __private float3 *cosval);
+float4 __ovld sincos(float4 x, __private float4 *cosval);
+float8 __ovld sincos(float8 x, __private float8 *cosval);
+float16 __ovld sincos(float16 x, __private float16 *cosval);
+#ifdef cl_khr_fp64
+double __ovld sincos(double x, __global double *cosval);
+double2 __ovld sincos(double2 x, __global double2 *cosval);
+double3 __ovld sincos(double3 x, __global double3 *cosval);
+double4 __ovld sincos(double4 x, __global double4 *cosval);
+double8 __ovld sincos(double8 x, __global double8 *cosval);
+double16 __ovld sincos(double16 x, __global double16 *cosval);
+double __ovld sincos(double x, __local double *cosval);
+double2 __ovld sincos(double2 x, __local double2 *cosval);
+double3 __ovld sincos(double3 x, __local double3 *cosval);
+double4 __ovld sincos(double4 x, __local double4 *cosval);
+double8 __ovld sincos(double8 x, __local double8 *cosval);
+double16 __ovld sincos(double16 x, __local double16 *cosval);
+double __ovld sincos(double x, __private double *cosval);
+double2 __ovld sincos(double2 x, __private double2 *cosval);
+double3 __ovld sincos(double3 x, __private double3 *cosval);
+double4 __ovld sincos(double4 x, __private double4 *cosval);
+double8 __ovld sincos(double8 x, __private double8 *cosval);
+double16 __ovld sincos(double16 x, __private double16 *cosval);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld sincos(half x, __global half *cosval);
+half2 __ovld sincos(half2 x, __global half2 *cosval);
+half3 __ovld sincos(half3 x, __global half3 *cosval);
+half4 __ovld sincos(half4 x, __global half4 *cosval);
+half8 __ovld sincos(half8 x, __global half8 *cosval);
+half16 __ovld sincos(half16 x, __global half16 *cosval);
+half __ovld sincos(half x, __local half *cosval);
+half2 __ovld sincos(half2 x, __local half2 *cosval);
+half3 __ovld sincos(half3 x, __local half3 *cosval);
+half4 __ovld sincos(half4 x, __local half4 *cosval);
+half8 __ovld sincos(half8 x, __local half8 *cosval);
+half16 __ovld sincos(half16 x, __local half16 *cosval);
+half __ovld sincos(half x, __private half *cosval);
+half2 __ovld sincos(half2 x, __private half2 *cosval);
+half3 __ovld sincos(half3 x, __private half3 *cosval);
+half4 __ovld sincos(half4 x, __private half4 *cosval);
+half8 __ovld sincos(half8 x, __private half8 *cosval);
+half16 __ovld sincos(half16 x, __private half16 *cosval);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Compute hyperbolic sine.
+ */
+float __ovld __cnfn sinh(float);
+float2 __ovld __cnfn sinh(float2);
+float3 __ovld __cnfn sinh(float3);
+float4 __ovld __cnfn sinh(float4);
+float8 __ovld __cnfn sinh(float8);
+float16 __ovld __cnfn sinh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sinh(double);
+double2 __ovld __cnfn sinh(double2);
+double3 __ovld __cnfn sinh(double3);
+double4 __ovld __cnfn sinh(double4);
+double8 __ovld __cnfn sinh(double8);
+double16 __ovld __cnfn sinh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sinh(half);
+half2 __ovld __cnfn sinh(half2);
+half3 __ovld __cnfn sinh(half3);
+half4 __ovld __cnfn sinh(half4);
+half8 __ovld __cnfn sinh(half8);
+half16 __ovld __cnfn sinh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute sin (PI * x).
+ */
+float __ovld __cnfn sinpi(float x);
+float2 __ovld __cnfn sinpi(float2 x);
+float3 __ovld __cnfn sinpi(float3 x);
+float4 __ovld __cnfn sinpi(float4 x);
+float8 __ovld __cnfn sinpi(float8 x);
+float16 __ovld __cnfn sinpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sinpi(double x);
+double2 __ovld __cnfn sinpi(double2 x);
+double3 __ovld __cnfn sinpi(double3 x);
+double4 __ovld __cnfn sinpi(double4 x);
+double8 __ovld __cnfn sinpi(double8 x);
+double16 __ovld __cnfn sinpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sinpi(half x);
+half2 __ovld __cnfn sinpi(half2 x);
+half3 __ovld __cnfn sinpi(half3 x);
+half4 __ovld __cnfn sinpi(half4 x);
+half8 __ovld __cnfn sinpi(half8 x);
+half16 __ovld __cnfn sinpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute square root.
+ */
+float __ovld __cnfn sqrt(float);
+float2 __ovld __cnfn sqrt(float2);
+float3 __ovld __cnfn sqrt(float3);
+float4 __ovld __cnfn sqrt(float4);
+float8 __ovld __cnfn sqrt(float8);
+float16 __ovld __cnfn sqrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sqrt(double);
+double2 __ovld __cnfn sqrt(double2);
+double3 __ovld __cnfn sqrt(double3);
+double4 __ovld __cnfn sqrt(double4);
+double8 __ovld __cnfn sqrt(double8);
+double16 __ovld __cnfn sqrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sqrt(half);
+half2 __ovld __cnfn sqrt(half2);
+half3 __ovld __cnfn sqrt(half3);
+half4 __ovld __cnfn sqrt(half4);
+half8 __ovld __cnfn sqrt(half8);
+half16 __ovld __cnfn sqrt(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute tangent.
+ */
+float __ovld __cnfn tan(float);
+float2 __ovld __cnfn tan(float2);
+float3 __ovld __cnfn tan(float3);
+float4 __ovld __cnfn tan(float4);
+float8 __ovld __cnfn tan(float8);
+float16 __ovld __cnfn tan(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tan(double);
+double2 __ovld __cnfn tan(double2);
+double3 __ovld __cnfn tan(double3);
+double4 __ovld __cnfn tan(double4);
+double8 __ovld __cnfn tan(double8);
+double16 __ovld __cnfn tan(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tan(half);
+half2 __ovld __cnfn tan(half2);
+half3 __ovld __cnfn tan(half3);
+half4 __ovld __cnfn tan(half4);
+half8 __ovld __cnfn tan(half8);
+half16 __ovld __cnfn tan(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute hyperbolic tangent.
+ */
+float __ovld __cnfn tanh(float);
+float2 __ovld __cnfn tanh(float2);
+float3 __ovld __cnfn tanh(float3);
+float4 __ovld __cnfn tanh(float4);
+float8 __ovld __cnfn tanh(float8);
+float16 __ovld __cnfn tanh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tanh(double);
+double2 __ovld __cnfn tanh(double2);
+double3 __ovld __cnfn tanh(double3);
+double4 __ovld __cnfn tanh(double4);
+double8 __ovld __cnfn tanh(double8);
+double16 __ovld __cnfn tanh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tanh(half);
+half2 __ovld __cnfn tanh(half2);
+half3 __ovld __cnfn tanh(half3);
+half4 __ovld __cnfn tanh(half4);
+half8 __ovld __cnfn tanh(half8);
+half16 __ovld __cnfn tanh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute tan (PI * x).
+ */
+float __ovld __cnfn tanpi(float x);
+float2 __ovld __cnfn tanpi(float2 x);
+float3 __ovld __cnfn tanpi(float3 x);
+float4 __ovld __cnfn tanpi(float4 x);
+float8 __ovld __cnfn tanpi(float8 x);
+float16 __ovld __cnfn tanpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tanpi(double x);
+double2 __ovld __cnfn tanpi(double2 x);
+double3 __ovld __cnfn tanpi(double3 x);
+double4 __ovld __cnfn tanpi(double4 x);
+double8 __ovld __cnfn tanpi(double8 x);
+double16 __ovld __cnfn tanpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tanpi(half x);
+half2 __ovld __cnfn tanpi(half2 x);
+half3 __ovld __cnfn tanpi(half3 x);
+half4 __ovld __cnfn tanpi(half4 x);
+half8 __ovld __cnfn tanpi(half8 x);
+half16 __ovld __cnfn tanpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the gamma function.
+ */
+float __ovld __cnfn tgamma(float);
+float2 __ovld __cnfn tgamma(float2);
+float3 __ovld __cnfn tgamma(float3);
+float4 __ovld __cnfn tgamma(float4);
+float8 __ovld __cnfn tgamma(float8);
+float16 __ovld __cnfn tgamma(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tgamma(double);
+double2 __ovld __cnfn tgamma(double2);
+double3 __ovld __cnfn tgamma(double3);
+double4 __ovld __cnfn tgamma(double4);
+double8 __ovld __cnfn tgamma(double8);
+double16 __ovld __cnfn tgamma(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tgamma(half);
+half2 __ovld __cnfn tgamma(half2);
+half3 __ovld __cnfn tgamma(half3);
+half4 __ovld __cnfn tgamma(half4);
+half8 __ovld __cnfn tgamma(half8);
+half16 __ovld __cnfn tgamma(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Round to integral value using the round to zero
+ * rounding mode.
+ */
+float __ovld __cnfn trunc(float);
+float2 __ovld __cnfn trunc(float2);
+float3 __ovld __cnfn trunc(float3);
+float4 __ovld __cnfn trunc(float4);
+float8 __ovld __cnfn trunc(float8);
+float16 __ovld __cnfn trunc(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn trunc(double);
+double2 __ovld __cnfn trunc(double2);
+double3 __ovld __cnfn trunc(double3);
+double4 __ovld __cnfn trunc(double4);
+double8 __ovld __cnfn trunc(double8);
+double16 __ovld __cnfn trunc(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn trunc(half);
+half2 __ovld __cnfn trunc(half2);
+half3 __ovld __cnfn trunc(half3);
+half4 __ovld __cnfn trunc(half4);
+half8 __ovld __cnfn trunc(half8);
+half16 __ovld __cnfn trunc(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cosine. x must be in the range -2^16 ... +2^16.
+ */
+float __ovld __cnfn half_cos(float x);
+float2 __ovld __cnfn half_cos(float2 x);
+float3 __ovld __cnfn half_cos(float3 x);
+float4 __ovld __cnfn half_cos(float4 x);
+float8 __ovld __cnfn half_cos(float8 x);
+float16 __ovld __cnfn half_cos(float16 x);
+
+/**
+ * Compute x / y.
+ */
+float __ovld __cnfn half_divide(float x, float y);
+float2 __ovld __cnfn half_divide(float2 x, float2 y);
+float3 __ovld __cnfn half_divide(float3 x, float3 y);
+float4 __ovld __cnfn half_divide(float4 x, float4 y);
+float8 __ovld __cnfn half_divide(float8 x, float8 y);
+float16 __ovld __cnfn half_divide(float16 x, float16 y);
+
+/**
+ * Compute the base- e exponential of x.
+ */
+float __ovld __cnfn half_exp(float x);
+float2 __ovld __cnfn half_exp(float2 x);
+float3 __ovld __cnfn half_exp(float3 x);
+float4 __ovld __cnfn half_exp(float4 x);
+float8 __ovld __cnfn half_exp(float8 x);
+float16 __ovld __cnfn half_exp(float16 x);
+
+/**
+ * Compute the base- 2 exponential of x.
+ */
+float __ovld __cnfn half_exp2(float x);
+float2 __ovld __cnfn half_exp2(float2 x);
+float3 __ovld __cnfn half_exp2(float3 x);
+float4 __ovld __cnfn half_exp2(float4 x);
+float8 __ovld __cnfn half_exp2(float8 x);
+float16 __ovld __cnfn half_exp2(float16 x);
+
+/**
+ * Compute the base- 10 exponential of x.
+ */
+float __ovld __cnfn half_exp10(float x);
+float2 __ovld __cnfn half_exp10(float2 x);
+float3 __ovld __cnfn half_exp10(float3 x);
+float4 __ovld __cnfn half_exp10(float4 x);
+float8 __ovld __cnfn half_exp10(float8 x);
+float16 __ovld __cnfn half_exp10(float16 x);
+
+/**
+ * Compute natural logarithm.
+ */
+float __ovld __cnfn half_log(float x);
+float2 __ovld __cnfn half_log(float2 x);
+float3 __ovld __cnfn half_log(float3 x);
+float4 __ovld __cnfn half_log(float4 x);
+float8 __ovld __cnfn half_log(float8 x);
+float16 __ovld __cnfn half_log(float16 x);
+
+/**
+ * Compute a base 2 logarithm.
+ */
+float __ovld __cnfn half_log2(float x);
+float2 __ovld __cnfn half_log2(float2 x);
+float3 __ovld __cnfn half_log2(float3 x);
+float4 __ovld __cnfn half_log2(float4 x);
+float8 __ovld __cnfn half_log2(float8 x);
+float16 __ovld __cnfn half_log2(float16 x);
+
+/**
+ * Compute a base 10 logarithm.
+ */
+float __ovld __cnfn half_log10(float x);
+float2 __ovld __cnfn half_log10(float2 x);
+float3 __ovld __cnfn half_log10(float3 x);
+float4 __ovld __cnfn half_log10(float4 x);
+float8 __ovld __cnfn half_log10(float8 x);
+float16 __ovld __cnfn half_log10(float16 x);
+
+/**
+ * Compute x to the power y, where x is >= 0.
+ */
+float __ovld __cnfn half_powr(float x, float y);
+float2 __ovld __cnfn half_powr(float2 x, float2 y);
+float3 __ovld __cnfn half_powr(float3 x, float3 y);
+float4 __ovld __cnfn half_powr(float4 x, float4 y);
+float8 __ovld __cnfn half_powr(float8 x, float8 y);
+float16 __ovld __cnfn half_powr(float16 x, float16 y);
+
+/**
+ * Compute reciprocal.
+ */
+float __ovld __cnfn half_recip(float x);
+float2 __ovld __cnfn half_recip(float2 x);
+float3 __ovld __cnfn half_recip(float3 x);
+float4 __ovld __cnfn half_recip(float4 x);
+float8 __ovld __cnfn half_recip(float8 x);
+float16 __ovld __cnfn half_recip(float16 x);
+
+/**
+ * Compute inverse square root.
+ */
+float __ovld __cnfn half_rsqrt(float x);
+float2 __ovld __cnfn half_rsqrt(float2 x);
+float3 __ovld __cnfn half_rsqrt(float3 x);
+float4 __ovld __cnfn half_rsqrt(float4 x);
+float8 __ovld __cnfn half_rsqrt(float8 x);
+float16 __ovld __cnfn half_rsqrt(float16 x);
+
+/**
+ * Compute sine. x must be in the range -2^16 ... +2^16.
+ */
+float __ovld __cnfn half_sin(float x);
+float2 __ovld __cnfn half_sin(float2 x);
+float3 __ovld __cnfn half_sin(float3 x);
+float4 __ovld __cnfn half_sin(float4 x);
+float8 __ovld __cnfn half_sin(float8 x);
+float16 __ovld __cnfn half_sin(float16 x);
+
+/**
+ * Compute square root.
+ */
+float __ovld __cnfn half_sqrt(float x);
+float2 __ovld __cnfn half_sqrt(float2 x);
+float3 __ovld __cnfn half_sqrt(float3 x);
+float4 __ovld __cnfn half_sqrt(float4 x);
+float8 __ovld __cnfn half_sqrt(float8 x);
+float16 __ovld __cnfn half_sqrt(float16 x);
+
+/**
+ * Compute tangent. x must be in the range -216 ... +216.
+ */
+float __ovld __cnfn half_tan(float x);
+float2 __ovld __cnfn half_tan(float2 x);
+float3 __ovld __cnfn half_tan(float3 x);
+float4 __ovld __cnfn half_tan(float4 x);
+float8 __ovld __cnfn half_tan(float8 x);
+float16 __ovld __cnfn half_tan(float16 x);
+
+/**
+ * Compute cosine over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_cos(float x);
+float2 __ovld __cnfn native_cos(float2 x);
+float3 __ovld __cnfn native_cos(float3 x);
+float4 __ovld __cnfn native_cos(float4 x);
+float8 __ovld __cnfn native_cos(float8 x);
+float16 __ovld __cnfn native_cos(float16 x);
+
+/**
+ * Compute x / y over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_divide(float x, float y);
+float2 __ovld __cnfn native_divide(float2 x, float2 y);
+float3 __ovld __cnfn native_divide(float3 x, float3 y);
+float4 __ovld __cnfn native_divide(float4 x, float4 y);
+float8 __ovld __cnfn native_divide(float8 x, float8 y);
+float16 __ovld __cnfn native_divide(float16 x, float16 y);
+
+/**
+ * Compute the base- e exponential of x over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_exp(float x);
+float2 __ovld __cnfn native_exp(float2 x);
+float3 __ovld __cnfn native_exp(float3 x);
+float4 __ovld __cnfn native_exp(float4 x);
+float8 __ovld __cnfn native_exp(float8 x);
+float16 __ovld __cnfn native_exp(float16 x);
+
+/**
+ * Compute the base- 2 exponential of x over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_exp2(float x);
+float2 __ovld __cnfn native_exp2(float2 x);
+float3 __ovld __cnfn native_exp2(float3 x);
+float4 __ovld __cnfn native_exp2(float4 x);
+float8 __ovld __cnfn native_exp2(float8 x);
+float16 __ovld __cnfn native_exp2(float16 x);
+
+/**
+ * Compute the base- 10 exponential of x over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_exp10(float x);
+float2 __ovld __cnfn native_exp10(float2 x);
+float3 __ovld __cnfn native_exp10(float3 x);
+float4 __ovld __cnfn native_exp10(float4 x);
+float8 __ovld __cnfn native_exp10(float8 x);
+float16 __ovld __cnfn native_exp10(float16 x);
+
+/**
+ * Compute natural logarithm over an implementationdefined
+ * range. The maximum error is implementation
+ * defined.
+ */
+float __ovld __cnfn native_log(float x);
+float2 __ovld __cnfn native_log(float2 x);
+float3 __ovld __cnfn native_log(float3 x);
+float4 __ovld __cnfn native_log(float4 x);
+float8 __ovld __cnfn native_log(float8 x);
+float16 __ovld __cnfn native_log(float16 x);
+
+/**
+ * Compute a base 2 logarithm over an implementationdefined
+ * range. The maximum error is implementationdefined.
+ */
+float __ovld __cnfn native_log2(float x);
+float2 __ovld __cnfn native_log2(float2 x);
+float3 __ovld __cnfn native_log2(float3 x);
+float4 __ovld __cnfn native_log2(float4 x);
+float8 __ovld __cnfn native_log2(float8 x);
+float16 __ovld __cnfn native_log2(float16 x);
+
+/**
+ * Compute a base 10 logarithm over an implementationdefined
+ * range. The maximum error is implementationdefined.
+ */
+float __ovld __cnfn native_log10(float x);
+float2 __ovld __cnfn native_log10(float2 x);
+float3 __ovld __cnfn native_log10(float3 x);
+float4 __ovld __cnfn native_log10(float4 x);
+float8 __ovld __cnfn native_log10(float8 x);
+float16 __ovld __cnfn native_log10(float16 x);
+
+/**
+ * Compute x to the power y, where x is >= 0. The range of
+ * x and y are implementation-defined. The maximum error
+ * is implementation-defined.
+ */
+float __ovld __cnfn native_powr(float x, float y);
+float2 __ovld __cnfn native_powr(float2 x, float2 y);
+float3 __ovld __cnfn native_powr(float3 x, float3 y);
+float4 __ovld __cnfn native_powr(float4 x, float4 y);
+float8 __ovld __cnfn native_powr(float8 x, float8 y);
+float16 __ovld __cnfn native_powr(float16 x, float16 y);
+
+/**
+ * Compute reciprocal over an implementation-defined
+ * range. The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_recip(float x);
+float2 __ovld __cnfn native_recip(float2 x);
+float3 __ovld __cnfn native_recip(float3 x);
+float4 __ovld __cnfn native_recip(float4 x);
+float8 __ovld __cnfn native_recip(float8 x);
+float16 __ovld __cnfn native_recip(float16 x);
+
+/**
+ * Compute inverse square root over an implementationdefined
+ * range. The maximum error is implementationdefined.
+ */
+float __ovld __cnfn native_rsqrt(float x);
+float2 __ovld __cnfn native_rsqrt(float2 x);
+float3 __ovld __cnfn native_rsqrt(float3 x);
+float4 __ovld __cnfn native_rsqrt(float4 x);
+float8 __ovld __cnfn native_rsqrt(float8 x);
+float16 __ovld __cnfn native_rsqrt(float16 x);
+
+/**
+ * Compute sine over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_sin(float x);
+float2 __ovld __cnfn native_sin(float2 x);
+float3 __ovld __cnfn native_sin(float3 x);
+float4 __ovld __cnfn native_sin(float4 x);
+float8 __ovld __cnfn native_sin(float8 x);
+float16 __ovld __cnfn native_sin(float16 x);
+
+/**
+ * Compute square root over an implementation-defined
+ * range. The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_sqrt(float x);
+float2 __ovld __cnfn native_sqrt(float2 x);
+float3 __ovld __cnfn native_sqrt(float3 x);
+float4 __ovld __cnfn native_sqrt(float4 x);
+float8 __ovld __cnfn native_sqrt(float8 x);
+float16 __ovld __cnfn native_sqrt(float16 x);
+
+/**
+ * Compute tangent over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_tan(float x);
+float2 __ovld __cnfn native_tan(float2 x);
+float3 __ovld __cnfn native_tan(float3 x);
+float4 __ovld __cnfn native_tan(float4 x);
+float8 __ovld __cnfn native_tan(float8 x);
+float16 __ovld __cnfn native_tan(float16 x);
+
+// OpenCL v1.1 s6.11.3, v1.2 s6.12.3, v2.0 s6.13.3 - Integer Functions
+
+/**
+ * Returns | x |.
+ */
+uchar __ovld __cnfn abs(char x);
+uchar __ovld __cnfn abs(uchar x);
+uchar2 __ovld __cnfn abs(char2 x);
+uchar2 __ovld __cnfn abs(uchar2 x);
+uchar3 __ovld __cnfn abs(char3 x);
+uchar3 __ovld __cnfn abs(uchar3 x);
+uchar4 __ovld __cnfn abs(char4 x);
+uchar4 __ovld __cnfn abs(uchar4 x);
+uchar8 __ovld __cnfn abs(char8 x);
+uchar8 __ovld __cnfn abs(uchar8 x);
+uchar16 __ovld __cnfn abs(char16 x);
+uchar16 __ovld __cnfn abs(uchar16 x);
+ushort __ovld __cnfn abs(short x);
+ushort __ovld __cnfn abs(ushort x);
+ushort2 __ovld __cnfn abs(short2 x);
+ushort2 __ovld __cnfn abs(ushort2 x);
+ushort3 __ovld __cnfn abs(short3 x);
+ushort3 __ovld __cnfn abs(ushort3 x);
+ushort4 __ovld __cnfn abs(short4 x);
+ushort4 __ovld __cnfn abs(ushort4 x);
+ushort8 __ovld __cnfn abs(short8 x);
+ushort8 __ovld __cnfn abs(ushort8 x);
+ushort16 __ovld __cnfn abs(short16 x);
+ushort16 __ovld __cnfn abs(ushort16 x);
+uint __ovld __cnfn abs(int x);
+uint __ovld __cnfn abs(uint x);
+uint2 __ovld __cnfn abs(int2 x);
+uint2 __ovld __cnfn abs(uint2 x);
+uint3 __ovld __cnfn abs(int3 x);
+uint3 __ovld __cnfn abs(uint3 x);
+uint4 __ovld __cnfn abs(int4 x);
+uint4 __ovld __cnfn abs(uint4 x);
+uint8 __ovld __cnfn abs(int8 x);
+uint8 __ovld __cnfn abs(uint8 x);
+uint16 __ovld __cnfn abs(int16 x);
+uint16 __ovld __cnfn abs(uint16 x);
+ulong __ovld __cnfn abs(long x);
+ulong __ovld __cnfn abs(ulong x);
+ulong2 __ovld __cnfn abs(long2 x);
+ulong2 __ovld __cnfn abs(ulong2 x);
+ulong3 __ovld __cnfn abs(long3 x);
+ulong3 __ovld __cnfn abs(ulong3 x);
+ulong4 __ovld __cnfn abs(long4 x);
+ulong4 __ovld __cnfn abs(ulong4 x);
+ulong8 __ovld __cnfn abs(long8 x);
+ulong8 __ovld __cnfn abs(ulong8 x);
+ulong16 __ovld __cnfn abs(long16 x);
+ulong16 __ovld __cnfn abs(ulong16 x);
+
+/**
+ * Returns | x - y | without modulo overflow.
+ */
+uchar __ovld __cnfn abs_diff(char x, char y);
+uchar __ovld __cnfn abs_diff(uchar x, uchar y);
+uchar2 __ovld __cnfn abs_diff(char2 x, char2 y);
+uchar2 __ovld __cnfn abs_diff(uchar2 x, uchar2 y);
+uchar3 __ovld __cnfn abs_diff(char3 x, char3 y);
+uchar3 __ovld __cnfn abs_diff(uchar3 x, uchar3 y);
+uchar4 __ovld __cnfn abs_diff(char4 x, char4 y);
+uchar4 __ovld __cnfn abs_diff(uchar4 x, uchar4 y);
+uchar8 __ovld __cnfn abs_diff(char8 x, char8 y);
+uchar8 __ovld __cnfn abs_diff(uchar8 x, uchar8 y);
+uchar16 __ovld __cnfn abs_diff(char16 x, char16 y);
+uchar16 __ovld __cnfn abs_diff(uchar16 x, uchar16 y);
+ushort __ovld __cnfn abs_diff(short x, short y);
+ushort __ovld __cnfn abs_diff(ushort x, ushort y);
+ushort2 __ovld __cnfn abs_diff(short2 x, short2 y);
+ushort2 __ovld __cnfn abs_diff(ushort2 x, ushort2 y);
+ushort3 __ovld __cnfn abs_diff(short3 x, short3 y);
+ushort3 __ovld __cnfn abs_diff(ushort3 x, ushort3 y);
+ushort4 __ovld __cnfn abs_diff(short4 x, short4 y);
+ushort4 __ovld __cnfn abs_diff(ushort4 x, ushort4 y);
+ushort8 __ovld __cnfn abs_diff(short8 x, short8 y);
+ushort8 __ovld __cnfn abs_diff(ushort8 x, ushort8 y);
+ushort16 __ovld __cnfn abs_diff(short16 x, short16 y);
+ushort16 __ovld __cnfn abs_diff(ushort16 x, ushort16 y);
+uint __ovld __cnfn abs_diff(int x, int y);
+uint __ovld __cnfn abs_diff(uint x, uint y);
+uint2 __ovld __cnfn abs_diff(int2 x, int2 y);
+uint2 __ovld __cnfn abs_diff(uint2 x, uint2 y);
+uint3 __ovld __cnfn abs_diff(int3 x, int3 y);
+uint3 __ovld __cnfn abs_diff(uint3 x, uint3 y);
+uint4 __ovld __cnfn abs_diff(int4 x, int4 y);
+uint4 __ovld __cnfn abs_diff(uint4 x, uint4 y);
+uint8 __ovld __cnfn abs_diff(int8 x, int8 y);
+uint8 __ovld __cnfn abs_diff(uint8 x, uint8 y);
+uint16 __ovld __cnfn abs_diff(int16 x, int16 y);
+uint16 __ovld __cnfn abs_diff(uint16 x, uint16 y);
+ulong __ovld __cnfn abs_diff(long x, long y);
+ulong __ovld __cnfn abs_diff(ulong x, ulong y);
+ulong2 __ovld __cnfn abs_diff(long2 x, long2 y);
+ulong2 __ovld __cnfn abs_diff(ulong2 x, ulong2 y);
+ulong3 __ovld __cnfn abs_diff(long3 x, long3 y);
+ulong3 __ovld __cnfn abs_diff(ulong3 x, ulong3 y);
+ulong4 __ovld __cnfn abs_diff(long4 x, long4 y);
+ulong4 __ovld __cnfn abs_diff(ulong4 x, ulong4 y);
+ulong8 __ovld __cnfn abs_diff(long8 x, long8 y);
+ulong8 __ovld __cnfn abs_diff(ulong8 x, ulong8 y);
+ulong16 __ovld __cnfn abs_diff(long16 x, long16 y);
+ulong16 __ovld __cnfn abs_diff(ulong16 x, ulong16 y);
+
+/**
+ * Returns x + y and saturates the result.
+ */
+char __ovld __cnfn add_sat(char x, char y);
+uchar __ovld __cnfn add_sat(uchar x, uchar y);
+char2 __ovld __cnfn add_sat(char2 x, char2 y);
+uchar2 __ovld __cnfn add_sat(uchar2 x, uchar2 y);
+char3 __ovld __cnfn add_sat(char3 x, char3 y);
+uchar3 __ovld __cnfn add_sat(uchar3 x, uchar3 y);
+char4 __ovld __cnfn add_sat(char4 x, char4 y);
+uchar4 __ovld __cnfn add_sat(uchar4 x, uchar4 y);
+char8 __ovld __cnfn add_sat(char8 x, char8 y);
+uchar8 __ovld __cnfn add_sat(uchar8 x, uchar8 y);
+char16 __ovld __cnfn add_sat(char16 x, char16 y);
+uchar16 __ovld __cnfn add_sat(uchar16 x, uchar16 y);
+short __ovld __cnfn add_sat(short x, short y);
+ushort __ovld __cnfn add_sat(ushort x, ushort y);
+short2 __ovld __cnfn add_sat(short2 x, short2 y);
+ushort2 __ovld __cnfn add_sat(ushort2 x, ushort2 y);
+short3 __ovld __cnfn add_sat(short3 x, short3 y);
+ushort3 __ovld __cnfn add_sat(ushort3 x, ushort3 y);
+short4 __ovld __cnfn add_sat(short4 x, short4 y);
+ushort4 __ovld __cnfn add_sat(ushort4 x, ushort4 y);
+short8 __ovld __cnfn add_sat(short8 x, short8 y);
+ushort8 __ovld __cnfn add_sat(ushort8 x, ushort8 y);
+short16 __ovld __cnfn add_sat(short16 x, short16 y);
+ushort16 __ovld __cnfn add_sat(ushort16 x, ushort16 y);
+int __ovld __cnfn add_sat(int x, int y);
+uint __ovld __cnfn add_sat(uint x, uint y);
+int2 __ovld __cnfn add_sat(int2 x, int2 y);
+uint2 __ovld __cnfn add_sat(uint2 x, uint2 y);
+int3 __ovld __cnfn add_sat(int3 x, int3 y);
+uint3 __ovld __cnfn add_sat(uint3 x, uint3 y);
+int4 __ovld __cnfn add_sat(int4 x, int4 y);
+uint4 __ovld __cnfn add_sat(uint4 x, uint4 y);
+int8 __ovld __cnfn add_sat(int8 x, int8 y);
+uint8 __ovld __cnfn add_sat(uint8 x, uint8 y);
+int16 __ovld __cnfn add_sat(int16 x, int16 y);
+uint16 __ovld __cnfn add_sat(uint16 x, uint16 y);
+long __ovld __cnfn add_sat(long x, long y);
+ulong __ovld __cnfn add_sat(ulong x, ulong y);
+long2 __ovld __cnfn add_sat(long2 x, long2 y);
+ulong2 __ovld __cnfn add_sat(ulong2 x, ulong2 y);
+long3 __ovld __cnfn add_sat(long3 x, long3 y);
+ulong3 __ovld __cnfn add_sat(ulong3 x, ulong3 y);
+long4 __ovld __cnfn add_sat(long4 x, long4 y);
+ulong4 __ovld __cnfn add_sat(ulong4 x, ulong4 y);
+long8 __ovld __cnfn add_sat(long8 x, long8 y);
+ulong8 __ovld __cnfn add_sat(ulong8 x, ulong8 y);
+long16 __ovld __cnfn add_sat(long16 x, long16 y);
+ulong16 __ovld __cnfn add_sat(ulong16 x, ulong16 y);
+
+/**
+ * Returns (x + y) >> 1. The intermediate sum does
+ * not modulo overflow.
+ */
+char __ovld __cnfn hadd(char x, char y);
+uchar __ovld __cnfn hadd(uchar x, uchar y);
+char2 __ovld __cnfn hadd(char2 x, char2 y);
+uchar2 __ovld __cnfn hadd(uchar2 x, uchar2 y);
+char3 __ovld __cnfn hadd(char3 x, char3 y);
+uchar3 __ovld __cnfn hadd(uchar3 x, uchar3 y);
+char4 __ovld __cnfn hadd(char4 x, char4 y);
+uchar4 __ovld __cnfn hadd(uchar4 x, uchar4 y);
+char8 __ovld __cnfn hadd(char8 x, char8 y);
+uchar8 __ovld __cnfn hadd(uchar8 x, uchar8 y);
+char16 __ovld __cnfn hadd(char16 x, char16 y);
+uchar16 __ovld __cnfn hadd(uchar16 x, uchar16 y);
+short __ovld __cnfn hadd(short x, short y);
+ushort __ovld __cnfn hadd(ushort x, ushort y);
+short2 __ovld __cnfn hadd(short2 x, short2 y);
+ushort2 __ovld __cnfn hadd(ushort2 x, ushort2 y);
+short3 __ovld __cnfn hadd(short3 x, short3 y);
+ushort3 __ovld __cnfn hadd(ushort3 x, ushort3 y);
+short4 __ovld __cnfn hadd(short4 x, short4 y);
+ushort4 __ovld __cnfn hadd(ushort4 x, ushort4 y);
+short8 __ovld __cnfn hadd(short8 x, short8 y);
+ushort8 __ovld __cnfn hadd(ushort8 x, ushort8 y);
+short16 __ovld __cnfn hadd(short16 x, short16 y);
+ushort16 __ovld __cnfn hadd(ushort16 x, ushort16 y);
+int __ovld __cnfn hadd(int x, int y);
+uint __ovld __cnfn hadd(uint x, uint y);
+int2 __ovld __cnfn hadd(int2 x, int2 y);
+uint2 __ovld __cnfn hadd(uint2 x, uint2 y);
+int3 __ovld __cnfn hadd(int3 x, int3 y);
+uint3 __ovld __cnfn hadd(uint3 x, uint3 y);
+int4 __ovld __cnfn hadd(int4 x, int4 y);
+uint4 __ovld __cnfn hadd(uint4 x, uint4 y);
+int8 __ovld __cnfn hadd(int8 x, int8 y);
+uint8 __ovld __cnfn hadd(uint8 x, uint8 y);
+int16 __ovld __cnfn hadd(int16 x, int16 y);
+uint16 __ovld __cnfn hadd(uint16 x, uint16 y);
+long __ovld __cnfn hadd(long x, long y);
+ulong __ovld __cnfn hadd(ulong x, ulong y);
+long2 __ovld __cnfn hadd(long2 x, long2 y);
+ulong2 __ovld __cnfn hadd(ulong2 x, ulong2 y);
+long3 __ovld __cnfn hadd(long3 x, long3 y);
+ulong3 __ovld __cnfn hadd(ulong3 x, ulong3 y);
+long4 __ovld __cnfn hadd(long4 x, long4 y);
+ulong4 __ovld __cnfn hadd(ulong4 x, ulong4 y);
+long8 __ovld __cnfn hadd(long8 x, long8 y);
+ulong8 __ovld __cnfn hadd(ulong8 x, ulong8 y);
+long16 __ovld __cnfn hadd(long16 x, long16 y);
+ulong16 __ovld __cnfn hadd(ulong16 x, ulong16 y);
+
+/**
+ * Returns (x + y + 1) >> 1. The intermediate sum
+ * does not modulo overflow.
+ */
+char __ovld __cnfn rhadd(char x, char y);
+uchar __ovld __cnfn rhadd(uchar x, uchar y);
+char2 __ovld __cnfn rhadd(char2 x, char2 y);
+uchar2 __ovld __cnfn rhadd(uchar2 x, uchar2 y);
+char3 __ovld __cnfn rhadd(char3 x, char3 y);
+uchar3 __ovld __cnfn rhadd(uchar3 x, uchar3 y);
+char4 __ovld __cnfn rhadd(char4 x, char4 y);
+uchar4 __ovld __cnfn rhadd(uchar4 x, uchar4 y);
+char8 __ovld __cnfn rhadd(char8 x, char8 y);
+uchar8 __ovld __cnfn rhadd(uchar8 x, uchar8 y);
+char16 __ovld __cnfn rhadd(char16 x, char16 y);
+uchar16 __ovld __cnfn rhadd(uchar16 x, uchar16 y);
+short __ovld __cnfn rhadd(short x, short y);
+ushort __ovld __cnfn rhadd(ushort x, ushort y);
+short2 __ovld __cnfn rhadd(short2 x, short2 y);
+ushort2 __ovld __cnfn rhadd(ushort2 x, ushort2 y);
+short3 __ovld __cnfn rhadd(short3 x, short3 y);
+ushort3 __ovld __cnfn rhadd(ushort3 x, ushort3 y);
+short4 __ovld __cnfn rhadd(short4 x, short4 y);
+ushort4 __ovld __cnfn rhadd(ushort4 x, ushort4 y);
+short8 __ovld __cnfn rhadd(short8 x, short8 y);
+ushort8 __ovld __cnfn rhadd(ushort8 x, ushort8 y);
+short16 __ovld __cnfn rhadd(short16 x, short16 y);
+ushort16 __ovld __cnfn rhadd(ushort16 x, ushort16 y);
+int __ovld __cnfn rhadd(int x, int y);
+uint __ovld __cnfn rhadd(uint x, uint y);
+int2 __ovld __cnfn rhadd(int2 x, int2 y);
+uint2 __ovld __cnfn rhadd(uint2 x, uint2 y);
+int3 __ovld __cnfn rhadd(int3 x, int3 y);
+uint3 __ovld __cnfn rhadd(uint3 x, uint3 y);
+int4 __ovld __cnfn rhadd(int4 x, int4 y);
+uint4 __ovld __cnfn rhadd(uint4 x, uint4 y);
+int8 __ovld __cnfn rhadd(int8 x, int8 y);
+uint8 __ovld __cnfn rhadd(uint8 x, uint8 y);
+int16 __ovld __cnfn rhadd(int16 x, int16 y);
+uint16 __ovld __cnfn rhadd(uint16 x, uint16 y);
+long __ovld __cnfn rhadd(long x, long y);
+ulong __ovld __cnfn rhadd(ulong x, ulong y);
+long2 __ovld __cnfn rhadd(long2 x, long2 y);
+ulong2 __ovld __cnfn rhadd(ulong2 x, ulong2 y);
+long3 __ovld __cnfn rhadd(long3 x, long3 y);
+ulong3 __ovld __cnfn rhadd(ulong3 x, ulong3 y);
+long4 __ovld __cnfn rhadd(long4 x, long4 y);
+ulong4 __ovld __cnfn rhadd(ulong4 x, ulong4 y);
+long8 __ovld __cnfn rhadd(long8 x, long8 y);
+ulong8 __ovld __cnfn rhadd(ulong8 x, ulong8 y);
+long16 __ovld __cnfn rhadd(long16 x, long16 y);
+ulong16 __ovld __cnfn rhadd(ulong16 x, ulong16 y);
+
+/**
+ * Returns min(max(x, minval), maxval).
+ * Results are undefined if minval > maxval.
+ */
+char __ovld __cnfn clamp(char x, char minval, char maxval);
+uchar __ovld __cnfn clamp(uchar x, uchar minval, uchar maxval);
+char2 __ovld __cnfn clamp(char2 x, char2 minval, char2 maxval);
+uchar2 __ovld __cnfn clamp(uchar2 x, uchar2 minval, uchar2 maxval);
+char3 __ovld __cnfn clamp(char3 x, char3 minval, char3 maxval);
+uchar3 __ovld __cnfn clamp(uchar3 x, uchar3 minval, uchar3 maxval);
+char4 __ovld __cnfn clamp(char4 x, char4 minval, char4 maxval);
+uchar4 __ovld __cnfn clamp(uchar4 x, uchar4 minval, uchar4 maxval);
+char8 __ovld __cnfn clamp(char8 x, char8 minval, char8 maxval);
+uchar8 __ovld __cnfn clamp(uchar8 x, uchar8 minval, uchar8 maxval);
+char16 __ovld __cnfn clamp(char16 x, char16 minval, char16 maxval);
+uchar16 __ovld __cnfn clamp(uchar16 x, uchar16 minval, uchar16 maxval);
+short __ovld __cnfn clamp(short x, short minval, short maxval);
+ushort __ovld __cnfn clamp(ushort x, ushort minval, ushort maxval);
+short2 __ovld __cnfn clamp(short2 x, short2 minval, short2 maxval);
+ushort2 __ovld __cnfn clamp(ushort2 x, ushort2 minval, ushort2 maxval);
+short3 __ovld __cnfn clamp(short3 x, short3 minval, short3 maxval);
+ushort3 __ovld __cnfn clamp(ushort3 x, ushort3 minval, ushort3 maxval);
+short4 __ovld __cnfn clamp(short4 x, short4 minval, short4 maxval);
+ushort4 __ovld __cnfn clamp(ushort4 x, ushort4 minval, ushort4 maxval);
+short8 __ovld __cnfn clamp(short8 x, short8 minval, short8 maxval);
+ushort8 __ovld __cnfn clamp(ushort8 x, ushort8 minval, ushort8 maxval);
+short16 __ovld __cnfn clamp(short16 x, short16 minval, short16 maxval);
+ushort16 __ovld __cnfn clamp(ushort16 x, ushort16 minval, ushort16 maxval);
+int __ovld __cnfn clamp(int x, int minval, int maxval);
+uint __ovld __cnfn clamp(uint x, uint minval, uint maxval);
+int2 __ovld __cnfn clamp(int2 x, int2 minval, int2 maxval);
+uint2 __ovld __cnfn clamp(uint2 x, uint2 minval, uint2 maxval);
+int3 __ovld __cnfn clamp(int3 x, int3 minval, int3 maxval);
+uint3 __ovld __cnfn clamp(uint3 x, uint3 minval, uint3 maxval);
+int4 __ovld __cnfn clamp(int4 x, int4 minval, int4 maxval);
+uint4 __ovld __cnfn clamp(uint4 x, uint4 minval, uint4 maxval);
+int8 __ovld __cnfn clamp(int8 x, int8 minval, int8 maxval);
+uint8 __ovld __cnfn clamp(uint8 x, uint8 minval, uint8 maxval);
+int16 __ovld __cnfn clamp(int16 x, int16 minval, int16 maxval);
+uint16 __ovld __cnfn clamp(uint16 x, uint16 minval, uint16 maxval);
+long __ovld __cnfn clamp(long x, long minval, long maxval);
+ulong __ovld __cnfn clamp(ulong x, ulong minval, ulong maxval);
+long2 __ovld __cnfn clamp(long2 x, long2 minval, long2 maxval);
+ulong2 __ovld __cnfn clamp(ulong2 x, ulong2 minval, ulong2 maxval);
+long3 __ovld __cnfn clamp(long3 x, long3 minval, long3 maxval);
+ulong3 __ovld __cnfn clamp(ulong3 x, ulong3 minval, ulong3 maxval);
+long4 __ovld __cnfn clamp(long4 x, long4 minval, long4 maxval);
+ulong4 __ovld __cnfn clamp(ulong4 x, ulong4 minval, ulong4 maxval);
+long8 __ovld __cnfn clamp(long8 x, long8 minval, long8 maxval);
+ulong8 __ovld __cnfn clamp(ulong8 x, ulong8 minval, ulong8 maxval);
+long16 __ovld __cnfn clamp(long16 x, long16 minval, long16 maxval);
+ulong16 __ovld __cnfn clamp(ulong16 x, ulong16 minval, ulong16 maxval);
+char __ovld __cnfn clamp(char x, char minval, char maxval);
+uchar __ovld __cnfn clamp(uchar x, uchar minval, uchar maxval);
+char2 __ovld __cnfn clamp(char2 x, char minval, char maxval);
+uchar2 __ovld __cnfn clamp(uchar2 x, uchar minval, uchar maxval);
+char3 __ovld __cnfn clamp(char3 x, char minval, char maxval);
+uchar3 __ovld __cnfn clamp(uchar3 x, uchar minval, uchar maxval);
+char4 __ovld __cnfn clamp(char4 x, char minval, char maxval);
+uchar4 __ovld __cnfn clamp(uchar4 x, uchar minval, uchar maxval);
+char8 __ovld __cnfn clamp(char8 x, char minval, char maxval);
+uchar8 __ovld __cnfn clamp(uchar8 x, uchar minval, uchar maxval);
+char16 __ovld __cnfn clamp(char16 x, char minval, char maxval);
+uchar16 __ovld __cnfn clamp(uchar16 x, uchar minval, uchar maxval);
+short __ovld __cnfn clamp(short x, short minval, short maxval);
+ushort __ovld __cnfn clamp(ushort x, ushort minval, ushort maxval);
+short2 __ovld __cnfn clamp(short2 x, short minval, short maxval);
+ushort2 __ovld __cnfn clamp(ushort2 x, ushort minval, ushort maxval);
+short3 __ovld __cnfn clamp(short3 x, short minval, short maxval);
+ushort3 __ovld __cnfn clamp(ushort3 x, ushort minval, ushort maxval);
+short4 __ovld __cnfn clamp(short4 x, short minval, short maxval);
+ushort4 __ovld __cnfn clamp(ushort4 x, ushort minval, ushort maxval);
+short8 __ovld __cnfn clamp(short8 x, short minval, short maxval);
+ushort8 __ovld __cnfn clamp(ushort8 x, ushort minval, ushort maxval);
+short16 __ovld __cnfn clamp(short16 x, short minval, short maxval);
+ushort16 __ovld __cnfn clamp(ushort16 x, ushort minval, ushort maxval);
+int __ovld __cnfn clamp(int x, int minval, int maxval);
+uint __ovld __cnfn clamp(uint x, uint minval, uint maxval);
+int2 __ovld __cnfn clamp(int2 x, int minval, int maxval);
+uint2 __ovld __cnfn clamp(uint2 x, uint minval, uint maxval);
+int3 __ovld __cnfn clamp(int3 x, int minval, int maxval);
+uint3 __ovld __cnfn clamp(uint3 x, uint minval, uint maxval);
+int4 __ovld __cnfn clamp(int4 x, int minval, int maxval);
+uint4 __ovld __cnfn clamp(uint4 x, uint minval, uint maxval);
+int8 __ovld __cnfn clamp(int8 x, int minval, int maxval);
+uint8 __ovld __cnfn clamp(uint8 x, uint minval, uint maxval);
+int16 __ovld __cnfn clamp(int16 x, int minval, int maxval);
+uint16 __ovld __cnfn clamp(uint16 x, uint minval, uint maxval);
+long __ovld __cnfn clamp(long x, long minval, long maxval);
+ulong __ovld __cnfn clamp(ulong x, ulong minval, ulong maxval);
+long2 __ovld __cnfn clamp(long2 x, long minval, long maxval);
+ulong2 __ovld __cnfn clamp(ulong2 x, ulong minval, ulong maxval);
+long3 __ovld __cnfn clamp(long3 x, long minval, long maxval);
+ulong3 __ovld __cnfn clamp(ulong3 x, ulong minval, ulong maxval);
+long4 __ovld __cnfn clamp(long4 x, long minval, long maxval);
+ulong4 __ovld __cnfn clamp(ulong4 x, ulong minval, ulong maxval);
+long8 __ovld __cnfn clamp(long8 x, long minval, long maxval);
+ulong8 __ovld __cnfn clamp(ulong8 x, ulong minval, ulong maxval);
+long16 __ovld __cnfn clamp(long16 x, long minval, long maxval);
+ulong16 __ovld __cnfn clamp(ulong16 x, ulong minval, ulong maxval);
+
+/**
+ * Returns the number of leading 0-bits in x, starting
+ * at the most significant bit position.
+ */
+char __ovld __cnfn clz(char x);
+uchar __ovld __cnfn clz(uchar x);
+char2 __ovld __cnfn clz(char2 x);
+uchar2 __ovld __cnfn clz(uchar2 x);
+char3 __ovld __cnfn clz(char3 x);
+uchar3 __ovld __cnfn clz(uchar3 x);
+char4 __ovld __cnfn clz(char4 x);
+uchar4 __ovld __cnfn clz(uchar4 x);
+char8 __ovld __cnfn clz(char8 x);
+uchar8 __ovld __cnfn clz(uchar8 x);
+char16 __ovld __cnfn clz(char16 x);
+uchar16 __ovld __cnfn clz(uchar16 x);
+short __ovld __cnfn clz(short x);
+ushort __ovld __cnfn clz(ushort x);
+short2 __ovld __cnfn clz(short2 x);
+ushort2 __ovld __cnfn clz(ushort2 x);
+short3 __ovld __cnfn clz(short3 x);
+ushort3 __ovld __cnfn clz(ushort3 x);
+short4 __ovld __cnfn clz(short4 x);
+ushort4 __ovld __cnfn clz(ushort4 x);
+short8 __ovld __cnfn clz(short8 x);
+ushort8 __ovld __cnfn clz(ushort8 x);
+short16 __ovld __cnfn clz(short16 x);
+ushort16 __ovld __cnfn clz(ushort16 x);
+int __ovld __cnfn clz(int x);
+uint __ovld __cnfn clz(uint x);
+int2 __ovld __cnfn clz(int2 x);
+uint2 __ovld __cnfn clz(uint2 x);
+int3 __ovld __cnfn clz(int3 x);
+uint3 __ovld __cnfn clz(uint3 x);
+int4 __ovld __cnfn clz(int4 x);
+uint4 __ovld __cnfn clz(uint4 x);
+int8 __ovld __cnfn clz(int8 x);
+uint8 __ovld __cnfn clz(uint8 x);
+int16 __ovld __cnfn clz(int16 x);
+uint16 __ovld __cnfn clz(uint16 x);
+long __ovld __cnfn clz(long x);
+ulong __ovld __cnfn clz(ulong x);
+long2 __ovld __cnfn clz(long2 x);
+ulong2 __ovld __cnfn clz(ulong2 x);
+long3 __ovld __cnfn clz(long3 x);
+ulong3 __ovld __cnfn clz(ulong3 x);
+long4 __ovld __cnfn clz(long4 x);
+ulong4 __ovld __cnfn clz(ulong4 x);
+long8 __ovld __cnfn clz(long8 x);
+ulong8 __ovld __cnfn clz(ulong8 x);
+long16 __ovld __cnfn clz(long16 x);
+ulong16 __ovld __cnfn clz(ulong16 x);
+
+/**
+ * Returns the count of trailing 0-bits in x. If x is 0,
+ * returns the size in bits of the type of x or
+ * component type of x, if x is a vector.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+char __ovld ctz(char x);
+uchar __ovld ctz(uchar x);
+char2 __ovld ctz(char2 x);
+uchar2 __ovld ctz(uchar2 x);
+char3 __ovld ctz(char3 x);
+uchar3 __ovld ctz(uchar3 x);
+char4 __ovld ctz(char4 x);
+uchar4 __ovld ctz(uchar4 x);
+char8 __ovld ctz(char8 x);
+uchar8 __ovld ctz(uchar8 x);
+char16 __ovld ctz(char16 x);
+uchar16 __ovld ctz(uchar16 x);
+short __ovld ctz(short x);
+ushort __ovld ctz(ushort x);
+short2 __ovld ctz(short2 x);
+ushort2 __ovld ctz(ushort2 x);
+short3 __ovld ctz(short3 x);
+ushort3 __ovld ctz(ushort3 x);
+short4 __ovld ctz(short4 x);
+ushort4 __ovld ctz(ushort4 x);
+short8 __ovld ctz(short8 x);
+ushort8 __ovld ctz(ushort8 x);
+short16 __ovld ctz(short16 x);
+ushort16 __ovld ctz(ushort16 x);
+int __ovld ctz(int x);
+uint __ovld ctz(uint x);
+int2 __ovld ctz(int2 x);
+uint2 __ovld ctz(uint2 x);
+int3 __ovld ctz(int3 x);
+uint3 __ovld ctz(uint3 x);
+int4 __ovld ctz(int4 x);
+uint4 __ovld ctz(uint4 x);
+int8 __ovld ctz(int8 x);
+uint8 __ovld ctz(uint8 x);
+int16 __ovld ctz(int16 x);
+uint16 __ovld ctz(uint16 x);
+long __ovld ctz(long x);
+ulong __ovld ctz(ulong x);
+long2 __ovld ctz(long2 x);
+ulong2 __ovld ctz(ulong2 x);
+long3 __ovld ctz(long3 x);
+ulong3 __ovld ctz(ulong3 x);
+long4 __ovld ctz(long4 x);
+ulong4 __ovld ctz(ulong4 x);
+long8 __ovld ctz(long8 x);
+ulong8 __ovld ctz(ulong8 x);
+long16 __ovld ctz(long16 x);
+ulong16 __ovld ctz(ulong16 x);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Returns mul_hi(a, b) + c.
+ */
+char __ovld __cnfn mad_hi(char a, char b, char c);
+uchar __ovld __cnfn mad_hi(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn mad_hi(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn mad_hi(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn mad_hi(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn mad_hi(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn mad_hi(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn mad_hi(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn mad_hi(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn mad_hi(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn mad_hi(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn mad_hi(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn mad_hi(short a, short b, short c);
+ushort __ovld __cnfn mad_hi(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn mad_hi(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn mad_hi(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn mad_hi(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn mad_hi(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn mad_hi(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn mad_hi(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn mad_hi(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn mad_hi(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn mad_hi(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn mad_hi(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn mad_hi(int a, int b, int c);
+uint __ovld __cnfn mad_hi(uint a, uint b, uint c);
+int2 __ovld __cnfn mad_hi(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn mad_hi(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn mad_hi(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn mad_hi(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn mad_hi(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn mad_hi(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn mad_hi(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn mad_hi(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn mad_hi(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn mad_hi(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn mad_hi(long a, long b, long c);
+ulong __ovld __cnfn mad_hi(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn mad_hi(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn mad_hi(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn mad_hi(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn mad_hi(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn mad_hi(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn mad_hi(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn mad_hi(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn mad_hi(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn mad_hi(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn mad_hi(ulong16 a, ulong16 b, ulong16 c);
+
+/**
+ * Returns a * b + c and saturates the result.
+ */
+char __ovld __cnfn mad_sat(char a, char b, char c);
+uchar __ovld __cnfn mad_sat(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn mad_sat(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn mad_sat(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn mad_sat(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn mad_sat(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn mad_sat(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn mad_sat(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn mad_sat(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn mad_sat(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn mad_sat(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn mad_sat(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn mad_sat(short a, short b, short c);
+ushort __ovld __cnfn mad_sat(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn mad_sat(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn mad_sat(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn mad_sat(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn mad_sat(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn mad_sat(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn mad_sat(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn mad_sat(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn mad_sat(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn mad_sat(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn mad_sat(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn mad_sat(int a, int b, int c);
+uint __ovld __cnfn mad_sat(uint a, uint b, uint c);
+int2 __ovld __cnfn mad_sat(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn mad_sat(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn mad_sat(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn mad_sat(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn mad_sat(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn mad_sat(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn mad_sat(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn mad_sat(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn mad_sat(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn mad_sat(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn mad_sat(long a, long b, long c);
+ulong __ovld __cnfn mad_sat(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn mad_sat(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn mad_sat(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn mad_sat(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn mad_sat(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn mad_sat(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn mad_sat(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn mad_sat(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn mad_sat(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn mad_sat(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn mad_sat(ulong16 a, ulong16 b, ulong16 c);
+
+/**
+ * Returns y if x < y, otherwise it returns x.
+ */
+char __ovld __cnfn max(char x, char y);
+uchar __ovld __cnfn max(uchar x, uchar y);
+char2 __ovld __cnfn max(char2 x, char2 y);
+uchar2 __ovld __cnfn max(uchar2 x, uchar2 y);
+char3 __ovld __cnfn max(char3 x, char3 y);
+uchar3 __ovld __cnfn max(uchar3 x, uchar3 y);
+char4 __ovld __cnfn max(char4 x, char4 y);
+uchar4 __ovld __cnfn max(uchar4 x, uchar4 y);
+char8 __ovld __cnfn max(char8 x, char8 y);
+uchar8 __ovld __cnfn max(uchar8 x, uchar8 y);
+char16 __ovld __cnfn max(char16 x, char16 y);
+uchar16 __ovld __cnfn max(uchar16 x, uchar16 y);
+short __ovld __cnfn max(short x, short y);
+ushort __ovld __cnfn max(ushort x, ushort y);
+short2 __ovld __cnfn max(short2 x, short2 y);
+ushort2 __ovld __cnfn max(ushort2 x, ushort2 y);
+short3 __ovld __cnfn max(short3 x, short3 y);
+ushort3 __ovld __cnfn max(ushort3 x, ushort3 y);
+short4 __ovld __cnfn max(short4 x, short4 y);
+ushort4 __ovld __cnfn max(ushort4 x, ushort4 y);
+short8 __ovld __cnfn max(short8 x, short8 y);
+ushort8 __ovld __cnfn max(ushort8 x, ushort8 y);
+short16 __ovld __cnfn max(short16 x, short16 y);
+ushort16 __ovld __cnfn max(ushort16 x, ushort16 y);
+int __ovld __cnfn max(int x, int y);
+uint __ovld __cnfn max(uint x, uint y);
+int2 __ovld __cnfn max(int2 x, int2 y);
+uint2 __ovld __cnfn max(uint2 x, uint2 y);
+int3 __ovld __cnfn max(int3 x, int3 y);
+uint3 __ovld __cnfn max(uint3 x, uint3 y);
+int4 __ovld __cnfn max(int4 x, int4 y);
+uint4 __ovld __cnfn max(uint4 x, uint4 y);
+int8 __ovld __cnfn max(int8 x, int8 y);
+uint8 __ovld __cnfn max(uint8 x, uint8 y);
+int16 __ovld __cnfn max(int16 x, int16 y);
+uint16 __ovld __cnfn max(uint16 x, uint16 y);
+long __ovld __cnfn max(long x, long y);
+ulong __ovld __cnfn max(ulong x, ulong y);
+long2 __ovld __cnfn max(long2 x, long2 y);
+ulong2 __ovld __cnfn max(ulong2 x, ulong2 y);
+long3 __ovld __cnfn max(long3 x, long3 y);
+ulong3 __ovld __cnfn max(ulong3 x, ulong3 y);
+long4 __ovld __cnfn max(long4 x, long4 y);
+ulong4 __ovld __cnfn max(ulong4 x, ulong4 y);
+long8 __ovld __cnfn max(long8 x, long8 y);
+ulong8 __ovld __cnfn max(ulong8 x, ulong8 y);
+long16 __ovld __cnfn max(long16 x, long16 y);
+ulong16 __ovld __cnfn max(ulong16 x, ulong16 y);
+char __ovld __cnfn max(char x, char y);
+uchar __ovld __cnfn max(uchar x, uchar y);
+char2 __ovld __cnfn max(char2 x, char y);
+uchar2 __ovld __cnfn max(uchar2 x, uchar y);
+char3 __ovld __cnfn max(char3 x, char y);
+uchar3 __ovld __cnfn max(uchar3 x, uchar y);
+char4 __ovld __cnfn max(char4 x, char y);
+uchar4 __ovld __cnfn max(uchar4 x, uchar y);
+char8 __ovld __cnfn max(char8 x, char y);
+uchar8 __ovld __cnfn max(uchar8 x, uchar y);
+char16 __ovld __cnfn max(char16 x, char y);
+uchar16 __ovld __cnfn max(uchar16 x, uchar y);
+short __ovld __cnfn max(short x, short y);
+ushort __ovld __cnfn max(ushort x, ushort y);
+short2 __ovld __cnfn max(short2 x, short y);
+ushort2 __ovld __cnfn max(ushort2 x, ushort y);
+short3 __ovld __cnfn max(short3 x, short y);
+ushort3 __ovld __cnfn max(ushort3 x, ushort y);
+short4 __ovld __cnfn max(short4 x, short y);
+ushort4 __ovld __cnfn max(ushort4 x, ushort y);
+short8 __ovld __cnfn max(short8 x, short y);
+ushort8 __ovld __cnfn max(ushort8 x, ushort y);
+short16 __ovld __cnfn max(short16 x, short y);
+ushort16 __ovld __cnfn max(ushort16 x, ushort y);
+int __ovld __cnfn max(int x, int y);
+uint __ovld __cnfn max(uint x, uint y);
+int2 __ovld __cnfn max(int2 x, int y);
+uint2 __ovld __cnfn max(uint2 x, uint y);
+int3 __ovld __cnfn max(int3 x, int y);
+uint3 __ovld __cnfn max(uint3 x, uint y);
+int4 __ovld __cnfn max(int4 x, int y);
+uint4 __ovld __cnfn max(uint4 x, uint y);
+int8 __ovld __cnfn max(int8 x, int y);
+uint8 __ovld __cnfn max(uint8 x, uint y);
+int16 __ovld __cnfn max(int16 x, int y);
+uint16 __ovld __cnfn max(uint16 x, uint y);
+long __ovld __cnfn max(long x, long y);
+ulong __ovld __cnfn max(ulong x, ulong y);
+long2 __ovld __cnfn max(long2 x, long y);
+ulong2 __ovld __cnfn max(ulong2 x, ulong y);
+long3 __ovld __cnfn max(long3 x, long y);
+ulong3 __ovld __cnfn max(ulong3 x, ulong y);
+long4 __ovld __cnfn max(long4 x, long y);
+ulong4 __ovld __cnfn max(ulong4 x, ulong y);
+long8 __ovld __cnfn max(long8 x, long y);
+ulong8 __ovld __cnfn max(ulong8 x, ulong y);
+long16 __ovld __cnfn max(long16 x, long y);
+ulong16 __ovld __cnfn max(ulong16 x, ulong y);
+
+/**
+ * Returns y if y < x, otherwise it returns x.
+ */
+char __ovld __cnfn min(char x, char y);
+uchar __ovld __cnfn min(uchar x, uchar y);
+char2 __ovld __cnfn min(char2 x, char2 y);
+uchar2 __ovld __cnfn min(uchar2 x, uchar2 y);
+char3 __ovld __cnfn min(char3 x, char3 y);
+uchar3 __ovld __cnfn min(uchar3 x, uchar3 y);
+char4 __ovld __cnfn min(char4 x, char4 y);
+uchar4 __ovld __cnfn min(uchar4 x, uchar4 y);
+char8 __ovld __cnfn min(char8 x, char8 y);
+uchar8 __ovld __cnfn min(uchar8 x, uchar8 y);
+char16 __ovld __cnfn min(char16 x, char16 y);
+uchar16 __ovld __cnfn min(uchar16 x, uchar16 y);
+short __ovld __cnfn min(short x, short y);
+ushort __ovld __cnfn min(ushort x, ushort y);
+short2 __ovld __cnfn min(short2 x, short2 y);
+ushort2 __ovld __cnfn min(ushort2 x, ushort2 y);
+short3 __ovld __cnfn min(short3 x, short3 y);
+ushort3 __ovld __cnfn min(ushort3 x, ushort3 y);
+short4 __ovld __cnfn min(short4 x, short4 y);
+ushort4 __ovld __cnfn min(ushort4 x, ushort4 y);
+short8 __ovld __cnfn min(short8 x, short8 y);
+ushort8 __ovld __cnfn min(ushort8 x, ushort8 y);
+short16 __ovld __cnfn min(short16 x, short16 y);
+ushort16 __ovld __cnfn min(ushort16 x, ushort16 y);
+int __ovld __cnfn min(int x, int y);
+uint __ovld __cnfn min(uint x, uint y);
+int2 __ovld __cnfn min(int2 x, int2 y);
+uint2 __ovld __cnfn min(uint2 x, uint2 y);
+int3 __ovld __cnfn min(int3 x, int3 y);
+uint3 __ovld __cnfn min(uint3 x, uint3 y);
+int4 __ovld __cnfn min(int4 x, int4 y);
+uint4 __ovld __cnfn min(uint4 x, uint4 y);
+int8 __ovld __cnfn min(int8 x, int8 y);
+uint8 __ovld __cnfn min(uint8 x, uint8 y);
+int16 __ovld __cnfn min(int16 x, int16 y);
+uint16 __ovld __cnfn min(uint16 x, uint16 y);
+long __ovld __cnfn min(long x, long y);
+ulong __ovld __cnfn min(ulong x, ulong y);
+long2 __ovld __cnfn min(long2 x, long2 y);
+ulong2 __ovld __cnfn min(ulong2 x, ulong2 y);
+long3 __ovld __cnfn min(long3 x, long3 y);
+ulong3 __ovld __cnfn min(ulong3 x, ulong3 y);
+long4 __ovld __cnfn min(long4 x, long4 y);
+ulong4 __ovld __cnfn min(ulong4 x, ulong4 y);
+long8 __ovld __cnfn min(long8 x, long8 y);
+ulong8 __ovld __cnfn min(ulong8 x, ulong8 y);
+long16 __ovld __cnfn min(long16 x, long16 y);
+ulong16 __ovld __cnfn min(ulong16 x, ulong16 y);
+char __ovld __cnfn min(char x, char y);
+uchar __ovld __cnfn min(uchar x, uchar y);
+char2 __ovld __cnfn min(char2 x, char y);
+uchar2 __ovld __cnfn min(uchar2 x, uchar y);
+char3 __ovld __cnfn min(char3 x, char y);
+uchar3 __ovld __cnfn min(uchar3 x, uchar y);
+char4 __ovld __cnfn min(char4 x, char y);
+uchar4 __ovld __cnfn min(uchar4 x, uchar y);
+char8 __ovld __cnfn min(char8 x, char y);
+uchar8 __ovld __cnfn min(uchar8 x, uchar y);
+char16 __ovld __cnfn min(char16 x, char y);
+uchar16 __ovld __cnfn min(uchar16 x, uchar y);
+short __ovld __cnfn min(short x, short y);
+ushort __ovld __cnfn min(ushort x, ushort y);
+short2 __ovld __cnfn min(short2 x, short y);
+ushort2 __ovld __cnfn min(ushort2 x, ushort y);
+short3 __ovld __cnfn min(short3 x, short y);
+ushort3 __ovld __cnfn min(ushort3 x, ushort y);
+short4 __ovld __cnfn min(short4 x, short y);
+ushort4 __ovld __cnfn min(ushort4 x, ushort y);
+short8 __ovld __cnfn min(short8 x, short y);
+ushort8 __ovld __cnfn min(ushort8 x, ushort y);
+short16 __ovld __cnfn min(short16 x, short y);
+ushort16 __ovld __cnfn min(ushort16 x, ushort y);
+int __ovld __cnfn min(int x, int y);
+uint __ovld __cnfn min(uint x, uint y);
+int2 __ovld __cnfn min(int2 x, int y);
+uint2 __ovld __cnfn min(uint2 x, uint y);
+int3 __ovld __cnfn min(int3 x, int y);
+uint3 __ovld __cnfn min(uint3 x, uint y);
+int4 __ovld __cnfn min(int4 x, int y);
+uint4 __ovld __cnfn min(uint4 x, uint y);
+int8 __ovld __cnfn min(int8 x, int y);
+uint8 __ovld __cnfn min(uint8 x, uint y);
+int16 __ovld __cnfn min(int16 x, int y);
+uint16 __ovld __cnfn min(uint16 x, uint y);
+long __ovld __cnfn min(long x, long y);
+ulong __ovld __cnfn min(ulong x, ulong y);
+long2 __ovld __cnfn min(long2 x, long y);
+ulong2 __ovld __cnfn min(ulong2 x, ulong y);
+long3 __ovld __cnfn min(long3 x, long y);
+ulong3 __ovld __cnfn min(ulong3 x, ulong y);
+long4 __ovld __cnfn min(long4 x, long y);
+ulong4 __ovld __cnfn min(ulong4 x, ulong y);
+long8 __ovld __cnfn min(long8 x, long y);
+ulong8 __ovld __cnfn min(ulong8 x, ulong y);
+long16 __ovld __cnfn min(long16 x, long y);
+ulong16 __ovld __cnfn min(ulong16 x, ulong y);
+
+/**
+ * Computes x * y and returns the high half of the
+ * product of x and y.
+ */
+char __ovld __cnfn mul_hi(char x, char y);
+uchar __ovld __cnfn mul_hi(uchar x, uchar y);
+char2 __ovld __cnfn mul_hi(char2 x, char2 y);
+uchar2 __ovld __cnfn mul_hi(uchar2 x, uchar2 y);
+char3 __ovld __cnfn mul_hi(char3 x, char3 y);
+uchar3 __ovld __cnfn mul_hi(uchar3 x, uchar3 y);
+char4 __ovld __cnfn mul_hi(char4 x, char4 y);
+uchar4 __ovld __cnfn mul_hi(uchar4 x, uchar4 y);
+char8 __ovld __cnfn mul_hi(char8 x, char8 y);
+uchar8 __ovld __cnfn mul_hi(uchar8 x, uchar8 y);
+char16 __ovld __cnfn mul_hi(char16 x, char16 y);
+uchar16 __ovld __cnfn mul_hi(uchar16 x, uchar16 y);
+short __ovld __cnfn mul_hi(short x, short y);
+ushort __ovld __cnfn mul_hi(ushort x, ushort y);
+short2 __ovld __cnfn mul_hi(short2 x, short2 y);
+ushort2 __ovld __cnfn mul_hi(ushort2 x, ushort2 y);
+short3 __ovld __cnfn mul_hi(short3 x, short3 y);
+ushort3 __ovld __cnfn mul_hi(ushort3 x, ushort3 y);
+short4 __ovld __cnfn mul_hi(short4 x, short4 y);
+ushort4 __ovld __cnfn mul_hi(ushort4 x, ushort4 y);
+short8 __ovld __cnfn mul_hi(short8 x, short8 y);
+ushort8 __ovld __cnfn mul_hi(ushort8 x, ushort8 y);
+short16 __ovld __cnfn mul_hi(short16 x, short16 y);
+ushort16 __ovld __cnfn mul_hi(ushort16 x, ushort16 y);
+int __ovld __cnfn mul_hi(int x, int y);
+uint __ovld __cnfn mul_hi(uint x, uint y);
+int2 __ovld __cnfn mul_hi(int2 x, int2 y);
+uint2 __ovld __cnfn mul_hi(uint2 x, uint2 y);
+int3 __ovld __cnfn mul_hi(int3 x, int3 y);
+uint3 __ovld __cnfn mul_hi(uint3 x, uint3 y);
+int4 __ovld __cnfn mul_hi(int4 x, int4 y);
+uint4 __ovld __cnfn mul_hi(uint4 x, uint4 y);
+int8 __ovld __cnfn mul_hi(int8 x, int8 y);
+uint8 __ovld __cnfn mul_hi(uint8 x, uint8 y);
+int16 __ovld __cnfn mul_hi(int16 x, int16 y);
+uint16 __ovld __cnfn mul_hi(uint16 x, uint16 y);
+long __ovld __cnfn mul_hi(long x, long y);
+ulong __ovld __cnfn mul_hi(ulong x, ulong y);
+long2 __ovld __cnfn mul_hi(long2 x, long2 y);
+ulong2 __ovld __cnfn mul_hi(ulong2 x, ulong2 y);
+long3 __ovld __cnfn mul_hi(long3 x, long3 y);
+ulong3 __ovld __cnfn mul_hi(ulong3 x, ulong3 y);
+long4 __ovld __cnfn mul_hi(long4 x, long4 y);
+ulong4 __ovld __cnfn mul_hi(ulong4 x, ulong4 y);
+long8 __ovld __cnfn mul_hi(long8 x, long8 y);
+ulong8 __ovld __cnfn mul_hi(ulong8 x, ulong8 y);
+long16 __ovld __cnfn mul_hi(long16 x, long16 y);
+ulong16 __ovld __cnfn mul_hi(ulong16 x, ulong16 y);
+
+/**
+ * For each element in v, the bits are shifted left by
+ * the number of bits given by the corresponding
+ * element in i (subject to usual shift modulo rules
+ * described in section 6.3). Bits shifted off the left
+ * side of the element are shifted back in from the
+ * right.
+ */
+char __ovld __cnfn rotate(char v, char i);
+uchar __ovld __cnfn rotate(uchar v, uchar i);
+char2 __ovld __cnfn rotate(char2 v, char2 i);
+uchar2 __ovld __cnfn rotate(uchar2 v, uchar2 i);
+char3 __ovld __cnfn rotate(char3 v, char3 i);
+uchar3 __ovld __cnfn rotate(uchar3 v, uchar3 i);
+char4 __ovld __cnfn rotate(char4 v, char4 i);
+uchar4 __ovld __cnfn rotate(uchar4 v, uchar4 i);
+char8 __ovld __cnfn rotate(char8 v, char8 i);
+uchar8 __ovld __cnfn rotate(uchar8 v, uchar8 i);
+char16 __ovld __cnfn rotate(char16 v, char16 i);
+uchar16 __ovld __cnfn rotate(uchar16 v, uchar16 i);
+short __ovld __cnfn rotate(short v, short i);
+ushort __ovld __cnfn rotate(ushort v, ushort i);
+short2 __ovld __cnfn rotate(short2 v, short2 i);
+ushort2 __ovld __cnfn rotate(ushort2 v, ushort2 i);
+short3 __ovld __cnfn rotate(short3 v, short3 i);
+ushort3 __ovld __cnfn rotate(ushort3 v, ushort3 i);
+short4 __ovld __cnfn rotate(short4 v, short4 i);
+ushort4 __ovld __cnfn rotate(ushort4 v, ushort4 i);
+short8 __ovld __cnfn rotate(short8 v, short8 i);
+ushort8 __ovld __cnfn rotate(ushort8 v, ushort8 i);
+short16 __ovld __cnfn rotate(short16 v, short16 i);
+ushort16 __ovld __cnfn rotate(ushort16 v, ushort16 i);
+int __ovld __cnfn rotate(int v, int i);
+uint __ovld __cnfn rotate(uint v, uint i);
+int2 __ovld __cnfn rotate(int2 v, int2 i);
+uint2 __ovld __cnfn rotate(uint2 v, uint2 i);
+int3 __ovld __cnfn rotate(int3 v, int3 i);
+uint3 __ovld __cnfn rotate(uint3 v, uint3 i);
+int4 __ovld __cnfn rotate(int4 v, int4 i);
+uint4 __ovld __cnfn rotate(uint4 v, uint4 i);
+int8 __ovld __cnfn rotate(int8 v, int8 i);
+uint8 __ovld __cnfn rotate(uint8 v, uint8 i);
+int16 __ovld __cnfn rotate(int16 v, int16 i);
+uint16 __ovld __cnfn rotate(uint16 v, uint16 i);
+long __ovld __cnfn rotate(long v, long i);
+ulong __ovld __cnfn rotate(ulong v, ulong i);
+long2 __ovld __cnfn rotate(long2 v, long2 i);
+ulong2 __ovld __cnfn rotate(ulong2 v, ulong2 i);
+long3 __ovld __cnfn rotate(long3 v, long3 i);
+ulong3 __ovld __cnfn rotate(ulong3 v, ulong3 i);
+long4 __ovld __cnfn rotate(long4 v, long4 i);
+ulong4 __ovld __cnfn rotate(ulong4 v, ulong4 i);
+long8 __ovld __cnfn rotate(long8 v, long8 i);
+ulong8 __ovld __cnfn rotate(ulong8 v, ulong8 i);
+long16 __ovld __cnfn rotate(long16 v, long16 i);
+ulong16 __ovld __cnfn rotate(ulong16 v, ulong16 i);
+
+/**
+ * Returns x - y and saturates the result.
+ */
+char __ovld __cnfn sub_sat(char x, char y);
+uchar __ovld __cnfn sub_sat(uchar x, uchar y);
+char2 __ovld __cnfn sub_sat(char2 x, char2 y);
+uchar2 __ovld __cnfn sub_sat(uchar2 x, uchar2 y);
+char3 __ovld __cnfn sub_sat(char3 x, char3 y);
+uchar3 __ovld __cnfn sub_sat(uchar3 x, uchar3 y);
+char4 __ovld __cnfn sub_sat(char4 x, char4 y);
+uchar4 __ovld __cnfn sub_sat(uchar4 x, uchar4 y);
+char8 __ovld __cnfn sub_sat(char8 x, char8 y);
+uchar8 __ovld __cnfn sub_sat(uchar8 x, uchar8 y);
+char16 __ovld __cnfn sub_sat(char16 x, char16 y);
+uchar16 __ovld __cnfn sub_sat(uchar16 x, uchar16 y);
+short __ovld __cnfn sub_sat(short x, short y);
+ushort __ovld __cnfn sub_sat(ushort x, ushort y);
+short2 __ovld __cnfn sub_sat(short2 x, short2 y);
+ushort2 __ovld __cnfn sub_sat(ushort2 x, ushort2 y);
+short3 __ovld __cnfn sub_sat(short3 x, short3 y);
+ushort3 __ovld __cnfn sub_sat(ushort3 x, ushort3 y);
+short4 __ovld __cnfn sub_sat(short4 x, short4 y);
+ushort4 __ovld __cnfn sub_sat(ushort4 x, ushort4 y);
+short8 __ovld __cnfn sub_sat(short8 x, short8 y);
+ushort8 __ovld __cnfn sub_sat(ushort8 x, ushort8 y);
+short16 __ovld __cnfn sub_sat(short16 x, short16 y);
+ushort16 __ovld __cnfn sub_sat(ushort16 x, ushort16 y);
+int __ovld __cnfn sub_sat(int x, int y);
+uint __ovld __cnfn sub_sat(uint x, uint y);
+int2 __ovld __cnfn sub_sat(int2 x, int2 y);
+uint2 __ovld __cnfn sub_sat(uint2 x, uint2 y);
+int3 __ovld __cnfn sub_sat(int3 x, int3 y);
+uint3 __ovld __cnfn sub_sat(uint3 x, uint3 y);
+int4 __ovld __cnfn sub_sat(int4 x, int4 y);
+uint4 __ovld __cnfn sub_sat(uint4 x, uint4 y);
+int8 __ovld __cnfn sub_sat(int8 x, int8 y);
+uint8 __ovld __cnfn sub_sat(uint8 x, uint8 y);
+int16 __ovld __cnfn sub_sat(int16 x, int16 y);
+uint16 __ovld __cnfn sub_sat(uint16 x, uint16 y);
+long __ovld __cnfn sub_sat(long x, long y);
+ulong __ovld __cnfn sub_sat(ulong x, ulong y);
+long2 __ovld __cnfn sub_sat(long2 x, long2 y);
+ulong2 __ovld __cnfn sub_sat(ulong2 x, ulong2 y);
+long3 __ovld __cnfn sub_sat(long3 x, long3 y);
+ulong3 __ovld __cnfn sub_sat(ulong3 x, ulong3 y);
+long4 __ovld __cnfn sub_sat(long4 x, long4 y);
+ulong4 __ovld __cnfn sub_sat(ulong4 x, ulong4 y);
+long8 __ovld __cnfn sub_sat(long8 x, long8 y);
+ulong8 __ovld __cnfn sub_sat(ulong8 x, ulong8 y);
+long16 __ovld __cnfn sub_sat(long16 x, long16 y);
+ulong16 __ovld __cnfn sub_sat(ulong16 x, ulong16 y);
+
+/**
+ * result[i] = ((short)hi[i] << 8) | lo[i]
+ * result[i] = ((ushort)hi[i] << 8) | lo[i]
+ */
+short __ovld __cnfn upsample(char hi, uchar lo);
+ushort __ovld __cnfn upsample(uchar hi, uchar lo);
+short2 __ovld __cnfn upsample(char2 hi, uchar2 lo);
+short3 __ovld __cnfn upsample(char3 hi, uchar3 lo);
+short4 __ovld __cnfn upsample(char4 hi, uchar4 lo);
+short8 __ovld __cnfn upsample(char8 hi, uchar8 lo);
+short16 __ovld __cnfn upsample(char16 hi, uchar16 lo);
+ushort2 __ovld __cnfn upsample(uchar2 hi, uchar2 lo);
+ushort3 __ovld __cnfn upsample(uchar3 hi, uchar3 lo);
+ushort4 __ovld __cnfn upsample(uchar4 hi, uchar4 lo);
+ushort8 __ovld __cnfn upsample(uchar8 hi, uchar8 lo);
+ushort16 __ovld __cnfn upsample(uchar16 hi, uchar16 lo);
+
+/**
+ * result[i] = ((int)hi[i] << 16) | lo[i]
+ * result[i] = ((uint)hi[i] << 16) | lo[i]
+ */
+int __ovld __cnfn upsample(short hi, ushort lo);
+uint __ovld __cnfn upsample(ushort hi, ushort lo);
+int2 __ovld __cnfn upsample(short2 hi, ushort2 lo);
+int3 __ovld __cnfn upsample(short3 hi, ushort3 lo);
+int4 __ovld __cnfn upsample(short4 hi, ushort4 lo);
+int8 __ovld __cnfn upsample(short8 hi, ushort8 lo);
+int16 __ovld __cnfn upsample(short16 hi, ushort16 lo);
+uint2 __ovld __cnfn upsample(ushort2 hi, ushort2 lo);
+uint3 __ovld __cnfn upsample(ushort3 hi, ushort3 lo);
+uint4 __ovld __cnfn upsample(ushort4 hi, ushort4 lo);
+uint8 __ovld __cnfn upsample(ushort8 hi, ushort8 lo);
+uint16 __ovld __cnfn upsample(ushort16 hi, ushort16 lo);
+/**
+ * result[i] = ((long)hi[i] << 32) | lo[i]
+ * result[i] = ((ulong)hi[i] << 32) | lo[i]
+ */
+long __ovld __cnfn upsample(int hi, uint lo);
+ulong __ovld __cnfn upsample(uint hi, uint lo);
+long2 __ovld __cnfn upsample(int2 hi, uint2 lo);
+long3 __ovld __cnfn upsample(int3 hi, uint3 lo);
+long4 __ovld __cnfn upsample(int4 hi, uint4 lo);
+long8 __ovld __cnfn upsample(int8 hi, uint8 lo);
+long16 __ovld __cnfn upsample(int16 hi, uint16 lo);
+ulong2 __ovld __cnfn upsample(uint2 hi, uint2 lo);
+ulong3 __ovld __cnfn upsample(uint3 hi, uint3 lo);
+ulong4 __ovld __cnfn upsample(uint4 hi, uint4 lo);
+ulong8 __ovld __cnfn upsample(uint8 hi, uint8 lo);
+ulong16 __ovld __cnfn upsample(uint16 hi, uint16 lo);
+
+/*
+ * popcount(x): returns the number of set bit in x
+ */
+char __ovld __cnfn popcount(char x);
+uchar __ovld __cnfn popcount(uchar x);
+char2 __ovld __cnfn popcount(char2 x);
+uchar2 __ovld __cnfn popcount(uchar2 x);
+char3 __ovld __cnfn popcount(char3 x);
+uchar3 __ovld __cnfn popcount(uchar3 x);
+char4 __ovld __cnfn popcount(char4 x);
+uchar4 __ovld __cnfn popcount(uchar4 x);
+char8 __ovld __cnfn popcount(char8 x);
+uchar8 __ovld __cnfn popcount(uchar8 x);
+char16 __ovld __cnfn popcount(char16 x);
+uchar16 __ovld __cnfn popcount(uchar16 x);
+short __ovld __cnfn popcount(short x);
+ushort __ovld __cnfn popcount(ushort x);
+short2 __ovld __cnfn popcount(short2 x);
+ushort2 __ovld __cnfn popcount(ushort2 x);
+short3 __ovld __cnfn popcount(short3 x);
+ushort3 __ovld __cnfn popcount(ushort3 x);
+short4 __ovld __cnfn popcount(short4 x);
+ushort4 __ovld __cnfn popcount(ushort4 x);
+short8 __ovld __cnfn popcount(short8 x);
+ushort8 __ovld __cnfn popcount(ushort8 x);
+short16 __ovld __cnfn popcount(short16 x);
+ushort16 __ovld __cnfn popcount(ushort16 x);
+int __ovld __cnfn popcount(int x);
+uint __ovld __cnfn popcount(uint x);
+int2 __ovld __cnfn popcount(int2 x);
+uint2 __ovld __cnfn popcount(uint2 x);
+int3 __ovld __cnfn popcount(int3 x);
+uint3 __ovld __cnfn popcount(uint3 x);
+int4 __ovld __cnfn popcount(int4 x);
+uint4 __ovld __cnfn popcount(uint4 x);
+int8 __ovld __cnfn popcount(int8 x);
+uint8 __ovld __cnfn popcount(uint8 x);
+int16 __ovld __cnfn popcount(int16 x);
+uint16 __ovld __cnfn popcount(uint16 x);
+long __ovld __cnfn popcount(long x);
+ulong __ovld __cnfn popcount(ulong x);
+long2 __ovld __cnfn popcount(long2 x);
+ulong2 __ovld __cnfn popcount(ulong2 x);
+long3 __ovld __cnfn popcount(long3 x);
+ulong3 __ovld __cnfn popcount(ulong3 x);
+long4 __ovld __cnfn popcount(long4 x);
+ulong4 __ovld __cnfn popcount(ulong4 x);
+long8 __ovld __cnfn popcount(long8 x);
+ulong8 __ovld __cnfn popcount(ulong8 x);
+long16 __ovld __cnfn popcount(long16 x);
+ulong16 __ovld __cnfn popcount(ulong16 x);
+
+/**
+ * Multiply two 24-bit integer values x and y and add
+ * the 32-bit integer result to the 32-bit integer z.
+ * Refer to definition of mul24 to see how the 24-bit
+ * integer multiplication is performed.
+ */
+int __ovld __cnfn mad24(int x, int y, int z);
+uint __ovld __cnfn mad24(uint x, uint y, uint z);
+int2 __ovld __cnfn mad24(int2 x, int2 y, int2 z);
+uint2 __ovld __cnfn mad24(uint2 x, uint2 y, uint2 z);
+int3 __ovld __cnfn mad24(int3 x, int3 y, int3 z);
+uint3 __ovld __cnfn mad24(uint3 x, uint3 y, uint3 z);
+int4 __ovld __cnfn mad24(int4 x, int4 y, int4 z);
+uint4 __ovld __cnfn mad24(uint4 x, uint4 y, uint4 z);
+int8 __ovld __cnfn mad24(int8 x, int8 y, int8 z);
+uint8 __ovld __cnfn mad24(uint8 x, uint8 y, uint8 z);
+int16 __ovld __cnfn mad24(int16 x, int16 y, int16 z);
+uint16 __ovld __cnfn mad24(uint16 x, uint16 y, uint16 z);
+
+/**
+ * Multiply two 24-bit integer values x and y. x and y
+ * are 32-bit integers but only the low 24-bits are used
+ * to perform the multiplication. mul24 should only
+ * be used when values in x and y are in the range [-
+ * 2^23, 2^23-1] if x and y are signed integers and in the
+ * range [0, 2^24-1] if x and y are unsigned integers. If
+ * x and y are not in this range, the multiplication
+ * result is implementation-defined.
+ */
+int __ovld __cnfn mul24(int x, int y);
+uint __ovld __cnfn mul24(uint x, uint y);
+int2 __ovld __cnfn mul24(int2 x, int2 y);
+uint2 __ovld __cnfn mul24(uint2 x, uint2 y);
+int3 __ovld __cnfn mul24(int3 x, int3 y);
+uint3 __ovld __cnfn mul24(uint3 x, uint3 y);
+int4 __ovld __cnfn mul24(int4 x, int4 y);
+uint4 __ovld __cnfn mul24(uint4 x, uint4 y);
+int8 __ovld __cnfn mul24(int8 x, int8 y);
+uint8 __ovld __cnfn mul24(uint8 x, uint8 y);
+int16 __ovld __cnfn mul24(int16 x, int16 y);
+uint16 __ovld __cnfn mul24(uint16 x, uint16 y);
+
+// OpenCL v1.1 s6.11.4, v1.2 s6.12.4, v2.0 s6.13.4 - Common Functions
+
+/**
+ * Returns fmin(fmax(x, minval), maxval).
+ * Results are undefined if minval > maxval.
+ */
+float __ovld __cnfn clamp(float x, float minval, float maxval);
+float2 __ovld __cnfn clamp(float2 x, float2 minval, float2 maxval);
+float3 __ovld __cnfn clamp(float3 x, float3 minval, float3 maxval);
+float4 __ovld __cnfn clamp(float4 x, float4 minval, float4 maxval);
+float8 __ovld __cnfn clamp(float8 x, float8 minval, float8 maxval);
+float16 __ovld __cnfn clamp(float16 x, float16 minval, float16 maxval);
+float2 __ovld __cnfn clamp(float2 x, float minval, float maxval);
+float3 __ovld __cnfn clamp(float3 x, float minval, float maxval);
+float4 __ovld __cnfn clamp(float4 x, float minval, float maxval);
+float8 __ovld __cnfn clamp(float8 x, float minval, float maxval);
+float16 __ovld __cnfn clamp(float16 x, float minval, float maxval);
+#ifdef cl_khr_fp64
+double __ovld __cnfn clamp(double x, double minval, double maxval);
+double2 __ovld __cnfn clamp(double2 x, double2 minval, double2 maxval);
+double3 __ovld __cnfn clamp(double3 x, double3 minval, double3 maxval);
+double4 __ovld __cnfn clamp(double4 x, double4 minval, double4 maxval);
+double8 __ovld __cnfn clamp(double8 x, double8 minval, double8 maxval);
+double16 __ovld __cnfn clamp(double16 x, double16 minval, double16 maxval);
+double2 __ovld __cnfn clamp(double2 x, double minval, double maxval);
+double3 __ovld __cnfn clamp(double3 x, double minval, double maxval);
+double4 __ovld __cnfn clamp(double4 x, double minval, double maxval);
+double8 __ovld __cnfn clamp(double8 x, double minval, double maxval);
+double16 __ovld __cnfn clamp(double16 x, double minval, double maxval);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn clamp(half x, half minval, half maxval);
+half2 __ovld __cnfn clamp(half2 x, half2 minval, half2 maxval);
+half3 __ovld __cnfn clamp(half3 x, half3 minval, half3 maxval);
+half4 __ovld __cnfn clamp(half4 x, half4 minval, half4 maxval);
+half8 __ovld __cnfn clamp(half8 x, half8 minval, half8 maxval);
+half16 __ovld __cnfn clamp(half16 x, half16 minval, half16 maxval);
+half2 __ovld __cnfn clamp(half2 x, half minval, half maxval);
+half3 __ovld __cnfn clamp(half3 x, half minval, half maxval);
+half4 __ovld __cnfn clamp(half4 x, half minval, half maxval);
+half8 __ovld __cnfn clamp(half8 x, half minval, half maxval);
+half16 __ovld __cnfn clamp(half16 x, half minval, half maxval);
+#endif //cl_khr_fp16
+
+/**
+ * Converts radians to degrees, i.e. (180 / PI) *
+ * radians.
+ */
+float __ovld __cnfn degrees(float radians);
+float2 __ovld __cnfn degrees(float2 radians);
+float3 __ovld __cnfn degrees(float3 radians);
+float4 __ovld __cnfn degrees(float4 radians);
+float8 __ovld __cnfn degrees(float8 radians);
+float16 __ovld __cnfn degrees(float16 radians);
+#ifdef cl_khr_fp64
+double __ovld __cnfn degrees(double radians);
+double2 __ovld __cnfn degrees(double2 radians);
+double3 __ovld __cnfn degrees(double3 radians);
+double4 __ovld __cnfn degrees(double4 radians);
+double8 __ovld __cnfn degrees(double8 radians);
+double16 __ovld __cnfn degrees(double16 radians);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn degrees(half radians);
+half2 __ovld __cnfn degrees(half2 radians);
+half3 __ovld __cnfn degrees(half3 radians);
+half4 __ovld __cnfn degrees(half4 radians);
+half8 __ovld __cnfn degrees(half8 radians);
+half16 __ovld __cnfn degrees(half16 radians);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if x < y, otherwise it returns x. If x and y
+ * are infinite or NaN, the return values are undefined.
+ */
+float __ovld __cnfn max(float x, float y);
+float2 __ovld __cnfn max(float2 x, float2 y);
+float3 __ovld __cnfn max(float3 x, float3 y);
+float4 __ovld __cnfn max(float4 x, float4 y);
+float8 __ovld __cnfn max(float8 x, float8 y);
+float16 __ovld __cnfn max(float16 x, float16 y);
+float2 __ovld __cnfn max(float2 x, float y);
+float3 __ovld __cnfn max(float3 x, float y);
+float4 __ovld __cnfn max(float4 x, float y);
+float8 __ovld __cnfn max(float8 x, float y);
+float16 __ovld __cnfn max(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn max(double x, double y);
+double2 __ovld __cnfn max(double2 x, double2 y);
+double3 __ovld __cnfn max(double3 x, double3 y);
+double4 __ovld __cnfn max(double4 x, double4 y);
+double8 __ovld __cnfn max(double8 x, double8 y);
+double16 __ovld __cnfn max(double16 x, double16 y);
+double2 __ovld __cnfn max(double2 x, double y);
+double3 __ovld __cnfn max(double3 x, double y);
+double4 __ovld __cnfn max(double4 x, double y);
+double8 __ovld __cnfn max(double8 x, double y);
+double16 __ovld __cnfn max(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn max(half x, half y);
+half2 __ovld __cnfn max(half2 x, half2 y);
+half3 __ovld __cnfn max(half3 x, half3 y);
+half4 __ovld __cnfn max(half4 x, half4 y);
+half8 __ovld __cnfn max(half8 x, half8 y);
+half16 __ovld __cnfn max(half16 x, half16 y);
+half2 __ovld __cnfn max(half2 x, half y);
+half3 __ovld __cnfn max(half3 x, half y);
+half4 __ovld __cnfn max(half4 x, half y);
+half8 __ovld __cnfn max(half8 x, half y);
+half16 __ovld __cnfn max(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if y < x, otherwise it returns x. If x and y
+ * are infinite or NaN, the return values are undefined.
+ */
+float __ovld __cnfn min(float x, float y);
+float2 __ovld __cnfn min(float2 x, float2 y);
+float3 __ovld __cnfn min(float3 x, float3 y);
+float4 __ovld __cnfn min(float4 x, float4 y);
+float8 __ovld __cnfn min(float8 x, float8 y);
+float16 __ovld __cnfn min(float16 x, float16 y);
+float2 __ovld __cnfn min(float2 x, float y);
+float3 __ovld __cnfn min(float3 x, float y);
+float4 __ovld __cnfn min(float4 x, float y);
+float8 __ovld __cnfn min(float8 x, float y);
+float16 __ovld __cnfn min(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn min(double x, double y);
+double2 __ovld __cnfn min(double2 x, double2 y);
+double3 __ovld __cnfn min(double3 x, double3 y);
+double4 __ovld __cnfn min(double4 x, double4 y);
+double8 __ovld __cnfn min(double8 x, double8 y);
+double16 __ovld __cnfn min(double16 x, double16 y);
+double2 __ovld __cnfn min(double2 x, double y);
+double3 __ovld __cnfn min(double3 x, double y);
+double4 __ovld __cnfn min(double4 x, double y);
+double8 __ovld __cnfn min(double8 x, double y);
+double16 __ovld __cnfn min(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn min(half x, half y);
+half2 __ovld __cnfn min(half2 x, half2 y);
+half3 __ovld __cnfn min(half3 x, half3 y);
+half4 __ovld __cnfn min(half4 x, half4 y);
+half8 __ovld __cnfn min(half8 x, half8 y);
+half16 __ovld __cnfn min(half16 x, half16 y);
+half2 __ovld __cnfn min(half2 x, half y);
+half3 __ovld __cnfn min(half3 x, half y);
+half4 __ovld __cnfn min(half4 x, half y);
+half8 __ovld __cnfn min(half8 x, half y);
+half16 __ovld __cnfn min(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the linear blend of x & y implemented as:
+ * x + (y - x) * a
+ * a must be a value in the range 0.0 ... 1.0. If a is not
+ * in the range 0.0 ... 1.0, the return values are
+ * undefined.
+ */
+float __ovld __cnfn mix(float x, float y, float a);
+float2 __ovld __cnfn mix(float2 x, float2 y, float2 a);
+float3 __ovld __cnfn mix(float3 x, float3 y, float3 a);
+float4 __ovld __cnfn mix(float4 x, float4 y, float4 a);
+float8 __ovld __cnfn mix(float8 x, float8 y, float8 a);
+float16 __ovld __cnfn mix(float16 x, float16 y, float16 a);
+float2 __ovld __cnfn mix(float2 x, float2 y, float a);
+float3 __ovld __cnfn mix(float3 x, float3 y, float a);
+float4 __ovld __cnfn mix(float4 x, float4 y, float a);
+float8 __ovld __cnfn mix(float8 x, float8 y, float a);
+float16 __ovld __cnfn mix(float16 x, float16 y, float a);
+#ifdef cl_khr_fp64
+double __ovld __cnfn mix(double x, double y, double a);
+double2 __ovld __cnfn mix(double2 x, double2 y, double2 a);
+double3 __ovld __cnfn mix(double3 x, double3 y, double3 a);
+double4 __ovld __cnfn mix(double4 x, double4 y, double4 a);
+double8 __ovld __cnfn mix(double8 x, double8 y, double8 a);
+double16 __ovld __cnfn mix(double16 x, double16 y, double16 a);
+double2 __ovld __cnfn mix(double2 x, double2 y, double a);
+double3 __ovld __cnfn mix(double3 x, double3 y, double a);
+double4 __ovld __cnfn mix(double4 x, double4 y, double a);
+double8 __ovld __cnfn mix(double8 x, double8 y, double a);
+double16 __ovld __cnfn mix(double16 x, double16 y, double a);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn mix(half x, half y, half a);
+half2 __ovld __cnfn mix(half2 x, half2 y, half2 a);
+half3 __ovld __cnfn mix(half3 x, half3 y, half3 a);
+half4 __ovld __cnfn mix(half4 x, half4 y, half4 a);
+half8 __ovld __cnfn mix(half8 x, half8 y, half8 a);
+half16 __ovld __cnfn mix(half16 x, half16 y, half16 a);
+half2 __ovld __cnfn mix(half2 x, half2 y, half a);
+half3 __ovld __cnfn mix(half3 x, half3 y, half a);
+half4 __ovld __cnfn mix(half4 x, half4 y, half a);
+half8 __ovld __cnfn mix(half8 x, half8 y, half a);
+half16 __ovld __cnfn mix(half16 x, half16 y, half a);
+#endif //cl_khr_fp16
+
+/**
+ * Converts degrees to radians, i.e. (PI / 180) *
+ * degrees.
+ */
+float __ovld __cnfn radians(float degrees);
+float2 __ovld __cnfn radians(float2 degrees);
+float3 __ovld __cnfn radians(float3 degrees);
+float4 __ovld __cnfn radians(float4 degrees);
+float8 __ovld __cnfn radians(float8 degrees);
+float16 __ovld __cnfn radians(float16 degrees);
+#ifdef cl_khr_fp64
+double __ovld __cnfn radians(double degrees);
+double2 __ovld __cnfn radians(double2 degrees);
+double3 __ovld __cnfn radians(double3 degrees);
+double4 __ovld __cnfn radians(double4 degrees);
+double8 __ovld __cnfn radians(double8 degrees);
+double16 __ovld __cnfn radians(double16 degrees);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn radians(half degrees);
+half2 __ovld __cnfn radians(half2 degrees);
+half3 __ovld __cnfn radians(half3 degrees);
+half4 __ovld __cnfn radians(half4 degrees);
+half8 __ovld __cnfn radians(half8 degrees);
+half16 __ovld __cnfn radians(half16 degrees);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 0.0 if x < edge, otherwise it returns 1.0.
+ */
+float __ovld __cnfn step(float edge, float x);
+float2 __ovld __cnfn step(float2 edge, float2 x);
+float3 __ovld __cnfn step(float3 edge, float3 x);
+float4 __ovld __cnfn step(float4 edge, float4 x);
+float8 __ovld __cnfn step(float8 edge, float8 x);
+float16 __ovld __cnfn step(float16 edge, float16 x);
+float2 __ovld __cnfn step(float edge, float2 x);
+float3 __ovld __cnfn step(float edge, float3 x);
+float4 __ovld __cnfn step(float edge, float4 x);
+float8 __ovld __cnfn step(float edge, float8 x);
+float16 __ovld __cnfn step(float edge, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn step(double edge, double x);
+double2 __ovld __cnfn step(double2 edge, double2 x);
+double3 __ovld __cnfn step(double3 edge, double3 x);
+double4 __ovld __cnfn step(double4 edge, double4 x);
+double8 __ovld __cnfn step(double8 edge, double8 x);
+double16 __ovld __cnfn step(double16 edge, double16 x);
+double2 __ovld __cnfn step(double edge, double2 x);
+double3 __ovld __cnfn step(double edge, double3 x);
+double4 __ovld __cnfn step(double edge, double4 x);
+double8 __ovld __cnfn step(double edge, double8 x);
+double16 __ovld __cnfn step(double edge, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn step(half edge, half x);
+half2 __ovld __cnfn step(half2 edge, half2 x);
+half3 __ovld __cnfn step(half3 edge, half3 x);
+half4 __ovld __cnfn step(half4 edge, half4 x);
+half8 __ovld __cnfn step(half8 edge, half8 x);
+half16 __ovld __cnfn step(half16 edge, half16 x);
+half __ovld __cnfn step(half edge, half x);
+half2 __ovld __cnfn step(half edge, half2 x);
+half3 __ovld __cnfn step(half edge, half3 x);
+half4 __ovld __cnfn step(half edge, half4 x);
+half8 __ovld __cnfn step(half edge, half8 x);
+half16 __ovld __cnfn step(half edge, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 0.0 if x <= edge0 and 1.0 if x >= edge1 and
+ * performs smooth Hermite interpolation between 0
+ * and 1when edge0 < x < edge1. This is useful in
+ * cases where you would want a threshold function
+ * with a smooth transition.
+ * This is equivalent to:
+ * gentype t;
+ * t = clamp ((x - edge0) / (edge1 - edge0), 0, 1);
+ * return t * t * (3 - 2 * t);
+ * Results are undefined if edge0 >= edge1 or if x,
+ * edge0 or edge1 is a NaN.
+ */
+float __ovld __cnfn smoothstep(float edge0, float edge1, float x);
+float2 __ovld __cnfn smoothstep(float2 edge0, float2 edge1, float2 x);
+float3 __ovld __cnfn smoothstep(float3 edge0, float3 edge1, float3 x);
+float4 __ovld __cnfn smoothstep(float4 edge0, float4 edge1, float4 x);
+float8 __ovld __cnfn smoothstep(float8 edge0, float8 edge1, float8 x);
+float16 __ovld __cnfn smoothstep(float16 edge0, float16 edge1, float16 x);
+float2 __ovld __cnfn smoothstep(float edge0, float edge1, float2 x);
+float3 __ovld __cnfn smoothstep(float edge0, float edge1, float3 x);
+float4 __ovld __cnfn smoothstep(float edge0, float edge1, float4 x);
+float8 __ovld __cnfn smoothstep(float edge0, float edge1, float8 x);
+float16 __ovld __cnfn smoothstep(float edge0, float edge1, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn smoothstep(double edge0, double edge1, double x);
+double2 __ovld __cnfn smoothstep(double2 edge0, double2 edge1, double2 x);
+double3 __ovld __cnfn smoothstep(double3 edge0, double3 edge1, double3 x);
+double4 __ovld __cnfn smoothstep(double4 edge0, double4 edge1, double4 x);
+double8 __ovld __cnfn smoothstep(double8 edge0, double8 edge1, double8 x);
+double16 __ovld __cnfn smoothstep(double16 edge0, double16 edge1, double16 x);
+double2 __ovld __cnfn smoothstep(double edge0, double edge1, double2 x);
+double3 __ovld __cnfn smoothstep(double edge0, double edge1, double3 x);
+double4 __ovld __cnfn smoothstep(double edge0, double edge1, double4 x);
+double8 __ovld __cnfn smoothstep(double edge0, double edge1, double8 x);
+double16 __ovld __cnfn smoothstep(double edge0, double edge1, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn smoothstep(half edge0, half edge1, half x);
+half2 __ovld __cnfn smoothstep(half2 edge0, half2 edge1, half2 x);
+half3 __ovld __cnfn smoothstep(half3 edge0, half3 edge1, half3 x);
+half4 __ovld __cnfn smoothstep(half4 edge0, half4 edge1, half4 x);
+half8 __ovld __cnfn smoothstep(half8 edge0, half8 edge1, half8 x);
+half16 __ovld __cnfn smoothstep(half16 edge0, half16 edge1, half16 x);
+half __ovld __cnfn smoothstep(half edge0, half edge1, half x);
+half2 __ovld __cnfn smoothstep(half edge0, half edge1, half2 x);
+half3 __ovld __cnfn smoothstep(half edge0, half edge1, half3 x);
+half4 __ovld __cnfn smoothstep(half edge0, half edge1, half4 x);
+half8 __ovld __cnfn smoothstep(half edge0, half edge1, half8 x);
+half16 __ovld __cnfn smoothstep(half edge0, half edge1, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 1.0 if x > 0, -0.0 if x = -0.0, +0.0 if x =
+ * +0.0, or -1.0 if x < 0. Returns 0.0 if x is a NaN.
+ */
+float __ovld __cnfn sign(float x);
+float2 __ovld __cnfn sign(float2 x);
+float3 __ovld __cnfn sign(float3 x);
+float4 __ovld __cnfn sign(float4 x);
+float8 __ovld __cnfn sign(float8 x);
+float16 __ovld __cnfn sign(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sign(double x);
+double2 __ovld __cnfn sign(double2 x);
+double3 __ovld __cnfn sign(double3 x);
+double4 __ovld __cnfn sign(double4 x);
+double8 __ovld __cnfn sign(double8 x);
+double16 __ovld __cnfn sign(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sign(half x);
+half2 __ovld __cnfn sign(half2 x);
+half3 __ovld __cnfn sign(half3 x);
+half4 __ovld __cnfn sign(half4 x);
+half8 __ovld __cnfn sign(half8 x);
+half16 __ovld __cnfn sign(half16 x);
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.11.5, v1.2 s6.12.5, v2.0 s6.13.5 - Geometric Functions
+
+/**
+ * Returns the cross product of p0.xyz and p1.xyz. The
+ * w component of float4 result returned will be 0.0.
+ */
+float4 __ovld __cnfn cross(float4 p0, float4 p1);
+float3 __ovld __cnfn cross(float3 p0, float3 p1);
+#ifdef cl_khr_fp64
+double4 __ovld __cnfn cross(double4 p0, double4 p1);
+double3 __ovld __cnfn cross(double3 p0, double3 p1);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half4 __ovld __cnfn cross(half4 p0, half4 p1);
+half3 __ovld __cnfn cross(half3 p0, half3 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Compute dot product.
+ */
+float __ovld __cnfn dot(float p0, float p1);
+float __ovld __cnfn dot(float2 p0, float2 p1);
+float __ovld __cnfn dot(float3 p0, float3 p1);
+float __ovld __cnfn dot(float4 p0, float4 p1);
+#ifdef cl_khr_fp64
+double __ovld __cnfn dot(double p0, double p1);
+double __ovld __cnfn dot(double2 p0, double2 p1);
+double __ovld __cnfn dot(double3 p0, double3 p1);
+double __ovld __cnfn dot(double4 p0, double4 p1);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn dot(half p0, half p1);
+half __ovld __cnfn dot(half2 p0, half2 p1);
+half __ovld __cnfn dot(half3 p0, half3 p1);
+half __ovld __cnfn dot(half4 p0, half4 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the distance between p0 and p1. This is
+ * calculated as length(p0 - p1).
+ */
+float __ovld __cnfn distance(float p0, float p1);
+float __ovld __cnfn distance(float2 p0, float2 p1);
+float __ovld __cnfn distance(float3 p0, float3 p1);
+float __ovld __cnfn distance(float4 p0, float4 p1);
+#ifdef cl_khr_fp64
+double __ovld __cnfn distance(double p0, double p1);
+double __ovld __cnfn distance(double2 p0, double2 p1);
+double __ovld __cnfn distance(double3 p0, double3 p1);
+double __ovld __cnfn distance(double4 p0, double4 p1);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn distance(half p0, half p1);
+half __ovld __cnfn distance(half2 p0, half2 p1);
+half __ovld __cnfn distance(half3 p0, half3 p1);
+half __ovld __cnfn distance(half4 p0, half4 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Return the length of vector p, i.e.,
+ * sqrt(p.x2 + p.y 2 + ...)
+ */
+float __ovld __cnfn length(float p);
+float __ovld __cnfn length(float2 p);
+float __ovld __cnfn length(float3 p);
+float __ovld __cnfn length(float4 p);
+#ifdef cl_khr_fp64
+double __ovld __cnfn length(double p);
+double __ovld __cnfn length(double2 p);
+double __ovld __cnfn length(double3 p);
+double __ovld __cnfn length(double4 p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn length(half p);
+half __ovld __cnfn length(half2 p);
+half __ovld __cnfn length(half3 p);
+half __ovld __cnfn length(half4 p);
+#endif //cl_khr_fp16
+
+/**
+ * Returns a vector in the same direction as p but with a
+ * length of 1.
+ */
+float __ovld __cnfn normalize(float p);
+float2 __ovld __cnfn normalize(float2 p);
+float3 __ovld __cnfn normalize(float3 p);
+float4 __ovld __cnfn normalize(float4 p);
+#ifdef cl_khr_fp64
+double __ovld __cnfn normalize(double p);
+double2 __ovld __cnfn normalize(double2 p);
+double3 __ovld __cnfn normalize(double3 p);
+double4 __ovld __cnfn normalize(double4 p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn normalize(half p);
+half2 __ovld __cnfn normalize(half2 p);
+half3 __ovld __cnfn normalize(half3 p);
+half4 __ovld __cnfn normalize(half4 p);
+#endif //cl_khr_fp16
+
+/**
+ * Returns fast_length(p0 - p1).
+ */
+float __ovld __cnfn fast_distance(float p0, float p1);
+float __ovld __cnfn fast_distance(float2 p0, float2 p1);
+float __ovld __cnfn fast_distance(float3 p0, float3 p1);
+float __ovld __cnfn fast_distance(float4 p0, float4 p1);
+#ifdef cl_khr_fp16
+half __ovld __cnfn fast_distance(half p0, half p1);
+half __ovld __cnfn fast_distance(half2 p0, half2 p1);
+half __ovld __cnfn fast_distance(half3 p0, half3 p1);
+half __ovld __cnfn fast_distance(half4 p0, half4 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the length of vector p computed as:
+ * half_sqrt(p.x2 + p.y2 + ...)
+ */
+float __ovld __cnfn fast_length(float p);
+float __ovld __cnfn fast_length(float2 p);
+float __ovld __cnfn fast_length(float3 p);
+float __ovld __cnfn fast_length(float4 p);
+#ifdef cl_khr_fp16
+half __ovld __cnfn fast_length(half p);
+half __ovld __cnfn fast_length(half2 p);
+half __ovld __cnfn fast_length(half3 p);
+half __ovld __cnfn fast_length(half4 p);
+#endif //cl_khr_fp16
+
+/**
+ * Returns a vector in the same direction as p but with a
+ * length of 1. fast_normalize is computed as:
+ * p * half_rsqrt (p.x^2 + p.y^2 + ... )
+ * The result shall be within 8192 ulps error from the
+ * infinitely precise result of
+ * if (all(p == 0.0f))
+ * result = p;
+ * else
+ * result = p / sqrt (p.x^2 + p.y^2 + ...);
+ * with the following exceptions:
+ * 1) If the sum of squares is greater than FLT_MAX
+ * then the value of the floating-point values in the
+ * result vector are undefined.
+ * 2) If the sum of squares is less than FLT_MIN then
+ * the implementation may return back p.
+ * 3) If the device is in "denorms are flushed to zero"
+ * mode, individual operand elements with magnitude
+ * less than sqrt(FLT_MIN) may be flushed to zero
+ * before proceeding with the calculation.
+ */
+float __ovld __cnfn fast_normalize(float p);
+float2 __ovld __cnfn fast_normalize(float2 p);
+float3 __ovld __cnfn fast_normalize(float3 p);
+float4 __ovld __cnfn fast_normalize(float4 p);
+#ifdef cl_khr_fp16
+half __ovld __cnfn fast_normalize(half p);
+half2 __ovld __cnfn fast_normalize(half2 p);
+half3 __ovld __cnfn fast_normalize(half3 p);
+half4 __ovld __cnfn fast_normalize(half4 p);
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.11.6, v1.2 s6.12.6, v2.0 s6.13.6 - Relational Functions
+
+/**
+ * intn isequal (floatn x, floatn y)
+ * Returns the component-wise compare of x == y.
+ */
+int __ovld __cnfn isequal(float x, float y);
+int2 __ovld __cnfn isequal(float2 x, float2 y);
+int3 __ovld __cnfn isequal(float3 x, float3 y);
+int4 __ovld __cnfn isequal(float4 x, float4 y);
+int8 __ovld __cnfn isequal(float8 x, float8 y);
+int16 __ovld __cnfn isequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isequal(double x, double y);
+long2 __ovld __cnfn isequal(double2 x, double2 y);
+long3 __ovld __cnfn isequal(double3 x, double3 y);
+long4 __ovld __cnfn isequal(double4 x, double4 y);
+long8 __ovld __cnfn isequal(double8 x, double8 y);
+long16 __ovld __cnfn isequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isequal(half x, half y);
+short2 __ovld __cnfn isequal(half2 x, half2 y);
+short3 __ovld __cnfn isequal(half3 x, half3 y);
+short4 __ovld __cnfn isequal(half4 x, half4 y);
+short8 __ovld __cnfn isequal(half8 x, half8 y);
+short16 __ovld __cnfn isequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x != y.
+ */
+int __ovld __cnfn isnotequal(float x, float y);
+int2 __ovld __cnfn isnotequal(float2 x, float2 y);
+int3 __ovld __cnfn isnotequal(float3 x, float3 y);
+int4 __ovld __cnfn isnotequal(float4 x, float4 y);
+int8 __ovld __cnfn isnotequal(float8 x, float8 y);
+int16 __ovld __cnfn isnotequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isnotequal(double x, double y);
+long2 __ovld __cnfn isnotequal(double2 x, double2 y);
+long3 __ovld __cnfn isnotequal(double3 x, double3 y);
+long4 __ovld __cnfn isnotequal(double4 x, double4 y);
+long8 __ovld __cnfn isnotequal(double8 x, double8 y);
+long16 __ovld __cnfn isnotequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isnotequal(half x, half y);
+short2 __ovld __cnfn isnotequal(half2 x, half2 y);
+short3 __ovld __cnfn isnotequal(half3 x, half3 y);
+short4 __ovld __cnfn isnotequal(half4 x, half4 y);
+short8 __ovld __cnfn isnotequal(half8 x, half8 y);
+short16 __ovld __cnfn isnotequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x > y.
+ */
+int __ovld __cnfn isgreater(float x, float y);
+int2 __ovld __cnfn isgreater(float2 x, float2 y);
+int3 __ovld __cnfn isgreater(float3 x, float3 y);
+int4 __ovld __cnfn isgreater(float4 x, float4 y);
+int8 __ovld __cnfn isgreater(float8 x, float8 y);
+int16 __ovld __cnfn isgreater(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isgreater(double x, double y);
+long2 __ovld __cnfn isgreater(double2 x, double2 y);
+long3 __ovld __cnfn isgreater(double3 x, double3 y);
+long4 __ovld __cnfn isgreater(double4 x, double4 y);
+long8 __ovld __cnfn isgreater(double8 x, double8 y);
+long16 __ovld __cnfn isgreater(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isgreater(half x, half y);
+short2 __ovld __cnfn isgreater(half2 x, half2 y);
+short3 __ovld __cnfn isgreater(half3 x, half3 y);
+short4 __ovld __cnfn isgreater(half4 x, half4 y);
+short8 __ovld __cnfn isgreater(half8 x, half8 y);
+short16 __ovld __cnfn isgreater(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x >= y.
+ */
+int __ovld __cnfn isgreaterequal(float x, float y);
+int2 __ovld __cnfn isgreaterequal(float2 x, float2 y);
+int3 __ovld __cnfn isgreaterequal(float3 x, float3 y);
+int4 __ovld __cnfn isgreaterequal(float4 x, float4 y);
+int8 __ovld __cnfn isgreaterequal(float8 x, float8 y);
+int16 __ovld __cnfn isgreaterequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isgreaterequal(double x, double y);
+long2 __ovld __cnfn isgreaterequal(double2 x, double2 y);
+long3 __ovld __cnfn isgreaterequal(double3 x, double3 y);
+long4 __ovld __cnfn isgreaterequal(double4 x, double4 y);
+long8 __ovld __cnfn isgreaterequal(double8 x, double8 y);
+long16 __ovld __cnfn isgreaterequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isgreaterequal(half x, half y);
+short2 __ovld __cnfn isgreaterequal(half2 x, half2 y);
+short3 __ovld __cnfn isgreaterequal(half3 x, half3 y);
+short4 __ovld __cnfn isgreaterequal(half4 x, half4 y);
+short8 __ovld __cnfn isgreaterequal(half8 x, half8 y);
+short16 __ovld __cnfn isgreaterequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x < y.
+ */
+int __ovld __cnfn isless(float x, float y);
+int2 __ovld __cnfn isless(float2 x, float2 y);
+int3 __ovld __cnfn isless(float3 x, float3 y);
+int4 __ovld __cnfn isless(float4 x, float4 y);
+int8 __ovld __cnfn isless(float8 x, float8 y);
+int16 __ovld __cnfn isless(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isless(double x, double y);
+long2 __ovld __cnfn isless(double2 x, double2 y);
+long3 __ovld __cnfn isless(double3 x, double3 y);
+long4 __ovld __cnfn isless(double4 x, double4 y);
+long8 __ovld __cnfn isless(double8 x, double8 y);
+long16 __ovld __cnfn isless(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isless(half x, half y);
+short2 __ovld __cnfn isless(half2 x, half2 y);
+short3 __ovld __cnfn isless(half3 x, half3 y);
+short4 __ovld __cnfn isless(half4 x, half4 y);
+short8 __ovld __cnfn isless(half8 x, half8 y);
+short16 __ovld __cnfn isless(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x <= y.
+ */
+int __ovld __cnfn islessequal(float x, float y);
+int2 __ovld __cnfn islessequal(float2 x, float2 y);
+int3 __ovld __cnfn islessequal(float3 x, float3 y);
+int4 __ovld __cnfn islessequal(float4 x, float4 y);
+int8 __ovld __cnfn islessequal(float8 x, float8 y);
+int16 __ovld __cnfn islessequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn islessequal(double x, double y);
+long2 __ovld __cnfn islessequal(double2 x, double2 y);
+long3 __ovld __cnfn islessequal(double3 x, double3 y);
+long4 __ovld __cnfn islessequal(double4 x, double4 y);
+long8 __ovld __cnfn islessequal(double8 x, double8 y);
+long16 __ovld __cnfn islessequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn islessequal(half x, half y);
+short2 __ovld __cnfn islessequal(half2 x, half2 y);
+short3 __ovld __cnfn islessequal(half3 x, half3 y);
+short4 __ovld __cnfn islessequal(half4 x, half4 y);
+short8 __ovld __cnfn islessequal(half8 x, half8 y);
+short16 __ovld __cnfn islessequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of
+ * (x < y) || (x > y) .
+ */
+int __ovld __cnfn islessgreater(float x, float y);
+int2 __ovld __cnfn islessgreater(float2 x, float2 y);
+int3 __ovld __cnfn islessgreater(float3 x, float3 y);
+int4 __ovld __cnfn islessgreater(float4 x, float4 y);
+int8 __ovld __cnfn islessgreater(float8 x, float8 y);
+int16 __ovld __cnfn islessgreater(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn islessgreater(double x, double y);
+long2 __ovld __cnfn islessgreater(double2 x, double2 y);
+long3 __ovld __cnfn islessgreater(double3 x, double3 y);
+long4 __ovld __cnfn islessgreater(double4 x, double4 y);
+long8 __ovld __cnfn islessgreater(double8 x, double8 y);
+long16 __ovld __cnfn islessgreater(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn islessgreater(half x, half y);
+short2 __ovld __cnfn islessgreater(half2 x, half2 y);
+short3 __ovld __cnfn islessgreater(half3 x, half3 y);
+short4 __ovld __cnfn islessgreater(half4 x, half4 y);
+short8 __ovld __cnfn islessgreater(half8 x, half8 y);
+short16 __ovld __cnfn islessgreater(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Test for finite value.
+ */
+int __ovld __cnfn isfinite(float);
+int2 __ovld __cnfn isfinite(float2);
+int3 __ovld __cnfn isfinite(float3);
+int4 __ovld __cnfn isfinite(float4);
+int8 __ovld __cnfn isfinite(float8);
+int16 __ovld __cnfn isfinite(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isfinite(double);
+long2 __ovld __cnfn isfinite(double2);
+long3 __ovld __cnfn isfinite(double3);
+long4 __ovld __cnfn isfinite(double4);
+long8 __ovld __cnfn isfinite(double8);
+long16 __ovld __cnfn isfinite(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isfinite(half);
+short2 __ovld __cnfn isfinite(half2);
+short3 __ovld __cnfn isfinite(half3);
+short4 __ovld __cnfn isfinite(half4);
+short8 __ovld __cnfn isfinite(half8);
+short16 __ovld __cnfn isfinite(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test for infinity value (+ve or -ve) .
+ */
+int __ovld __cnfn isinf(float);
+int2 __ovld __cnfn isinf(float2);
+int3 __ovld __cnfn isinf(float3);
+int4 __ovld __cnfn isinf(float4);
+int8 __ovld __cnfn isinf(float8);
+int16 __ovld __cnfn isinf(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isinf(double);
+long2 __ovld __cnfn isinf(double2);
+long3 __ovld __cnfn isinf(double3);
+long4 __ovld __cnfn isinf(double4);
+long8 __ovld __cnfn isinf(double8);
+long16 __ovld __cnfn isinf(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isinf(half);
+short2 __ovld __cnfn isinf(half2);
+short3 __ovld __cnfn isinf(half3);
+short4 __ovld __cnfn isinf(half4);
+short8 __ovld __cnfn isinf(half8);
+short16 __ovld __cnfn isinf(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test for a NaN.
+ */
+int __ovld __cnfn isnan(float);
+int2 __ovld __cnfn isnan(float2);
+int3 __ovld __cnfn isnan(float3);
+int4 __ovld __cnfn isnan(float4);
+int8 __ovld __cnfn isnan(float8);
+int16 __ovld __cnfn isnan(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isnan(double);
+long2 __ovld __cnfn isnan(double2);
+long3 __ovld __cnfn isnan(double3);
+long4 __ovld __cnfn isnan(double4);
+long8 __ovld __cnfn isnan(double8);
+long16 __ovld __cnfn isnan(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isnan(half);
+short2 __ovld __cnfn isnan(half2);
+short3 __ovld __cnfn isnan(half3);
+short4 __ovld __cnfn isnan(half4);
+short8 __ovld __cnfn isnan(half8);
+short16 __ovld __cnfn isnan(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test for a normal value.
+ */
+int __ovld __cnfn isnormal(float);
+int2 __ovld __cnfn isnormal(float2);
+int3 __ovld __cnfn isnormal(float3);
+int4 __ovld __cnfn isnormal(float4);
+int8 __ovld __cnfn isnormal(float8);
+int16 __ovld __cnfn isnormal(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isnormal(double);
+long2 __ovld __cnfn isnormal(double2);
+long3 __ovld __cnfn isnormal(double3);
+long4 __ovld __cnfn isnormal(double4);
+long8 __ovld __cnfn isnormal(double8);
+long16 __ovld __cnfn isnormal(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isnormal(half);
+short2 __ovld __cnfn isnormal(half2);
+short3 __ovld __cnfn isnormal(half3);
+short4 __ovld __cnfn isnormal(half4);
+short8 __ovld __cnfn isnormal(half8);
+short16 __ovld __cnfn isnormal(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test if arguments are ordered. isordered() takes
+ * arguments x and y, and returns the result
+ * isequal(x, x) && isequal(y, y).
+ */
+int __ovld __cnfn isordered(float x, float y);
+int2 __ovld __cnfn isordered(float2 x, float2 y);
+int3 __ovld __cnfn isordered(float3 x, float3 y);
+int4 __ovld __cnfn isordered(float4 x, float4 y);
+int8 __ovld __cnfn isordered(float8 x, float8 y);
+int16 __ovld __cnfn isordered(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isordered(double x, double y);
+long2 __ovld __cnfn isordered(double2 x, double2 y);
+long3 __ovld __cnfn isordered(double3 x, double3 y);
+long4 __ovld __cnfn isordered(double4 x, double4 y);
+long8 __ovld __cnfn isordered(double8 x, double8 y);
+long16 __ovld __cnfn isordered(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isordered(half x, half y);
+short2 __ovld __cnfn isordered(half2 x, half2 y);
+short3 __ovld __cnfn isordered(half3 x, half3 y);
+short4 __ovld __cnfn isordered(half4 x, half4 y);
+short8 __ovld __cnfn isordered(half8 x, half8 y);
+short16 __ovld __cnfn isordered(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Test if arguments are unordered. isunordered()
+ * takes arguments x and y, returning non-zero if x or y
+ * is NaN, and zero otherwise.
+ */
+int __ovld __cnfn isunordered(float x, float y);
+int2 __ovld __cnfn isunordered(float2 x, float2 y);
+int3 __ovld __cnfn isunordered(float3 x, float3 y);
+int4 __ovld __cnfn isunordered(float4 x, float4 y);
+int8 __ovld __cnfn isunordered(float8 x, float8 y);
+int16 __ovld __cnfn isunordered(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isunordered(double x, double y);
+long2 __ovld __cnfn isunordered(double2 x, double2 y);
+long3 __ovld __cnfn isunordered(double3 x, double3 y);
+long4 __ovld __cnfn isunordered(double4 x, double4 y);
+long8 __ovld __cnfn isunordered(double8 x, double8 y);
+long16 __ovld __cnfn isunordered(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isunordered(half x, half y);
+short2 __ovld __cnfn isunordered(half2 x, half2 y);
+short3 __ovld __cnfn isunordered(half3 x, half3 y);
+short4 __ovld __cnfn isunordered(half4 x, half4 y);
+short8 __ovld __cnfn isunordered(half8 x, half8 y);
+short16 __ovld __cnfn isunordered(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Test for sign bit. The scalar version of the function
+ * returns a 1 if the sign bit in the float is set else returns
+ * 0. The vector version of the function returns the
+ * following for each component in floatn: a -1 if the
+ * sign bit in the float is set else returns 0.
+ */
+int __ovld __cnfn signbit(float);
+int2 __ovld __cnfn signbit(float2);
+int3 __ovld __cnfn signbit(float3);
+int4 __ovld __cnfn signbit(float4);
+int8 __ovld __cnfn signbit(float8);
+int16 __ovld __cnfn signbit(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn signbit(double);
+long2 __ovld __cnfn signbit(double2);
+long3 __ovld __cnfn signbit(double3);
+long4 __ovld __cnfn signbit(double4);
+long8 __ovld __cnfn signbit(double8);
+long16 __ovld __cnfn signbit(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn signbit(half);
+short2 __ovld __cnfn signbit(half2);
+short3 __ovld __cnfn signbit(half3);
+short4 __ovld __cnfn signbit(half4);
+short8 __ovld __cnfn signbit(half8);
+short16 __ovld __cnfn signbit(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 1 if the most significant bit in any component
+ * of x is set; otherwise returns 0.
+ */
+int __ovld __cnfn any(char x);
+int __ovld __cnfn any(char2 x);
+int __ovld __cnfn any(char3 x);
+int __ovld __cnfn any(char4 x);
+int __ovld __cnfn any(char8 x);
+int __ovld __cnfn any(char16 x);
+int __ovld __cnfn any(short x);
+int __ovld __cnfn any(short2 x);
+int __ovld __cnfn any(short3 x);
+int __ovld __cnfn any(short4 x);
+int __ovld __cnfn any(short8 x);
+int __ovld __cnfn any(short16 x);
+int __ovld __cnfn any(int x);
+int __ovld __cnfn any(int2 x);
+int __ovld __cnfn any(int3 x);
+int __ovld __cnfn any(int4 x);
+int __ovld __cnfn any(int8 x);
+int __ovld __cnfn any(int16 x);
+int __ovld __cnfn any(long x);
+int __ovld __cnfn any(long2 x);
+int __ovld __cnfn any(long3 x);
+int __ovld __cnfn any(long4 x);
+int __ovld __cnfn any(long8 x);
+int __ovld __cnfn any(long16 x);
+
+/**
+ * Returns 1 if the most significant bit in all components
+ * of x is set; otherwise returns 0.
+ */
+int __ovld __cnfn all(char x);
+int __ovld __cnfn all(char2 x);
+int __ovld __cnfn all(char3 x);
+int __ovld __cnfn all(char4 x);
+int __ovld __cnfn all(char8 x);
+int __ovld __cnfn all(char16 x);
+int __ovld __cnfn all(short x);
+int __ovld __cnfn all(short2 x);
+int __ovld __cnfn all(short3 x);
+int __ovld __cnfn all(short4 x);
+int __ovld __cnfn all(short8 x);
+int __ovld __cnfn all(short16 x);
+int __ovld __cnfn all(int x);
+int __ovld __cnfn all(int2 x);
+int __ovld __cnfn all(int3 x);
+int __ovld __cnfn all(int4 x);
+int __ovld __cnfn all(int8 x);
+int __ovld __cnfn all(int16 x);
+int __ovld __cnfn all(long x);
+int __ovld __cnfn all(long2 x);
+int __ovld __cnfn all(long3 x);
+int __ovld __cnfn all(long4 x);
+int __ovld __cnfn all(long8 x);
+int __ovld __cnfn all(long16 x);
+
+/**
+ * Each bit of the result is the corresponding bit of a if
+ * the corresponding bit of c is 0. Otherwise it is the
+ * corresponding bit of b.
+ */
+char __ovld __cnfn bitselect(char a, char b, char c);
+uchar __ovld __cnfn bitselect(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn bitselect(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn bitselect(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn bitselect(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn bitselect(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn bitselect(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn bitselect(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn bitselect(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn bitselect(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn bitselect(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn bitselect(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn bitselect(short a, short b, short c);
+ushort __ovld __cnfn bitselect(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn bitselect(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn bitselect(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn bitselect(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn bitselect(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn bitselect(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn bitselect(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn bitselect(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn bitselect(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn bitselect(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn bitselect(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn bitselect(int a, int b, int c);
+uint __ovld __cnfn bitselect(uint a, uint b, uint c);
+int2 __ovld __cnfn bitselect(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn bitselect(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn bitselect(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn bitselect(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn bitselect(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn bitselect(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn bitselect(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn bitselect(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn bitselect(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn bitselect(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn bitselect(long a, long b, long c);
+ulong __ovld __cnfn bitselect(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn bitselect(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn bitselect(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn bitselect(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn bitselect(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn bitselect(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn bitselect(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn bitselect(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn bitselect(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn bitselect(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn bitselect(ulong16 a, ulong16 b, ulong16 c);
+float __ovld __cnfn bitselect(float a, float b, float c);
+float2 __ovld __cnfn bitselect(float2 a, float2 b, float2 c);
+float3 __ovld __cnfn bitselect(float3 a, float3 b, float3 c);
+float4 __ovld __cnfn bitselect(float4 a, float4 b, float4 c);
+float8 __ovld __cnfn bitselect(float8 a, float8 b, float8 c);
+float16 __ovld __cnfn bitselect(float16 a, float16 b, float16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn bitselect(double a, double b, double c);
+double2 __ovld __cnfn bitselect(double2 a, double2 b, double2 c);
+double3 __ovld __cnfn bitselect(double3 a, double3 b, double3 c);
+double4 __ovld __cnfn bitselect(double4 a, double4 b, double4 c);
+double8 __ovld __cnfn bitselect(double8 a, double8 b, double8 c);
+double16 __ovld __cnfn bitselect(double16 a, double16 b, double16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn bitselect(half a, half b, half c);
+half2 __ovld __cnfn bitselect(half2 a, half2 b, half2 c);
+half3 __ovld __cnfn bitselect(half3 a, half3 b, half3 c);
+half4 __ovld __cnfn bitselect(half4 a, half4 b, half4 c);
+half8 __ovld __cnfn bitselect(half8 a, half8 b, half8 c);
+half16 __ovld __cnfn bitselect(half16 a, half16 b, half16 c);
+#endif //cl_khr_fp16
+
+/**
+ * For each component of a vector type,
+ * result[i] = if MSB of c[i] is set ? b[i] : a[i].
+ * For a scalar type, result = c ? b : a.
+ */
+char __ovld __cnfn select(char a, char b, char c);
+uchar __ovld __cnfn select(uchar a, uchar b, char c);
+char2 __ovld __cnfn select(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, char2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, char3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, char4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, char8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, char16 c);
+short __ovld __cnfn select(short a, short b, char c);
+ushort __ovld __cnfn select(ushort a, ushort b, char c);
+short2 __ovld __cnfn select(short2 a, short2 b, char2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, char2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, char3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, char3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, char4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, char4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, char8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, char8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, char16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, char16 c);
+int __ovld __cnfn select(int a, int b, char c);
+uint __ovld __cnfn select(uint a, uint b, char c);
+int2 __ovld __cnfn select(int2 a, int2 b, char2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, char2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, char3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, char3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, char4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, char4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, char8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, char8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, char16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, char16 c);
+long __ovld __cnfn select(long a, long b, char c);
+ulong __ovld __cnfn select(ulong a, ulong b, char c);
+long2 __ovld __cnfn select(long2 a, long2 b, char2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, char2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, char3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, char3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, char4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, char4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, char8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, char8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, char16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, char16 c);
+float __ovld __cnfn select(float a, float b, char c);
+float2 __ovld __cnfn select(float2 a, float2 b, char2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, char3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, char4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, char8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, char16 c);
+char __ovld __cnfn select(char a, char b, short c);
+uchar __ovld __cnfn select(uchar a, uchar b, short c);
+char2 __ovld __cnfn select(char2 a, char2 b, short2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, short2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, short3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, short3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, short4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, short4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, short8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, short8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, short16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, short16 c);
+short __ovld __cnfn select(short a, short b, short c);
+ushort __ovld __cnfn select(ushort a, ushort b, short c);
+short2 __ovld __cnfn select(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, short2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, short3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, short4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, short8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, short16 c);
+int __ovld __cnfn select(int a, int b, short c);
+uint __ovld __cnfn select(uint a, uint b, short c);
+int2 __ovld __cnfn select(int2 a, int2 b, short2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, short2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, short3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, short3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, short4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, short4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, short8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, short8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, short16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, short16 c);
+long __ovld __cnfn select(long a, long b, short c);
+ulong __ovld __cnfn select(ulong a, ulong b, short c);
+long2 __ovld __cnfn select(long2 a, long2 b, short2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, short2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, short3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, short3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, short4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, short4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, short8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, short8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, short16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, short16 c);
+float __ovld __cnfn select(float a, float b, short c);
+float2 __ovld __cnfn select(float2 a, float2 b, short2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, short3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, short4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, short8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, short16 c);
+char __ovld __cnfn select(char a, char b, int c);
+uchar __ovld __cnfn select(uchar a, uchar b, int c);
+char2 __ovld __cnfn select(char2 a, char2 b, int2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, int2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, int3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, int3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, int4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, int4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, int8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, int8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, int16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, int16 c);
+short __ovld __cnfn select(short a, short b, int c);
+ushort __ovld __cnfn select(ushort a, ushort b, int c);
+short2 __ovld __cnfn select(short2 a, short2 b, int2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, int2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, int3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, int3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, int4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, int4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, int8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, int8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, int16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, int16 c);
+int __ovld __cnfn select(int a, int b, int c);
+uint __ovld __cnfn select(uint a, uint b, int c);
+int2 __ovld __cnfn select(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, int2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, int3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, int4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, int8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, int16 c);
+long __ovld __cnfn select(long a, long b, int c);
+ulong __ovld __cnfn select(ulong a, ulong b, int c);
+long2 __ovld __cnfn select(long2 a, long2 b, int2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, int2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, int3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, int3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, int4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, int4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, int8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, int8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, int16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, int16 c);
+float __ovld __cnfn select(float a, float b, int c);
+float2 __ovld __cnfn select(float2 a, float2 b, int2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, int3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, int4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, int8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, int16 c);
+char __ovld __cnfn select(char a, char b, long c);
+uchar __ovld __cnfn select(uchar a, uchar b, long c);
+char2 __ovld __cnfn select(char2 a, char2 b, long2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, long2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, long3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, long3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, long4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, long4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, long8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, long8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, long16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, long16 c);
+short __ovld __cnfn select(short a, short b, long c);
+ushort __ovld __cnfn select(ushort a, ushort b, long c);
+short2 __ovld __cnfn select(short2 a, short2 b, long2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, long2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, long3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, long3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, long4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, long4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, long8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, long8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, long16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, long16 c);
+int __ovld __cnfn select(int a, int b, long c);
+uint __ovld __cnfn select(uint a, uint b, long c);
+int2 __ovld __cnfn select(int2 a, int2 b, long2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, long2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, long3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, long3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, long4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, long4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, long8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, long8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, long16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, long16 c);
+long __ovld __cnfn select(long a, long b, long c);
+ulong __ovld __cnfn select(ulong a, ulong b, long c);
+long2 __ovld __cnfn select(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, long2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, long3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, long4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, long8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, long16 c);
+float __ovld __cnfn select(float a, float b, long c);
+float2 __ovld __cnfn select(float2 a, float2 b, long2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, long3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, long4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, long8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, long16 c);
+char __ovld __cnfn select(char a, char b, uchar c);
+uchar __ovld __cnfn select(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn select(char2 a, char2 b, uchar2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, uchar3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, uchar4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, uchar8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, uchar16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn select(short a, short b, uchar c);
+ushort __ovld __cnfn select(ushort a, ushort b, uchar c);
+short2 __ovld __cnfn select(short2 a, short2 b, uchar2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uchar2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, uchar3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uchar3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, uchar4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uchar4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, uchar8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uchar8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, uchar16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uchar16 c);
+int __ovld __cnfn select(int a, int b, uchar c);
+uint __ovld __cnfn select(uint a, uint b, uchar c);
+int2 __ovld __cnfn select(int2 a, int2 b, uchar2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, uchar2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, uchar3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, uchar3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, uchar4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, uchar4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, uchar8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, uchar8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, uchar16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, uchar16 c);
+long __ovld __cnfn select(long a, long b, uchar c);
+ulong __ovld __cnfn select(ulong a, ulong b, uchar c);
+long2 __ovld __cnfn select(long2 a, long2 b, uchar2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uchar2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, uchar3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uchar3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, uchar4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uchar4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, uchar8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uchar8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, uchar16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uchar16 c);
+float __ovld __cnfn select(float a, float b, uchar c);
+float2 __ovld __cnfn select(float2 a, float2 b, uchar2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, uchar3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, uchar4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, uchar8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, uchar16 c);
+char __ovld __cnfn select(char a, char b, ushort c);
+uchar __ovld __cnfn select(uchar a, uchar b, ushort c);
+char2 __ovld __cnfn select(char2 a, char2 b, ushort2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ushort2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, ushort3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ushort3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, ushort4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ushort4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, ushort8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ushort8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, ushort16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ushort16 c);
+short __ovld __cnfn select(short a, short b, ushort c);
+ushort __ovld __cnfn select(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn select(short2 a, short2 b, ushort2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, ushort3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, ushort4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, ushort8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, ushort16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn select(int a, int b, ushort c);
+uint __ovld __cnfn select(uint a, uint b, ushort c);
+int2 __ovld __cnfn select(int2 a, int2 b, ushort2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, ushort2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, ushort3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, ushort3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, ushort4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, ushort4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, ushort8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, ushort8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, ushort16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, ushort16 c);
+long __ovld __cnfn select(long a, long b, ushort c);
+ulong __ovld __cnfn select(ulong a, ulong b, ushort c);
+long2 __ovld __cnfn select(long2 a, long2 b, ushort2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ushort2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, ushort3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ushort3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, ushort4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ushort4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, ushort8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ushort8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, ushort16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ushort16 c);
+float __ovld __cnfn select(float a, float b, ushort c);
+float2 __ovld __cnfn select(float2 a, float2 b, ushort2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, ushort3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, ushort4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, ushort8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, ushort16 c);
+char __ovld __cnfn select(char a, char b, uint c);
+uchar __ovld __cnfn select(uchar a, uchar b, uint c);
+char2 __ovld __cnfn select(char2 a, char2 b, uint2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uint2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, uint3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uint3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, uint4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uint4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, uint8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uint8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, uint16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uint16 c);
+short __ovld __cnfn select(short a, short b, uint c);
+ushort __ovld __cnfn select(ushort a, ushort b, uint c);
+short2 __ovld __cnfn select(short2 a, short2 b, uint2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uint2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, uint3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uint3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, uint4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uint4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, uint8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uint8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, uint16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uint16 c);
+int __ovld __cnfn select(int a, int b, uint c);
+uint __ovld __cnfn select(uint a, uint b, uint c);
+int2 __ovld __cnfn select(int2 a, int2 b, uint2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, uint3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, uint4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, uint8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, uint16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn select(long a, long b, uint c);
+ulong __ovld __cnfn select(ulong a, ulong b, uint c);
+long2 __ovld __cnfn select(long2 a, long2 b, uint2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uint2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, uint3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uint3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, uint4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uint4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, uint8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uint8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, uint16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uint16 c);
+float __ovld __cnfn select(float a, float b, uint c);
+float2 __ovld __cnfn select(float2 a, float2 b, uint2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, uint3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, uint4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, uint8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, uint16 c);
+char __ovld __cnfn select(char a, char b, ulong c);
+uchar __ovld __cnfn select(uchar a, uchar b, ulong c);
+char2 __ovld __cnfn select(char2 a, char2 b, ulong2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ulong2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, ulong3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ulong3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, ulong4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ulong4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, ulong8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ulong8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, ulong16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ulong16 c);
+short __ovld __cnfn select(short a, short b, ulong c);
+ushort __ovld __cnfn select(ushort a, ushort b, ulong c);
+short2 __ovld __cnfn select(short2 a, short2 b, ulong2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ulong2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, ulong3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ulong3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, ulong4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ulong4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, ulong8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ulong8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, ulong16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ulong16 c);
+int __ovld __cnfn select(int a, int b, ulong c);
+uint __ovld __cnfn select(uint a, uint b, ulong c);
+int2 __ovld __cnfn select(int2 a, int2 b, ulong2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, ulong2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, ulong3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, ulong3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, ulong4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, ulong4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, ulong8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, ulong8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, ulong16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, ulong16 c);
+long __ovld __cnfn select(long a, long b, ulong c);
+ulong __ovld __cnfn select(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn select(long2 a, long2 b, ulong2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, ulong3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, ulong4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, ulong8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c);
+float __ovld __cnfn select(float a, float b, ulong c);
+float2 __ovld __cnfn select(float2 a, float2 b, ulong2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, ulong3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, ulong4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, ulong8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, ulong16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn select(double a, double b, long c);
+double2 __ovld __cnfn select(double2 a, double2 b, long2 c);
+double3 __ovld __cnfn select(double3 a, double3 b, long3 c);
+double4 __ovld __cnfn select(double4 a, double4 b, long4 c);
+double8 __ovld __cnfn select(double8 a, double8 b, long8 c);
+double16 __ovld __cnfn select(double16 a, double16 b, long16 c);
+double __ovld __cnfn select(double a, double b, ulong c);
+double2 __ovld __cnfn select(double2 a, double2 b, ulong2 c);
+double3 __ovld __cnfn select(double3 a, double3 b, ulong3 c);
+double4 __ovld __cnfn select(double4 a, double4 b, ulong4 c);
+double8 __ovld __cnfn select(double8 a, double8 b, ulong8 c);
+double16 __ovld __cnfn select(double16 a, double16 b, ulong16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn select(half a, half b, short c);
+half2 __ovld __cnfn select(half2 a, half2 b, short2 c);
+half3 __ovld __cnfn select(half3 a, half3 b, short3 c);
+half4 __ovld __cnfn select(half4 a, half4 b, short4 c);
+half8 __ovld __cnfn select(half8 a, half8 b, short8 c);
+half16 __ovld __cnfn select(half16 a, half16 b, short16 c);
+half __ovld __cnfn select(half a, half b, ushort c);
+half2 __ovld __cnfn select(half2 a, half2 b, ushort2 c);
+half3 __ovld __cnfn select(half3 a, half3 b, ushort3 c);
+half4 __ovld __cnfn select(half4 a, half4 b, ushort4 c);
+half8 __ovld __cnfn select(half8 a, half8 b, ushort8 c);
+half16 __ovld __cnfn select(half16 a, half16 b, ushort16 c);
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.11.7, v1.2 s6.12.7, v2.0 s6.13.7 - Vector Data Load and Store Functions
+// OpenCL extensions v1.1 s9.6.6, v1.2 s9.5.6, v2.0 s9.4.6 - Vector Data Load and Store Functions for Half Type
+/**
+ * Use generic type gentype to indicate the built-in data types
+ * char, uchar, short, ushort, int, uint, long, ulong, float,
+ * double or half.
+ *
+ * vloadn return sizeof (gentypen) bytes of data read from address (p + (offset * n)).
+ *
+ * vstoren write sizeof (gentypen) bytes given by data to address (p + (offset * n)).
+ *
+ * The address computed as (p + (offset * n)) must be 
+ * 8-bit aligned if gentype is char, uchar;
+ * 16-bit aligned if gentype is short, ushort, half;
+ * 32-bit aligned if gentype is int, uint, float;
+ * 64-bit aligned if gentype is long, ulong, double.
+ */
+
+char2 __ovld vload2(size_t offset, const __constant char *p);
+uchar2 __ovld vload2(size_t offset, const __constant uchar *p);
+short2 __ovld vload2(size_t offset, const __constant short *p);
+ushort2 __ovld vload2(size_t offset, const __constant ushort *p);
+int2 __ovld vload2(size_t offset, const __constant int *p);
+uint2 __ovld vload2(size_t offset, const __constant uint *p);
+long2 __ovld vload2(size_t offset, const __constant long *p);
+ulong2 __ovld vload2(size_t offset, const __constant ulong *p);
+float2 __ovld vload2(size_t offset, const __constant float *p);
+char3 __ovld vload3(size_t offset, const __constant char *p);
+uchar3 __ovld vload3(size_t offset, const __constant uchar *p);
+short3 __ovld vload3(size_t offset, const __constant short *p);
+ushort3 __ovld vload3(size_t offset, const __constant ushort *p);
+int3 __ovld vload3(size_t offset, const __constant int *p);
+uint3 __ovld vload3(size_t offset, const __constant uint *p);
+long3 __ovld vload3(size_t offset, const __constant long *p);
+ulong3 __ovld vload3(size_t offset, const __constant ulong *p);
+float3 __ovld vload3(size_t offset, const __constant float *p);
+char4 __ovld vload4(size_t offset, const __constant char *p);
+uchar4 __ovld vload4(size_t offset, const __constant uchar *p);
+short4 __ovld vload4(size_t offset, const __constant short *p);
+ushort4 __ovld vload4(size_t offset, const __constant ushort *p);
+int4 __ovld vload4(size_t offset, const __constant int *p);
+uint4 __ovld vload4(size_t offset, const __constant uint *p);
+long4 __ovld vload4(size_t offset, const __constant long *p);
+ulong4 __ovld vload4(size_t offset, const __constant ulong *p);
+float4 __ovld vload4(size_t offset, const __constant float *p);
+char8 __ovld vload8(size_t offset, const __constant char *p);
+uchar8 __ovld vload8(size_t offset, const __constant uchar *p);
+short8 __ovld vload8(size_t offset, const __constant short *p);
+ushort8 __ovld vload8(size_t offset, const __constant ushort *p);
+int8 __ovld vload8(size_t offset, const __constant int *p);
+uint8 __ovld vload8(size_t offset, const __constant uint *p);
+long8 __ovld vload8(size_t offset, const __constant long *p);
+ulong8 __ovld vload8(size_t offset, const __constant ulong *p);
+float8 __ovld vload8(size_t offset, const __constant float *p);
+char16 __ovld vload16(size_t offset, const __constant char *p);
+uchar16 __ovld vload16(size_t offset, const __constant uchar *p);
+short16 __ovld vload16(size_t offset, const __constant short *p);
+ushort16 __ovld vload16(size_t offset, const __constant ushort *p);
+int16 __ovld vload16(size_t offset, const __constant int *p);
+uint16 __ovld vload16(size_t offset, const __constant uint *p);
+long16 __ovld vload16(size_t offset, const __constant long *p);
+ulong16 __ovld vload16(size_t offset, const __constant ulong *p);
+float16 __ovld vload16(size_t offset, const __constant float *p);
+#ifdef cl_khr_fp64
+double2 __ovld vload2(size_t offset, const __constant double *p);
+double3 __ovld vload3(size_t offset, const __constant double *p);
+double4 __ovld vload4(size_t offset, const __constant double *p);
+double8 __ovld vload8(size_t offset, const __constant double *p);
+double16 __ovld vload16(size_t offset, const __constant double *p);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half __ovld vload(size_t offset, const __constant half *p);
+half2 __ovld vload2(size_t offset, const __constant half *p);
+half3 __ovld vload3(size_t offset, const __constant half *p);
+half4 __ovld vload4(size_t offset, const __constant half *p);
+half8 __ovld vload8(size_t offset, const __constant half *p);
+half16 __ovld vload16(size_t offset, const __constant half *p);
+#endif //cl_khr_fp16
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+char2 __ovld vload2(size_t offset, const char *p);
+uchar2 __ovld vload2(size_t offset, const uchar *p);
+short2 __ovld vload2(size_t offset, const short *p);
+ushort2 __ovld vload2(size_t offset, const ushort *p);
+int2 __ovld vload2(size_t offset, const int *p);
+uint2 __ovld vload2(size_t offset, const uint *p);
+long2 __ovld vload2(size_t offset, const long *p);
+ulong2 __ovld vload2(size_t offset, const ulong *p);
+float2 __ovld vload2(size_t offset, const float *p);
+char3 __ovld vload3(size_t offset, const char *p);
+uchar3 __ovld vload3(size_t offset, const uchar *p);
+short3 __ovld vload3(size_t offset, const short *p);
+ushort3 __ovld vload3(size_t offset, const ushort *p);
+int3 __ovld vload3(size_t offset, const int *p);
+uint3 __ovld vload3(size_t offset, const uint *p);
+long3 __ovld vload3(size_t offset, const long *p);
+ulong3 __ovld vload3(size_t offset, const ulong *p);
+float3 __ovld vload3(size_t offset, const float *p);
+char4 __ovld vload4(size_t offset, const char *p);
+uchar4 __ovld vload4(size_t offset, const uchar *p);
+short4 __ovld vload4(size_t offset, const short *p);
+ushort4 __ovld vload4(size_t offset, const ushort *p);
+int4 __ovld vload4(size_t offset, const int *p);
+uint4 __ovld vload4(size_t offset, const uint *p);
+long4 __ovld vload4(size_t offset, const long *p);
+ulong4 __ovld vload4(size_t offset, const ulong *p);
+float4 __ovld vload4(size_t offset, const float *p);
+char8 __ovld vload8(size_t offset, const char *p);
+uchar8 __ovld vload8(size_t offset, const uchar *p);
+short8 __ovld vload8(size_t offset, const short *p);
+ushort8 __ovld vload8(size_t offset, const ushort *p);
+int8 __ovld vload8(size_t offset, const int *p);
+uint8 __ovld vload8(size_t offset, const uint *p);
+long8 __ovld vload8(size_t offset, const long *p);
+ulong8 __ovld vload8(size_t offset, const ulong *p);
+float8 __ovld vload8(size_t offset, const float *p);
+char16 __ovld vload16(size_t offset, const char *p);
+uchar16 __ovld vload16(size_t offset, const uchar *p);
+short16 __ovld vload16(size_t offset, const short *p);
+ushort16 __ovld vload16(size_t offset, const ushort *p);
+int16 __ovld vload16(size_t offset, const int *p);
+uint16 __ovld vload16(size_t offset, const uint *p);
+long16 __ovld vload16(size_t offset, const long *p);
+ulong16 __ovld vload16(size_t offset, const ulong *p);
+float16 __ovld vload16(size_t offset, const float *p);
+
+#ifdef cl_khr_fp64
+double2 __ovld vload2(size_t offset, const double *p);
+double3 __ovld vload3(size_t offset, const double *p);
+double4 __ovld vload4(size_t offset, const double *p);
+double8 __ovld vload8(size_t offset, const double *p);
+double16 __ovld vload16(size_t offset, const double *p);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half __ovld vload(size_t offset, const half *p);
+half2 __ovld vload2(size_t offset, const half *p);
+half3 __ovld vload3(size_t offset, const half *p);
+half4 __ovld vload4(size_t offset, const half *p);
+half8 __ovld vload8(size_t offset, const half *p);
+half16 __ovld vload16(size_t offset, const half *p);
+#endif //cl_khr_fp16
+#else
+char2 __ovld vload2(size_t offset, const __global char *p);
+uchar2 __ovld vload2(size_t offset, const __global uchar *p);
+short2 __ovld vload2(size_t offset, const __global short *p);
+ushort2 __ovld vload2(size_t offset, const __global ushort *p);
+int2 __ovld vload2(size_t offset, const __global int *p);
+uint2 __ovld vload2(size_t offset, const __global uint *p);
+long2 __ovld vload2(size_t offset, const __global long *p);
+ulong2 __ovld vload2(size_t offset, const __global ulong *p);
+float2 __ovld vload2(size_t offset, const __global float *p);
+char3 __ovld vload3(size_t offset, const __global char *p);
+uchar3 __ovld vload3(size_t offset, const __global uchar *p);
+short3 __ovld vload3(size_t offset, const __global short *p);
+ushort3 __ovld vload3(size_t offset, const __global ushort *p);
+int3 __ovld vload3(size_t offset, const __global int *p);
+uint3 __ovld vload3(size_t offset, const __global uint *p);
+long3 __ovld vload3(size_t offset, const __global long *p);
+ulong3 __ovld vload3(size_t offset, const __global ulong *p);
+float3 __ovld vload3(size_t offset, const __global float *p);
+char4 __ovld vload4(size_t offset, const __global char *p);
+uchar4 __ovld vload4(size_t offset, const __global uchar *p);
+short4 __ovld vload4(size_t offset, const __global short *p);
+ushort4 __ovld vload4(size_t offset, const __global ushort *p);
+int4 __ovld vload4(size_t offset, const __global int *p);
+uint4 __ovld vload4(size_t offset, const __global uint *p);
+long4 __ovld vload4(size_t offset, const __global long *p);
+ulong4 __ovld vload4(size_t offset, const __global ulong *p);
+float4 __ovld vload4(size_t offset, const __global float *p);
+char8 __ovld vload8(size_t offset, const __global char *p);
+uchar8 __ovld vload8(size_t offset, const __global uchar *p);
+short8 __ovld vload8(size_t offset, const __global short *p);
+ushort8 __ovld vload8(size_t offset, const __global ushort *p);
+int8 __ovld vload8(size_t offset, const __global int *p);
+uint8 __ovld vload8(size_t offset, const __global uint *p);
+long8 __ovld vload8(size_t offset, const __global long *p);
+ulong8 __ovld vload8(size_t offset, const __global ulong *p);
+float8 __ovld vload8(size_t offset, const __global float *p);
+char16 __ovld vload16(size_t offset, const __global char *p);
+uchar16 __ovld vload16(size_t offset, const __global uchar *p);
+short16 __ovld vload16(size_t offset, const __global short *p);
+ushort16 __ovld vload16(size_t offset, const __global ushort *p);
+int16 __ovld vload16(size_t offset, const __global int *p);
+uint16 __ovld vload16(size_t offset, const __global uint *p);
+long16 __ovld vload16(size_t offset, const __global long *p);
+ulong16 __ovld vload16(size_t offset, const __global ulong *p);
+float16 __ovld vload16(size_t offset, const __global float *p);
+char2 __ovld vload2(size_t offset, const __local char *p);
+uchar2 __ovld vload2(size_t offset, const __local uchar *p);
+short2 __ovld vload2(size_t offset, const __local short *p);
+ushort2 __ovld vload2(size_t offset, const __local ushort *p);
+int2 __ovld vload2(size_t offset, const __local int *p);
+uint2 __ovld vload2(size_t offset, const __local uint *p);
+long2 __ovld vload2(size_t offset, const __local long *p);
+ulong2 __ovld vload2(size_t offset, const __local ulong *p);
+float2 __ovld vload2(size_t offset, const __local float *p);
+char3 __ovld vload3(size_t offset, const __local char *p);
+uchar3 __ovld vload3(size_t offset, const __local uchar *p);
+short3 __ovld vload3(size_t offset, const __local short *p);
+ushort3 __ovld vload3(size_t offset, const __local ushort *p);
+int3 __ovld vload3(size_t offset, const __local int *p);
+uint3 __ovld vload3(size_t offset, const __local uint *p);
+long3 __ovld vload3(size_t offset, const __local long *p);
+ulong3 __ovld vload3(size_t offset, const __local ulong *p);
+float3 __ovld vload3(size_t offset, const __local float *p);
+char4 __ovld vload4(size_t offset, const __local char *p);
+uchar4 __ovld vload4(size_t offset, const __local uchar *p);
+short4 __ovld vload4(size_t offset, const __local short *p);
+ushort4 __ovld vload4(size_t offset, const __local ushort *p);
+int4 __ovld vload4(size_t offset, const __local int *p);
+uint4 __ovld vload4(size_t offset, const __local uint *p);
+long4 __ovld vload4(size_t offset, const __local long *p);
+ulong4 __ovld vload4(size_t offset, const __local ulong *p);
+float4 __ovld vload4(size_t offset, const __local float *p);
+char8 __ovld vload8(size_t offset, const __local char *p);
+uchar8 __ovld vload8(size_t offset, const __local uchar *p);
+short8 __ovld vload8(size_t offset, const __local short *p);
+ushort8 __ovld vload8(size_t offset, const __local ushort *p);
+int8 __ovld vload8(size_t offset, const __local int *p);
+uint8 __ovld vload8(size_t offset, const __local uint *p);
+long8 __ovld vload8(size_t offset, const __local long *p);
+ulong8 __ovld vload8(size_t offset, const __local ulong *p);
+float8 __ovld vload8(size_t offset, const __local float *p);
+char16 __ovld vload16(size_t offset, const __local char *p);
+uchar16 __ovld vload16(size_t offset, const __local uchar *p);
+short16 __ovld vload16(size_t offset, const __local short *p);
+ushort16 __ovld vload16(size_t offset, const __local ushort *p);
+int16 __ovld vload16(size_t offset, const __local int *p);
+uint16 __ovld vload16(size_t offset, const __local uint *p);
+long16 __ovld vload16(size_t offset, const __local long *p);
+ulong16 __ovld vload16(size_t offset, const __local ulong *p);
+float16 __ovld vload16(size_t offset, const __local float *p);
+char2 __ovld vload2(size_t offset, const __private char *p);
+uchar2 __ovld vload2(size_t offset, const __private uchar *p);
+short2 __ovld vload2(size_t offset, const __private short *p);
+ushort2 __ovld vload2(size_t offset, const __private ushort *p);
+int2 __ovld vload2(size_t offset, const __private int *p);
+uint2 __ovld vload2(size_t offset, const __private uint *p);
+long2 __ovld vload2(size_t offset, const __private long *p);
+ulong2 __ovld vload2(size_t offset, const __private ulong *p);
+float2 __ovld vload2(size_t offset, const __private float *p);
+char3 __ovld vload3(size_t offset, const __private char *p);
+uchar3 __ovld vload3(size_t offset, const __private uchar *p);
+short3 __ovld vload3(size_t offset, const __private short *p);
+ushort3 __ovld vload3(size_t offset, const __private ushort *p);
+int3 __ovld vload3(size_t offset, const __private int *p);
+uint3 __ovld vload3(size_t offset, const __private uint *p);
+long3 __ovld vload3(size_t offset, const __private long *p);
+ulong3 __ovld vload3(size_t offset, const __private ulong *p);
+float3 __ovld vload3(size_t offset, const __private float *p);
+char4 __ovld vload4(size_t offset, const __private char *p);
+uchar4 __ovld vload4(size_t offset, const __private uchar *p);
+short4 __ovld vload4(size_t offset, const __private short *p);
+ushort4 __ovld vload4(size_t offset, const __private ushort *p);
+int4 __ovld vload4(size_t offset, const __private int *p);
+uint4 __ovld vload4(size_t offset, const __private uint *p);
+long4 __ovld vload4(size_t offset, const __private long *p);
+ulong4 __ovld vload4(size_t offset, const __private ulong *p);
+float4 __ovld vload4(size_t offset, const __private float *p);
+char8 __ovld vload8(size_t offset, const __private char *p);
+uchar8 __ovld vload8(size_t offset, const __private uchar *p);
+short8 __ovld vload8(size_t offset, const __private short *p);
+ushort8 __ovld vload8(size_t offset, const __private ushort *p);
+int8 __ovld vload8(size_t offset, const __private int *p);
+uint8 __ovld vload8(size_t offset, const __private uint *p);
+long8 __ovld vload8(size_t offset, const __private long *p);
+ulong8 __ovld vload8(size_t offset, const __private ulong *p);
+float8 __ovld vload8(size_t offset, const __private float *p);
+char16 __ovld vload16(size_t offset, const __private char *p);
+uchar16 __ovld vload16(size_t offset, const __private uchar *p);
+short16 __ovld vload16(size_t offset, const __private short *p);
+ushort16 __ovld vload16(size_t offset, const __private ushort *p);
+int16 __ovld vload16(size_t offset, const __private int *p);
+uint16 __ovld vload16(size_t offset, const __private uint *p);
+long16 __ovld vload16(size_t offset, const __private long *p);
+ulong16 __ovld vload16(size_t offset, const __private ulong *p);
+float16 __ovld vload16(size_t offset, const __private float *p);
+
+#ifdef cl_khr_fp64
+double2 __ovld vload2(size_t offset, const __global double *p);
+double3 __ovld vload3(size_t offset, const __global double *p);
+double4 __ovld vload4(size_t offset, const __global double *p);
+double8 __ovld vload8(size_t offset, const __global double *p);
+double16 __ovld vload16(size_t offset, const __global double *p);
+double2 __ovld vload2(size_t offset, const __local double *p);
+double3 __ovld vload3(size_t offset, const __local double *p);
+double4 __ovld vload4(size_t offset, const __local double *p);
+double8 __ovld vload8(size_t offset, const __local double *p);
+double16 __ovld vload16(size_t offset, const __local double *p);
+double2 __ovld vload2(size_t offset, const __private double *p);
+double3 __ovld vload3(size_t offset, const __private double *p);
+double4 __ovld vload4(size_t offset, const __private double *p);
+double8 __ovld vload8(size_t offset, const __private double *p);
+double16 __ovld vload16(size_t offset, const __private double *p);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half __ovld vload(size_t offset, const __global half *p);
+half2 __ovld vload2(size_t offset, const __global half *p);
+half3 __ovld vload3(size_t offset, const __global half *p);
+half4 __ovld vload4(size_t offset, const __global half *p);
+half8 __ovld vload8(size_t offset, const __global half *p);
+half16 __ovld vload16(size_t offset, const __global half *p);
+half __ovld vload(size_t offset, const __local half *p);
+half2 __ovld vload2(size_t offset, const __local half *p);
+half3 __ovld vload3(size_t offset, const __local half *p);
+half4 __ovld vload4(size_t offset, const __local half *p);
+half8 __ovld vload8(size_t offset, const __local half *p);
+half16 __ovld vload16(size_t offset, const __local half *p);
+half __ovld vload(size_t offset, const __private half *p);
+half2 __ovld vload2(size_t offset, const __private half *p);
+half3 __ovld vload3(size_t offset, const __private half *p);
+half4 __ovld vload4(size_t offset, const __private half *p);
+half8 __ovld vload8(size_t offset, const __private half *p);
+half16 __ovld vload16(size_t offset, const __private half *p);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstore2(char2 data, size_t offset, char *p);
+void __ovld vstore2(uchar2 data, size_t offset, uchar *p);
+void __ovld vstore2(short2 data, size_t offset, short *p);
+void __ovld vstore2(ushort2 data, size_t offset, ushort *p);
+void __ovld vstore2(int2 data, size_t offset, int *p);
+void __ovld vstore2(uint2 data, size_t offset, uint *p);
+void __ovld vstore2(long2 data, size_t offset, long *p);
+void __ovld vstore2(ulong2 data, size_t offset, ulong *p);
+void __ovld vstore2(float2 data, size_t offset, float *p);
+void __ovld vstore3(char3 data, size_t offset, char *p);
+void __ovld vstore3(uchar3 data, size_t offset, uchar *p);
+void __ovld vstore3(short3 data, size_t offset, short *p);
+void __ovld vstore3(ushort3 data, size_t offset, ushort *p);
+void __ovld vstore3(int3 data, size_t offset, int *p);
+void __ovld vstore3(uint3 data, size_t offset, uint *p);
+void __ovld vstore3(long3 data, size_t offset, long *p);
+void __ovld vstore3(ulong3 data, size_t offset, ulong *p);
+void __ovld vstore3(float3 data, size_t offset, float *p);
+void __ovld vstore4(char4 data, size_t offset, char *p);
+void __ovld vstore4(uchar4 data, size_t offset, uchar *p);
+void __ovld vstore4(short4 data, size_t offset, short *p);
+void __ovld vstore4(ushort4 data, size_t offset, ushort *p);
+void __ovld vstore4(int4 data, size_t offset, int *p);
+void __ovld vstore4(uint4 data, size_t offset, uint *p);
+void __ovld vstore4(long4 data, size_t offset, long *p);
+void __ovld vstore4(ulong4 data, size_t offset, ulong *p);
+void __ovld vstore4(float4 data, size_t offset, float *p);
+void __ovld vstore8(char8 data, size_t offset, char *p);
+void __ovld vstore8(uchar8 data, size_t offset, uchar *p);
+void __ovld vstore8(short8 data, size_t offset, short *p);
+void __ovld vstore8(ushort8 data, size_t offset, ushort *p);
+void __ovld vstore8(int8 data, size_t offset, int *p);
+void __ovld vstore8(uint8 data, size_t offset, uint *p);
+void __ovld vstore8(long8 data, size_t offset, long *p);
+void __ovld vstore8(ulong8 data, size_t offset, ulong *p);
+void __ovld vstore8(float8 data, size_t offset, float *p);
+void __ovld vstore16(char16 data, size_t offset, char *p);
+void __ovld vstore16(uchar16 data, size_t offset, uchar *p);
+void __ovld vstore16(short16 data, size_t offset, short *p);
+void __ovld vstore16(ushort16 data, size_t offset, ushort *p);
+void __ovld vstore16(int16 data, size_t offset, int *p);
+void __ovld vstore16(uint16 data, size_t offset, uint *p);
+void __ovld vstore16(long16 data, size_t offset, long *p);
+void __ovld vstore16(ulong16 data, size_t offset, ulong *p);
+void __ovld vstore16(float16 data, size_t offset, float *p);
+#ifdef cl_khr_fp64
+void __ovld vstore2(double2 data, size_t offset, double *p);
+void __ovld vstore3(double3 data, size_t offset, double *p);
+void __ovld vstore4(double4 data, size_t offset, double *p);
+void __ovld vstore8(double8 data, size_t offset, double *p);
+void __ovld vstore16(double16 data, size_t offset, double *p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+void __ovld vstore(half data, size_t offset, half *p);
+void __ovld vstore2(half2 data, size_t offset, half *p);
+void __ovld vstore3(half3 data, size_t offset, half *p);
+void __ovld vstore4(half4 data, size_t offset, half *p);
+void __ovld vstore8(half8 data, size_t offset, half *p);
+void __ovld vstore16(half16 data, size_t offset, half *p);
+#endif //cl_khr_fp16
+#else
+void __ovld vstore2(char2 data, size_t offset, __global char *p);
+void __ovld vstore2(uchar2 data, size_t offset, __global uchar *p);
+void __ovld vstore2(short2 data, size_t offset, __global short *p);
+void __ovld vstore2(ushort2 data, size_t offset, __global ushort *p);
+void __ovld vstore2(int2 data, size_t offset, __global int *p);
+void __ovld vstore2(uint2 data, size_t offset, __global uint *p);
+void __ovld vstore2(long2 data, size_t offset, __global long *p);
+void __ovld vstore2(ulong2 data, size_t offset, __global ulong *p);
+void __ovld vstore2(float2 data, size_t offset, __global float *p);
+void __ovld vstore3(char3 data, size_t offset, __global char *p);
+void __ovld vstore3(uchar3 data, size_t offset, __global uchar *p);
+void __ovld vstore3(short3 data, size_t offset, __global short *p);
+void __ovld vstore3(ushort3 data, size_t offset, __global ushort *p);
+void __ovld vstore3(int3 data, size_t offset, __global int *p);
+void __ovld vstore3(uint3 data, size_t offset, __global uint *p);
+void __ovld vstore3(long3 data, size_t offset, __global long *p);
+void __ovld vstore3(ulong3 data, size_t offset, __global ulong *p);
+void __ovld vstore3(float3 data, size_t offset, __global float *p);
+void __ovld vstore4(char4 data, size_t offset, __global char *p);
+void __ovld vstore4(uchar4 data, size_t offset, __global uchar *p);
+void __ovld vstore4(short4 data, size_t offset, __global short *p);
+void __ovld vstore4(ushort4 data, size_t offset, __global ushort *p);
+void __ovld vstore4(int4 data, size_t offset, __global int *p);
+void __ovld vstore4(uint4 data, size_t offset, __global uint *p);
+void __ovld vstore4(long4 data, size_t offset, __global long *p);
+void __ovld vstore4(ulong4 data, size_t offset, __global ulong *p);
+void __ovld vstore4(float4 data, size_t offset, __global float *p);
+void __ovld vstore8(char8 data, size_t offset, __global char *p);
+void __ovld vstore8(uchar8 data, size_t offset, __global uchar *p);
+void __ovld vstore8(short8 data, size_t offset, __global short *p);
+void __ovld vstore8(ushort8 data, size_t offset, __global ushort *p);
+void __ovld vstore8(int8 data, size_t offset, __global int *p);
+void __ovld vstore8(uint8 data, size_t offset, __global uint *p);
+void __ovld vstore8(long8 data, size_t offset, __global long *p);
+void __ovld vstore8(ulong8 data, size_t offset, __global ulong *p);
+void __ovld vstore8(float8 data, size_t offset, __global float *p);
+void __ovld vstore16(char16 data, size_t offset, __global char *p);
+void __ovld vstore16(uchar16 data, size_t offset, __global uchar *p);
+void __ovld vstore16(short16 data, size_t offset, __global short *p);
+void __ovld vstore16(ushort16 data, size_t offset, __global ushort *p);
+void __ovld vstore16(int16 data, size_t offset, __global int *p);
+void __ovld vstore16(uint16 data, size_t offset, __global uint *p);
+void __ovld vstore16(long16 data, size_t offset, __global long *p);
+void __ovld vstore16(ulong16 data, size_t offset, __global ulong *p);
+void __ovld vstore16(float16 data, size_t offset, __global float *p);
+void __ovld vstore2(char2 data, size_t offset, __local char *p);
+void __ovld vstore2(uchar2 data, size_t offset, __local uchar *p);
+void __ovld vstore2(short2 data, size_t offset, __local short *p);
+void __ovld vstore2(ushort2 data, size_t offset, __local ushort *p);
+void __ovld vstore2(int2 data, size_t offset, __local int *p);
+void __ovld vstore2(uint2 data, size_t offset, __local uint *p);
+void __ovld vstore2(long2 data, size_t offset, __local long *p);
+void __ovld vstore2(ulong2 data, size_t offset, __local ulong *p);
+void __ovld vstore2(float2 data, size_t offset, __local float *p);
+void __ovld vstore3(char3 data, size_t offset, __local char *p);
+void __ovld vstore3(uchar3 data, size_t offset, __local uchar *p);
+void __ovld vstore3(short3 data, size_t offset, __local short *p);
+void __ovld vstore3(ushort3 data, size_t offset, __local ushort *p);
+void __ovld vstore3(int3 data, size_t offset, __local int *p);
+void __ovld vstore3(uint3 data, size_t offset, __local uint *p);
+void __ovld vstore3(long3 data, size_t offset, __local long *p);
+void __ovld vstore3(ulong3 data, size_t offset, __local ulong *p);
+void __ovld vstore3(float3 data, size_t offset, __local float *p);
+void __ovld vstore4(char4 data, size_t offset, __local char *p);
+void __ovld vstore4(uchar4 data, size_t offset, __local uchar *p);
+void __ovld vstore4(short4 data, size_t offset, __local short *p);
+void __ovld vstore4(ushort4 data, size_t offset, __local ushort *p);
+void __ovld vstore4(int4 data, size_t offset, __local int *p);
+void __ovld vstore4(uint4 data, size_t offset, __local uint *p);
+void __ovld vstore4(long4 data, size_t offset, __local long *p);
+void __ovld vstore4(ulong4 data, size_t offset, __local ulong *p);
+void __ovld vstore4(float4 data, size_t offset, __local float *p);
+void __ovld vstore8(char8 data, size_t offset, __local char *p);
+void __ovld vstore8(uchar8 data, size_t offset, __local uchar *p);
+void __ovld vstore8(short8 data, size_t offset, __local short *p);
+void __ovld vstore8(ushort8 data, size_t offset, __local ushort *p);
+void __ovld vstore8(int8 data, size_t offset, __local int *p);
+void __ovld vstore8(uint8 data, size_t offset, __local uint *p);
+void __ovld vstore8(long8 data, size_t offset, __local long *p);
+void __ovld vstore8(ulong8 data, size_t offset, __local ulong *p);
+void __ovld vstore8(float8 data, size_t offset, __local float *p);
+void __ovld vstore16(char16 data, size_t offset, __local char *p);
+void __ovld vstore16(uchar16 data, size_t offset, __local uchar *p);
+void __ovld vstore16(short16 data, size_t offset, __local short *p);
+void __ovld vstore16(ushort16 data, size_t offset, __local ushort *p);
+void __ovld vstore16(int16 data, size_t offset, __local int *p);
+void __ovld vstore16(uint16 data, size_t offset, __local uint *p);
+void __ovld vstore16(long16 data, size_t offset, __local long *p);
+void __ovld vstore16(ulong16 data, size_t offset, __local ulong *p);
+void __ovld vstore16(float16 data, size_t offset, __local float *p);
+void __ovld vstore2(char2 data, size_t offset, __private char *p);
+void __ovld vstore2(uchar2 data, size_t offset, __private uchar *p);
+void __ovld vstore2(short2 data, size_t offset, __private short *p);
+void __ovld vstore2(ushort2 data, size_t offset, __private ushort *p);
+void __ovld vstore2(int2 data, size_t offset, __private int *p);
+void __ovld vstore2(uint2 data, size_t offset, __private uint *p);
+void __ovld vstore2(long2 data, size_t offset, __private long *p);
+void __ovld vstore2(ulong2 data, size_t offset, __private ulong *p);
+void __ovld vstore2(float2 data, size_t offset, __private float *p);
+void __ovld vstore3(char3 data, size_t offset, __private char *p);
+void __ovld vstore3(uchar3 data, size_t offset, __private uchar *p);
+void __ovld vstore3(short3 data, size_t offset, __private short *p);
+void __ovld vstore3(ushort3 data, size_t offset, __private ushort *p);
+void __ovld vstore3(int3 data, size_t offset, __private int *p);
+void __ovld vstore3(uint3 data, size_t offset, __private uint *p);
+void __ovld vstore3(long3 data, size_t offset, __private long *p);
+void __ovld vstore3(ulong3 data, size_t offset, __private ulong *p);
+void __ovld vstore3(float3 data, size_t offset, __private float *p);
+void __ovld vstore4(char4 data, size_t offset, __private char *p);
+void __ovld vstore4(uchar4 data, size_t offset, __private uchar *p);
+void __ovld vstore4(short4 data, size_t offset, __private short *p);
+void __ovld vstore4(ushort4 data, size_t offset, __private ushort *p);
+void __ovld vstore4(int4 data, size_t offset, __private int *p);
+void __ovld vstore4(uint4 data, size_t offset, __private uint *p);
+void __ovld vstore4(long4 data, size_t offset, __private long *p);
+void __ovld vstore4(ulong4 data, size_t offset, __private ulong *p);
+void __ovld vstore4(float4 data, size_t offset, __private float *p);
+void __ovld vstore8(char8 data, size_t offset, __private char *p);
+void __ovld vstore8(uchar8 data, size_t offset, __private uchar *p);
+void __ovld vstore8(short8 data, size_t offset, __private short *p);
+void __ovld vstore8(ushort8 data, size_t offset, __private ushort *p);
+void __ovld vstore8(int8 data, size_t offset, __private int *p);
+void __ovld vstore8(uint8 data, size_t offset, __private uint *p);
+void __ovld vstore8(long8 data, size_t offset, __private long *p);
+void __ovld vstore8(ulong8 data, size_t offset, __private ulong *p);
+void __ovld vstore8(float8 data, size_t offset, __private float *p);
+void __ovld vstore16(char16 data, size_t offset, __private char *p);
+void __ovld vstore16(uchar16 data, size_t offset, __private uchar *p);
+void __ovld vstore16(short16 data, size_t offset, __private short *p);
+void __ovld vstore16(ushort16 data, size_t offset, __private ushort *p);
+void __ovld vstore16(int16 data, size_t offset, __private int *p);
+void __ovld vstore16(uint16 data, size_t offset, __private uint *p);
+void __ovld vstore16(long16 data, size_t offset, __private long *p);
+void __ovld vstore16(ulong16 data, size_t offset, __private ulong *p);
+void __ovld vstore16(float16 data, size_t offset, __private float *p);
+#ifdef cl_khr_fp64
+void __ovld vstore2(double2 data, size_t offset, __global double *p);
+void __ovld vstore3(double3 data, size_t offset, __global double *p);
+void __ovld vstore4(double4 data, size_t offset, __global double *p);
+void __ovld vstore8(double8 data, size_t offset, __global double *p);
+void __ovld vstore16(double16 data, size_t offset, __global double *p);
+void __ovld vstore2(double2 data, size_t offset, __local double *p);
+void __ovld vstore3(double3 data, size_t offset, __local double *p);
+void __ovld vstore4(double4 data, size_t offset, __local double *p);
+void __ovld vstore8(double8 data, size_t offset, __local double *p);
+void __ovld vstore16(double16 data, size_t offset, __local double *p);
+void __ovld vstore2(double2 data, size_t offset, __private double *p);
+void __ovld vstore3(double3 data, size_t offset, __private double *p);
+void __ovld vstore4(double4 data, size_t offset, __private double *p);
+void __ovld vstore8(double8 data, size_t offset, __private double *p);
+void __ovld vstore16(double16 data, size_t offset, __private double *p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+void __ovld vstore(half data, size_t offset, __global half *p);
+void __ovld vstore2(half2 data, size_t offset, __global half *p);
+void __ovld vstore3(half3 data, size_t offset, __global half *p);
+void __ovld vstore4(half4 data, size_t offset, __global half *p);
+void __ovld vstore8(half8 data, size_t offset, __global half *p);
+void __ovld vstore16(half16 data, size_t offset, __global half *p);
+void __ovld vstore(half data, size_t offset, __local half *p);
+void __ovld vstore2(half2 data, size_t offset, __local half *p);
+void __ovld vstore3(half3 data, size_t offset, __local half *p);
+void __ovld vstore4(half4 data, size_t offset, __local half *p);
+void __ovld vstore8(half8 data, size_t offset, __local half *p);
+void __ovld vstore16(half16 data, size_t offset, __local half *p);
+void __ovld vstore(half data, size_t offset, __private half *p);
+void __ovld vstore2(half2 data, size_t offset, __private half *p);
+void __ovld vstore3(half3 data, size_t offset, __private half *p);
+void __ovld vstore4(half4 data, size_t offset, __private half *p);
+void __ovld vstore8(half8 data, size_t offset, __private half *p);
+void __ovld vstore16(half16 data, size_t offset, __private half *p);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Read sizeof (half) bytes of data from address
+ * (p + offset). The data read is interpreted as a
+ * half value. The half value is converted to a
+ * float value and the float value is returned.
+ * The read address computed as (p + offset)
+ * must be 16-bit aligned.
+ */
+float __ovld vload_half(size_t offset, const __constant half *p);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld vload_half(size_t offset, const half *p);
+#else
+float __ovld vload_half(size_t offset, const __global half *p);
+float __ovld vload_half(size_t offset, const __local half *p);
+float __ovld vload_half(size_t offset, const __private half *p);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Read sizeof (halfn) bytes of data from address
+ * (p + (offset * n)). The data read is interpreted
+ * as a halfn value. The halfn value read is
+ * converted to a floatn value and the floatn
+ * value is returned. The read address computed
+ * as (p + (offset * n)) must be 16-bit aligned.
+ */
+float2 __ovld vload_half2(size_t offset, const __constant half *p);
+float3 __ovld vload_half3(size_t offset, const __constant half *p);
+float4 __ovld vload_half4(size_t offset, const __constant half *p);
+float8 __ovld vload_half8(size_t offset, const __constant half *p);
+float16 __ovld vload_half16(size_t offset, const __constant half *p);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float2 __ovld vload_half2(size_t offset, const half *p);
+float3 __ovld vload_half3(size_t offset, const half *p);
+float4 __ovld vload_half4(size_t offset, const half *p);
+float8 __ovld vload_half8(size_t offset, const half *p);
+float16 __ovld vload_half16(size_t offset, const half *p);
+#else
+float2 __ovld vload_half2(size_t offset, const __global half *p);
+float3 __ovld vload_half3(size_t offset, const __global half *p);
+float4 __ovld vload_half4(size_t offset, const __global half *p);
+float8 __ovld vload_half8(size_t offset, const __global half *p);
+float16 __ovld vload_half16(size_t offset, const __global half *p);
+float2 __ovld vload_half2(size_t offset, const __local half *p);
+float3 __ovld vload_half3(size_t offset, const __local half *p);
+float4 __ovld vload_half4(size_t offset, const __local half *p);
+float8 __ovld vload_half8(size_t offset, const __local half *p);
+float16 __ovld vload_half16(size_t offset, const __local half *p);
+float2 __ovld vload_half2(size_t offset, const __private half *p);
+float3 __ovld vload_half3(size_t offset, const __private half *p);
+float4 __ovld vload_half4(size_t offset, const __private half *p);
+float8 __ovld vload_half8(size_t offset, const __private half *p);
+float16 __ovld vload_half16(size_t offset, const __private half *p);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * The float value given by data is first
+ * converted to a half value using the appropriate
+ * rounding mode. The half value is then written
+ * to address computed as (p + offset). The
+ * address computed as (p + offset) must be 16-
+ * bit aligned.
+ * vstore_half use the current rounding mode.
+ * The default current rounding mode is round to
+ * nearest even.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstore_half(float data, size_t offset, half *p);
+void __ovld vstore_half_rte(float data, size_t offset, half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half(double data, size_t offset, half *p);
+void __ovld vstore_half_rte(double data, size_t offset, half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, half *p);
+#endif //cl_khr_fp64
+#else
+void __ovld vstore_half(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rte(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, __global half *p);
+void __ovld vstore_half(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rte(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, __local half *p);
+void __ovld vstore_half(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rte(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, __private half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rte(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, __global half *p);
+void __ovld vstore_half(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rte(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, __local half *p);
+void __ovld vstore_half(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rte(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, __private half *p);
+#endif //cl_khr_fp64
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * The floatn value given by data is converted to
+ * a halfn value using the appropriate rounding
+ * mode. The halfn value is then written to
+ * address computed as (p + (offset * n)). The
+ * address computed as (p + (offset * n)) must be
+ * 16-bit aligned.
+ * vstore_halfn uses the current rounding mode.
+ * The default current rounding mode is round to
+ * nearest even.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstore_half2(float2 data, size_t offset, half *p);
+void __ovld vstore_half3(float3 data, size_t offset, half *p);
+void __ovld vstore_half4(float4 data, size_t offset, half *p);
+void __ovld vstore_half8(float8 data, size_t offset, half *p);
+void __ovld vstore_half16(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half2(double2 data, size_t offset, half *p);
+void __ovld vstore_half3(double3 data, size_t offset, half *p);
+void __ovld vstore_half4(double4 data, size_t offset, half *p);
+void __ovld vstore_half8(double8 data, size_t offset, half *p);
+void __ovld vstore_half16(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, half *p);
+#endif //cl_khr_fp64
+#else
+void __ovld vstore_half2(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, __private half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half2(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, __private half *p);
+#endif //cl_khr_fp64
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * For n = 1, 2, 4, 8 and 16 read sizeof (halfn)
+ * bytes of data from address (p + (offset * n)).
+ * The data read is interpreted as a halfn value.
+ * The halfn value read is converted to a floatn
+ * value and the floatn value is returned.
+ * The address computed as (p + (offset * n))
+ * must be aligned to sizeof (halfn) bytes.
+ * For n = 3, vloada_half3 reads a half3 from
+ * address (p + (offset * 4)) and returns a float3.
+ * The address computed as (p + (offset * 4))
+ * must be aligned to sizeof (half) * 4 bytes.
+ */
+float __ovld vloada_half(size_t offset, const __constant half *p);
+float2 __ovld vloada_half2(size_t offset, const __constant half *p);
+float3 __ovld vloada_half3(size_t offset, const __constant half *p);
+float4 __ovld vloada_half4(size_t offset, const __constant half *p);
+float8 __ovld vloada_half8(size_t offset, const __constant half *p);
+float16 __ovld vloada_half16(size_t offset, const __constant half *p);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld vloada_half(size_t offset, const half *p);
+float2 __ovld vloada_half2(size_t offset, const half *p);
+float3 __ovld vloada_half3(size_t offset, const half *p);
+float4 __ovld vloada_half4(size_t offset, const half *p);
+float8 __ovld vloada_half8(size_t offset, const half *p);
+float16 __ovld vloada_half16(size_t offset, const half *p);
+#else
+float __ovld vloada_half(size_t offset, const __global half *p);
+float2 __ovld vloada_half2(size_t offset, const __global half *p);
+float3 __ovld vloada_half3(size_t offset, const __global half *p);
+float4 __ovld vloada_half4(size_t offset, const __global half *p);
+float8 __ovld vloada_half8(size_t offset, const __global half *p);
+float16 __ovld vloada_half16(size_t offset, const __global half *p);
+float __ovld vloada_half(size_t offset, const __local half *p);
+float2 __ovld vloada_half2(size_t offset, const __local half *p);
+float3 __ovld vloada_half3(size_t offset, const __local half *p);
+float4 __ovld vloada_half4(size_t offset, const __local half *p);
+float8 __ovld vloada_half8(size_t offset, const __local half *p);
+float16 __ovld vloada_half16(size_t offset, const __local half *p);
+float __ovld vloada_half(size_t offset, const __private half *p);
+float2 __ovld vloada_half2(size_t offset, const __private half *p);
+float3 __ovld vloada_half3(size_t offset, const __private half *p);
+float4 __ovld vloada_half4(size_t offset, const __private half *p);
+float8 __ovld vloada_half8(size_t offset, const __private half *p);
+float16 __ovld vloada_half16(size_t offset, const __private half *p);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * The floatn value given by data is converted to
+ * a halfn value using the appropriate rounding
+ * mode.
+ * For n = 1, 2, 4, 8 and 16, the halfn value is
+ * written to the address computed as (p + (offset
+ * * n)). The address computed as (p + (offset *
+ * n)) must be aligned to sizeof (halfn) bytes.
+ * For n = 3, the half3 value is written to the
+ * address computed as (p + (offset * 4)). The
+ * address computed as (p + (offset * 4)) must be
+ * aligned to sizeof (half) * 4 bytes.
+ * vstorea_halfn uses the current rounding
+ * mode. The default current rounding mode is
+ * round to nearest even.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstorea_half(float data, size_t offset, half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, half *p);
+
+#ifdef cl_khr_fp64
+void __ovld vstorea_half(double data, size_t offset, half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, half *p);
+#endif //cl_khr_fp64
+
+#else
+void __ovld vstorea_half(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, __private half *p);
+
+#ifdef cl_khr_fp64
+void __ovld vstorea_half(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtn(double2 data,size_t offset, __private half *p);
+void __ovld vstorea_half3_rtn(double3 data,size_t offset, __private half *p);
+void __ovld vstorea_half4_rtn(double4 data,size_t offset, __private half *p);
+void __ovld vstorea_half8_rtn(double8 data,size_t offset, __private half *p);
+void __ovld vstorea_half16_rtn(double16 data,size_t offset, __private half *p);
+#endif //cl_khr_fp64
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions
+
+// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence
+typedef uint cl_mem_fence_flags;
+
+/**
+ * Queue a memory fence to ensure correct
+ * ordering of memory operations to local memory
+ */
+#define CLK_LOCAL_MEM_FENCE    0x01
+
+/**
+ * Queue a memory fence to ensure correct
+ * ordering of memory operations to global memory
+ */
+#define CLK_GLOBAL_MEM_FENCE   0x02
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+/**
+ * Queue a memory fence to ensure correct ordering of memory
+ * operations between work-items of a work-group to
+ * image memory.
+ */
+#define CLK_IMAGE_MEM_FENCE  0x04
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * All work-items in a work-group executing the kernel
+ * on a processor must execute this function before any
+ * are allowed to continue execution beyond the barrier.
+ * This function must be encountered by all work-items in
+ * a work-group executing the kernel.
+ * If barrier is inside a conditional statement, then all
+ * work-items must enter the conditional if any work-item
+ * enters the conditional statement and executes the
+ * barrier.
+ * If barrer is inside a loop, all work-items must execute
+ * the barrier for each iteration of the loop before any are
+ * allowed to continue execution beyond the barrier.
+ * The barrier function also queues a memory fence
+ * (reads and writes) to ensure correct ordering of
+ * memory operations to local or global memory.
+ * The flags argument specifies the memory address space
+ * and can be set to a combination of the following literal
+ * values.
+ * CLK_LOCAL_MEM_FENCE - The barrier function
+ * will either flush any variables stored in local memory
+ * or queue a memory fence to ensure correct ordering of
+ * memory operations to local memory.
+ * CLK_GLOBAL_MEM_FENCE - The barrier function
+ * will queue a memory fence to ensure correct ordering
+ * of memory operations to global memory. This can be
+ * useful when work-items, for example, write to buffer or
+ * image objects and then want to read the updated data.
+ */
+
+void __ovld __conv barrier(cl_mem_fence_flags flags);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+typedef enum memory_scope
+{
+  memory_scope_work_item,
+  memory_scope_work_group,
+  memory_scope_device,
+  memory_scope_all_svm_devices,
+  memory_scope_sub_group
+} memory_scope;
+
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
+
+/**
+ * Orders loads and stores of a work-item
+ * executing a kernel. This means that loads
+ * and stores preceding the mem_fence will
+ * be committed to memory before any loads
+ * and stores following the mem_fence.
+ * The flags argument specifies the memory
+ * address space and can be set to a
+ * combination of the following literal
+ * values:
+ * CLK_LOCAL_MEM_FENCE
+ * CLK_GLOBAL_MEM_FENCE.
+ */
+void __ovld mem_fence(cl_mem_fence_flags flags);
+
+/**
+ * Read memory barrier that orders only
+ * loads.
+ * The flags argument specifies the memory
+ * address space and can be set to to a
+ * combination of the following literal
+ * values:
+ * CLK_LOCAL_MEM_FENCE
+ * CLK_GLOBAL_MEM_FENCE.
+ */
+void __ovld read_mem_fence(cl_mem_fence_flags flags);
+
+/**
+ * Write memory barrier that orders only
+ * stores.
+ * The flags argument specifies the memory
+ * address space and can be set to to a
+ * combination of the following literal
+ * values:
+ * CLK_LOCAL_MEM_FENCE
+ * CLK_GLOBAL_MEM_FENCE.
+ */
+void __ovld write_mem_fence(cl_mem_fence_flags flags);
+
+// OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+cl_mem_fence_flags __ovld get_fence(const void *ptr);
+cl_mem_fence_flags __ovld get_fence(void *ptr);
+
+/** 
+ * Builtin functions to_global, to_local, and to_private need to be declared as Clang builtin functions
+ * and checked in Sema since they should be declared as
+ *   addr gentype* to_addr (gentype*);
+ * where gentype is builtin type or user defined type.
+ */
+
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.10, v1.2 s6.12.10, v2.0 s6.13.10 - Async Copies from Global to Local Memory, Local to Global Memory, and Prefetch
+
+/**
+ * event_t async_work_group_copy (
+ * __global gentype *dst,
+ * const __local gentype *src,
+ * size_t num_elements,
+ * event_t event)
+ * Perform an async copy of num_elements
+ * gentype elements from src to dst. The async
+ * copy is performed by all work-items in a workgroup
+ * and this built-in function must therefore
+ * be encountered by all work-items in a workgroup
+ * executing the kernel with the same
+ * argument values; otherwise the results are
+ * undefined.
+ * Returns an event object that can be used by
+ * wait_group_events to wait for the async copy
+ * to finish. The event argument can also be used
+ * to associate the async_work_group_copy with
+ * a previous async copy allowing an event to be
+ * shared by multiple async copies; otherwise event
+ * should be zero.
+ * If event argument is non-zero, the event object
+ * supplied in event argument will be returned.
+ * This function does not perform any implicit
+ * synchronization of source data such as using a
+ * barrier before performing the copy.
+ */
+event_t __ovld async_work_group_copy(__local char *dst, const __global char *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar *dst, const __global uchar *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short *dst, const __global short *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort *dst, const __global ushort *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int *dst, const __global int *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint *dst, const __global uint *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long *dst, const __global long *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong *dst, const __global ulong *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float *dst, const __global float *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char2 *dst, const __global char2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar2 *dst, const __global uchar2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short2 *dst, const __global short2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort2 *dst, const __global ushort2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int2 *dst, const __global int2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint2 *dst, const __global uint2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long2 *dst, const __global long2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong2 *dst, const __global ulong2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float2 *dst, const __global float2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char3 *dst, const __global char3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar3 *dst, const __global uchar3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short3 *dst, const __global short3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort3 *dst, const __global ushort3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int3 *dst, const __global int3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint3 *dst, const __global uint3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long3 *dst, const __global long3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong3 *dst, const __global ulong3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float3 *dst, const __global float3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char4 *dst, const __global char4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar4 *dst, const __global uchar4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short4 *dst, const __global short4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort4 *dst, const __global ushort4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int4 *dst, const __global int4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint4 *dst, const __global uint4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long4 *dst, const __global long4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong4 *dst, const __global ulong4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float4 *dst, const __global float4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char8 *dst, const __global char8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar8 *dst, const __global uchar8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short8 *dst, const __global short8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort8 *dst, const __global ushort8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int8 *dst, const __global int8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint8 *dst, const __global uint8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long8 *dst, const __global long8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong8 *dst, const __global ulong8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float8 *dst, const __global float8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char16 *dst, const __global char16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar16 *dst, const __global uchar16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short16 *dst, const __global short16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort16 *dst, const __global ushort16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int16 *dst, const __global int16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint16 *dst, const __global uint16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long16 *dst, const __global long16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong16 *dst, const __global ulong16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float16 *dst, const __global float16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char *dst, const __local char *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar *dst, const __local uchar *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short *dst, const __local short *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort *dst, const __local ushort *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int *dst, const __local int *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint *dst, const __local uint *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long *dst, const __local long *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong *dst, const __local ulong *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float *dst, const __local float *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char2 *dst, const __local char2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar2 *dst, const __local uchar2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short2 *dst, const __local short2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort2 *dst, const __local ushort2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int2 *dst, const __local int2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint2 *dst, const __local uint2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long2 *dst, const __local long2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong2 *dst, const __local ulong2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float2 *dst, const __local float2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char3 *dst, const __local char3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar3 *dst, const __local uchar3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short3 *dst, const __local short3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort3 *dst, const __local ushort3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int3 *dst, const __local int3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint3 *dst, const __local uint3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long3 *dst, const __local long3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong3 *dst, const __local ulong3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float3 *dst, const __local float3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char4 *dst, const __local char4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar4 *dst, const __local uchar4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short4 *dst, const __local short4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort4 *dst, const __local ushort4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int4 *dst, const __local int4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint4 *dst, const __local uint4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long4 *dst, const __local long4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong4 *dst, const __local ulong4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float4 *dst, const __local float4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char8 *dst, const __local char8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar8 *dst, const __local uchar8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short8 *dst, const __local short8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort8 *dst, const __local ushort8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int8 *dst, const __local int8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint8 *dst, const __local uint8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long8 *dst, const __local long8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong8 *dst, const __local ulong8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float8 *dst, const __local float8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char16 *dst, const __local char16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar16 *dst, const __local uchar16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short16 *dst, const __local short16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort16 *dst, const __local ushort16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int16 *dst, const __local int16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint16 *dst, const __local uint16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, event_t event);
+#ifdef cl_khr_fp64
+event_t __ovld async_work_group_copy(__local double *dst, const __global double *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double4 *dst, const __global double4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double8 *dst, const __global double8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double16 *dst, const __global double16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double *dst, const __local double *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double2 *dst, const __local double2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double3 *dst, const __local double3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, event_t event);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+event_t __ovld async_work_group_copy(__local half *dst, const __global half *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half3 *dst, const __global half3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half4 *dst, const __global half4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half8 *dst, const __global half8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half16 *dst, const __global half16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half *dst, const __local half *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half2 *dst, const __local half2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half3 *dst, const __local half3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half4 *dst, const __local half4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half8 *dst, const __local half8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half16 *dst, const __local half16 *src, size_t num_elements, event_t event);
+#endif //cl_khr_fp16
+
+/**
+ * Perform an async gather of num_elements
+ * gentype elements from src to dst. The
+ * src_stride is the stride in elements for each
+ * gentype element read from src. The dst_stride
+ * is the stride in elements for each gentype
+ * element written to dst. The async gather is
+ * performed by all work-items in a work-group.
+ * This built-in function must therefore be
+ * encountered by all work-items in a work-group
+ * executing the kernel with the same argument
+ * values; otherwise the results are undefined.
+ * Returns an event object that can be used by
+ * wait_group_events to wait for the async copy
+ * to finish. The event argument can also be used
+ * to associate the
+ * async_work_group_strided_copy with a
+ * previous async copy allowing an event to be
+ * shared by multiple async copies; otherwise event
+ * should be zero.
+ * If event argument is non-zero, the event object
+ * supplied in event argument will be returned.
+ * This function does not perform any implicit
+ * synchronization of source data such as using a
+ * barrier before performing the copy.
+ */
+event_t __ovld async_work_group_strided_copy(__local char *dst, const __global char *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar *dst, const __global uchar *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short *dst, const __global short *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort *dst, const __global ushort *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int *dst, const __global int *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint *dst, const __global uint *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long *dst, const __global long *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong *dst, const __global ulong *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float *dst, const __global float *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char2 *dst, const __global char2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar2 *dst, const __global uchar2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short2 *dst, const __global short2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort2 *dst, const __global ushort2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int2 *dst, const __global int2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint2 *dst, const __global uint2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long2 *dst, const __global long2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong2 *dst, const __global ulong2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float2 *dst, const __global float2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char3 *dst, const __global char3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar3 *dst, const __global uchar3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short3 *dst, const __global short3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort3 *dst, const __global ushort3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int3 *dst, const __global int3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint3 *dst, const __global uint3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long3 *dst, const __global long3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong3 *dst, const __global ulong3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float3 *dst, const __global float3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char4 *dst, const __global char4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar4 *dst, const __global uchar4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short4 *dst, const __global short4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort4 *dst, const __global ushort4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int4 *dst, const __global int4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint4 *dst, const __global uint4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long4 *dst, const __global long4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong4 *dst, const __global ulong4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float4 *dst, const __global float4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char8 *dst, const __global char8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar8 *dst, const __global uchar8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short8 *dst, const __global short8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort8 *dst, const __global ushort8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int8 *dst, const __global int8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint8 *dst, const __global uint8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long8 *dst, const __global long8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong8 *dst, const __global ulong8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float8 *dst, const __global float8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char16 *dst, const __global char16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar16 *dst, const __global uchar16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short16 *dst, const __global short16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort16 *dst, const __global ushort16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int16 *dst, const __global int16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint16 *dst, const __global uint16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long16 *dst, const __global long16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong16 *dst, const __global ulong16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float16 *dst, const __global float16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char *dst, const __local char *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar *dst, const __local uchar *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short *dst, const __local short *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort *dst, const __local ushort *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int *dst, const __local int *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint *dst, const __local uint *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long *dst, const __local long *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong *dst, const __local ulong *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float *dst, const __local float *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char2 *dst, const __local char2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar2 *dst, const __local uchar2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short2 *dst, const __local short2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort2 *dst, const __local ushort2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int2 *dst, const __local int2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint2 *dst, const __local uint2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long2 *dst, const __local long2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong2 *dst, const __local ulong2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float2 *dst, const __local float2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char3 *dst, const __local char3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar3 *dst, const __local uchar3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short3 *dst, const __local short3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort3 *dst, const __local ushort3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int3 *dst, const __local int3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint3 *dst, const __local uint3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long3 *dst, const __local long3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong3 *dst, const __local ulong3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float3 *dst, const __local float3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char4 *dst, const __local char4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar4 *dst, const __local uchar4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short4 *dst, const __local short4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort4 *dst, const __local ushort4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int4 *dst, const __local int4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint4 *dst, const __local uint4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long4 *dst, const __local long4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong4 *dst, const __local ulong4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float4 *dst, const __local float4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char8 *dst, const __local char8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar8 *dst, const __local uchar8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short8 *dst, const __local short8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort8 *dst, const __local ushort8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int8 *dst, const __local int8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint8 *dst, const __local uint8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long8 *dst, const __local long8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong8 *dst, const __local ulong8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float8 *dst, const __local float8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char16 *dst, const __local char16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar16 *dst, const __local uchar16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short16 *dst, const __local short16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort16 *dst, const __local ushort16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int16 *dst, const __local int16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint16 *dst, const __local uint16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, size_t dst_stride, event_t event);
+#ifdef cl_khr_fp64
+event_t __ovld async_work_group_strided_copy(__local double *dst, const __global double *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double4 *dst, const __global double4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double8 *dst, const __global double8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double16 *dst, const __global double16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double *dst, const __local double *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double2 *dst, const __local double2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double3 *dst, const __local double3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, size_t dst_stride, event_t event);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+event_t __ovld async_work_group_strided_copy(__local half *dst, const __global half *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half3 *dst, const __global half3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half4 *dst, const __global half4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half8 *dst, const __global half8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half16 *dst, const __global half16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half *dst, const __local half *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half2 *dst, const __local half2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half3 *dst, const __local half3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half4 *dst, const __local half4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half8 *dst, const __local half8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half16 *dst, const __local half16 *src, size_t num_elements, size_t dst_stride, event_t event);
+#endif //cl_khr_fp16
+
+/**
+ * Wait for events that identify the
+ * async_work_group_copy operations to
+ * complete. The event objects specified in
+ * event_list will be released after the wait is
+ * performed.
+ * This function must be encountered by all workitems
+ * in a work-group executing the kernel with
+ * the same num_events and event objects specified
+ * in event_list; otherwise the results are undefined.
+ */
+void __ovld wait_group_events(int num_events, event_t *event_list);
+
+/**
+ * Prefetch num_elements * sizeof(gentype)
+ * bytes into the global cache. The prefetch
+ * instruction is applied to a work-item in a workgroup
+ * and does not affect the functional
+ * behavior of the kernel.
+ */
+void __ovld prefetch(const __global char *p, size_t num_elements);
+void __ovld prefetch(const __global uchar *p, size_t num_elements);
+void __ovld prefetch(const __global short *p, size_t num_elements);
+void __ovld prefetch(const __global ushort *p, size_t num_elements);
+void __ovld prefetch(const __global int *p, size_t num_elements);
+void __ovld prefetch(const __global uint *p, size_t num_elements);
+void __ovld prefetch(const __global long *p, size_t num_elements);
+void __ovld prefetch(const __global ulong *p, size_t num_elements);
+void __ovld prefetch(const __global float *p, size_t num_elements);
+void __ovld prefetch(const __global char2 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar2 *p, size_t num_elements);
+void __ovld prefetch(const __global short2 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort2 *p, size_t num_elements);
+void __ovld prefetch(const __global int2 *p, size_t num_elements);
+void __ovld prefetch(const __global uint2 *p, size_t num_elements);
+void __ovld prefetch(const __global long2 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong2 *p, size_t num_elements);
+void __ovld prefetch(const __global float2 *p, size_t num_elements);
+void __ovld prefetch(const __global char3 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar3 *p, size_t num_elements);
+void __ovld prefetch(const __global short3 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort3 *p, size_t num_elements);
+void __ovld prefetch(const __global int3 *p, size_t num_elements);
+void __ovld prefetch(const __global uint3 *p, size_t num_elements);
+void __ovld prefetch(const __global long3 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong3 *p, size_t num_elements);
+void __ovld prefetch(const __global float3 *p, size_t num_elements);
+void __ovld prefetch(const __global char4 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar4 *p, size_t num_elements);
+void __ovld prefetch(const __global short4 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort4 *p, size_t num_elements);
+void __ovld prefetch(const __global int4 *p, size_t num_elements);
+void __ovld prefetch(const __global uint4 *p, size_t num_elements);
+void __ovld prefetch(const __global long4 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong4 *p, size_t num_elements);
+void __ovld prefetch(const __global float4 *p, size_t num_elements);
+void __ovld prefetch(const __global char8 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar8 *p, size_t num_elements);
+void __ovld prefetch(const __global short8 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort8 *p, size_t num_elements);
+void __ovld prefetch(const __global int8 *p, size_t num_elements);
+void __ovld prefetch(const __global uint8 *p, size_t num_elements);
+void __ovld prefetch(const __global long8 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong8 *p, size_t num_elements);
+void __ovld prefetch(const __global float8 *p, size_t num_elements);
+void __ovld prefetch(const __global char16 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar16 *p, size_t num_elements);
+void __ovld prefetch(const __global short16 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort16 *p, size_t num_elements);
+void __ovld prefetch(const __global int16 *p, size_t num_elements);
+void __ovld prefetch(const __global uint16 *p, size_t num_elements);
+void __ovld prefetch(const __global long16 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong16 *p, size_t num_elements);
+void __ovld prefetch(const __global float16 *p, size_t num_elements);
+#ifdef cl_khr_fp64
+void __ovld prefetch(const __global double *p, size_t num_elements);
+void __ovld prefetch(const __global double2 *p, size_t num_elements);
+void __ovld prefetch(const __global double3 *p, size_t num_elements);
+void __ovld prefetch(const __global double4 *p, size_t num_elements);
+void __ovld prefetch(const __global double8 *p, size_t num_elements);
+void __ovld prefetch(const __global double16 *p, size_t num_elements);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+void __ovld prefetch(const __global half *p, size_t num_elements);
+void __ovld prefetch(const __global half2 *p, size_t num_elements);
+void __ovld prefetch(const __global half3 *p, size_t num_elements);
+void __ovld prefetch(const __global half4 *p, size_t num_elements);
+void __ovld prefetch(const __global half8 *p, size_t num_elements);
+void __ovld prefetch(const __global half16 *p, size_t num_elements);
+#endif // cl_khr_fp16
+
+// OpenCL v1.1 s6.11.1, v1.2 s6.12.11 - Atomic Functions
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+#endif
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old + val) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_add(volatile __global int *p, int val);
+unsigned int __ovld atomic_add(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_add(volatile __local int *p, int val);
+unsigned int __ovld atomic_add(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_add(volatile __global int *p, int val);
+unsigned int __ovld atom_add(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_add(volatile __local int *p, int val);
+unsigned int __ovld atom_add(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_add(volatile __global long *p, long val);
+unsigned long __ovld atom_add(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_add(volatile __local long *p, long val);
+unsigned long __ovld atom_add(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old) stored at location pointed by p.
+ * Compute (old - val) and store result at location pointed by p. The function
+ * returns old.
+ */
+int __ovld atomic_sub(volatile __global int *p, int val);
+unsigned int __ovld atomic_sub(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_sub(volatile __local int *p, int val);
+unsigned int __ovld atomic_sub(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_sub(volatile __global int *p, int val);
+unsigned int __ovld atom_sub(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_sub(volatile __local int *p, int val);
+unsigned int __ovld atom_sub(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_sub(volatile __global long *p, long val);
+unsigned long __ovld atom_sub(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_sub(volatile __local long *p, long val);
+unsigned long __ovld atom_sub(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Swaps the old value stored at location p
+ * with new value given by val. Returns old
+ * value.
+ */
+int __ovld atomic_xchg(volatile __global int *p, int val);
+unsigned int __ovld atomic_xchg(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_xchg(volatile __local int *p, int val);
+unsigned int __ovld atomic_xchg(volatile __local unsigned int *p, unsigned int val);
+float __ovld atomic_xchg(volatile __global float *p, float val);
+float __ovld atomic_xchg(volatile __local float *p, float val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_xchg(volatile __global int *p, int val);
+int __ovld atom_xchg(volatile __local int *p, int val);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
+unsigned int __ovld atom_xchg(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_xchg(volatile __global long *p, long val);
+long __ovld atom_xchg(volatile __local long *p, long val);
+unsigned long __ovld atom_xchg(volatile __global unsigned long *p, unsigned long val);
+unsigned long __ovld atom_xchg(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old + 1) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_inc(volatile __global int *p);
+unsigned int __ovld atomic_inc(volatile __global unsigned int *p);
+int __ovld atomic_inc(volatile __local int *p);
+unsigned int __ovld atomic_inc(volatile __local unsigned int *p);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_inc(volatile __global int *p);
+unsigned int __ovld atom_inc(volatile __global unsigned int *p);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_inc(volatile __local int *p);
+unsigned int __ovld atom_inc(volatile __local unsigned int *p);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_inc(volatile __global long *p);
+unsigned long __ovld atom_inc(volatile __global unsigned long *p);
+long __ovld atom_inc(volatile __local long *p);
+unsigned long __ovld atom_inc(volatile __local unsigned long *p);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old - 1) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_dec(volatile __global int *p);
+unsigned int __ovld atomic_dec(volatile __global unsigned int *p);
+int __ovld atomic_dec(volatile __local int *p);
+unsigned int __ovld atomic_dec(volatile __local unsigned int *p);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_dec(volatile __global int *p);
+unsigned int __ovld atom_dec(volatile __global unsigned int *p);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_dec(volatile __local int *p);
+unsigned int __ovld atom_dec(volatile __local unsigned int *p);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_dec(volatile __global long *p);
+unsigned long __ovld atom_dec(volatile __global unsigned long *p);
+long __ovld atom_dec(volatile __local long *p);
+unsigned long __ovld atom_dec(volatile __local unsigned long *p);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old == cmp) ? val : old and store result at
+ * location pointed by p. The function
+ * returns old.
+ */
+int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val);
+unsigned int __ovld atomic_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val);
+int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val);
+unsigned int __ovld atomic_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val);
+unsigned int __ovld atom_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_cmpxchg(volatile __local int *p, int cmp, int val);
+unsigned int __ovld atom_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_cmpxchg(volatile __global long *p, long cmp, long val);
+unsigned long __ovld atom_cmpxchg(volatile __global unsigned long *p, unsigned long cmp, unsigned long val);
+long __ovld atom_cmpxchg(volatile __local long *p, long cmp, long val);
+unsigned long __ovld atom_cmpxchg(volatile __local unsigned long *p, unsigned long cmp, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * min(old, val) and store minimum value at
+ * location pointed by p. The function
+ * returns old.
+ */
+int __ovld atomic_min(volatile __global int *p, int val);
+unsigned int __ovld atomic_min(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_min(volatile __local int *p, int val);
+unsigned int __ovld atomic_min(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_min(volatile __global int *p, int val);
+unsigned int __ovld atom_min(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_min(volatile __local int *p, int val);
+unsigned int __ovld atom_min(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_min(volatile __global long *p, long val);
+unsigned long __ovld atom_min(volatile __global unsigned long *p, unsigned long val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+long __ovld atom_min(volatile __local long *p, long val);
+unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * max(old, val) and store maximum value at
+ * location pointed by p. The function
+ * returns old.
+ */
+int __ovld atomic_max(volatile __global int *p, int val);
+unsigned int __ovld atomic_max(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_max(volatile __local int *p, int val);
+unsigned int __ovld atomic_max(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_max(volatile __global int *p, int val);
+unsigned int __ovld atom_max(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_max(volatile __local int *p, int val);
+unsigned int __ovld atom_max(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_max(volatile __global long *p, long val);
+unsigned long __ovld atom_max(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_max(volatile __local long *p, long val);
+unsigned long __ovld atom_max(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old & val) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_and(volatile __global int *p, int val);
+unsigned int __ovld atomic_and(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_and(volatile __local int *p, int val);
+unsigned int __ovld atomic_and(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_and(volatile __global int *p, int val);
+unsigned int __ovld atom_and(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_and(volatile __local int *p, int val);
+unsigned int __ovld atom_and(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_and(volatile __global long *p, long val);
+unsigned long __ovld atom_and(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_and(volatile __local long *p, long val);
+unsigned long __ovld atom_and(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old | val) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_or(volatile __global int *p, int val);
+unsigned int __ovld atomic_or(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_or(volatile __local int *p, int val);
+unsigned int __ovld atomic_or(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_or(volatile __global int *p, int val);
+unsigned int __ovld atom_or(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_or(volatile __local int *p, int val);
+unsigned int __ovld atom_or(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_or(volatile __global long *p, long val);
+unsigned long __ovld atom_or(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_or(volatile __local long *p, long val);
+unsigned long __ovld atom_or(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old ^ val) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_xor(volatile __global int *p, int val);
+unsigned int __ovld atomic_xor(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_xor(volatile __local int *p, int val);
+unsigned int __ovld atomic_xor(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_xor(volatile __global int *p, int val);
+unsigned int __ovld atom_xor(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_xor(volatile __local int *p, int val);
+unsigned int __ovld atom_xor(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_xor(volatile __global long *p, long val);
+unsigned long __ovld atom_xor(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_xor(volatile __local long *p, long val);
+unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
+#endif
+
+// OpenCL v2.0 s6.13.11 - Atomics Functions
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#ifndef ATOMIC_VAR_INIT
+#define ATOMIC_VAR_INIT(x) (x)
+#endif //ATOMIC_VAR_INIT
+#define ATOMIC_FLAG_INIT 0
+
+// enum values aligned with what clang uses in EmitAtomicExpr()
+typedef enum memory_order
+{
+  memory_order_relaxed,
+  memory_order_acquire,
+  memory_order_release,
+  memory_order_acq_rel,
+  memory_order_seq_cst
+} memory_order;
+
+// double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+#endif
+
+// atomic_init()
+void __ovld atomic_init(volatile atomic_int *object, int value);
+void __ovld atomic_init(volatile atomic_uint *object, uint value);
+void __ovld atomic_init(volatile atomic_float *object, float value);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+void __ovld atomic_init(volatile atomic_long *object, long value);
+void __ovld atomic_init(volatile atomic_ulong *object, ulong value);
+#ifdef cl_khr_fp64
+void __ovld atomic_init(volatile atomic_double *object, double value);
+#endif //cl_khr_fp64
+#endif
+
+// atomic_work_item_fence()
+void __ovld atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope);
+
+// atomic_fetch()
+
+int __ovld atomic_fetch_add(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_add(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_sub(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_sub(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_or(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_or(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_xor(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_xor(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_and(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_and(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_min(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_min(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_min(volatile atomic_uint *object, int operand);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_max(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_max(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_max(volatile atomic_uint *object, int operand);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+long __ovld atomic_fetch_add(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_add(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_sub(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_sub(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_or(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_or(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_xor(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_xor(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_and(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_and(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_min(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, long operand);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_max(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, long operand);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
+#endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+
+// OpenCL v2.0 s6.13.11.7.5:
+// add/sub: atomic type argument can be uintptr_t/intptr_t, value type argument can be ptrdiff_t.
+// or/xor/and/min/max: atomic type argument can be intptr_t/uintptr_t, value type argument can be intptr_t/uintptr_t.
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) 
+uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *object, ptrdiff_t operand);
+uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_sub(volatile atomic_uintptr_t *object, ptrdiff_t operand);
+uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
+
+uintptr_t __ovld atomic_fetch_or(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_or_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_or_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_xor(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_and(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_and_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_and_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_min(volatile atomic_uintptr_t *object, intptr_t opermax);
+uintptr_t __ovld atomic_fetch_min_explicit(volatile atomic_uintptr_t *object, intptr_t opermax, memory_order minder);
+uintptr_t __ovld atomic_fetch_min_explicit(volatile atomic_uintptr_t *object, intptr_t opermax, memory_order minder, memory_scope scope);
+uintptr_t __ovld atomic_fetch_max(volatile atomic_uintptr_t *object, intptr_t opermax);
+uintptr_t __ovld atomic_fetch_max_explicit(volatile atomic_uintptr_t *object, intptr_t opermax, memory_order minder);
+uintptr_t __ovld atomic_fetch_max_explicit(volatile atomic_uintptr_t *object, intptr_t opermax, memory_order minder, memory_scope scope);
+
+intptr_t __ovld atomic_fetch_or(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_or_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_or_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_xor(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_and(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_and_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_and_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_min(volatile atomic_intptr_t *object, uintptr_t opermax);
+intptr_t __ovld atomic_fetch_min_explicit(volatile atomic_intptr_t *object, uintptr_t opermax, memory_order minder);
+intptr_t __ovld atomic_fetch_min_explicit(volatile atomic_intptr_t *object, uintptr_t opermax, memory_order minder, memory_scope scope);
+intptr_t __ovld atomic_fetch_max(volatile atomic_intptr_t *object, uintptr_t opermax);
+intptr_t __ovld atomic_fetch_max_explicit(volatile atomic_intptr_t *object, uintptr_t opermax, memory_order minder);
+intptr_t __ovld atomic_fetch_max_explicit(volatile atomic_intptr_t *object, uintptr_t opermax, memory_order minder, memory_scope scope);
+#endif
+
+// atomic_store()
+
+void __ovld atomic_store(volatile atomic_int *object, int desired);
+void __ovld atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope);
+void __ovld atomic_store(volatile atomic_uint *object, uint desired);
+void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope);
+void __ovld atomic_store(volatile atomic_float *object, float desired);
+void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#ifdef cl_khr_fp64
+void __ovld atomic_store(volatile atomic_double *object, double desired);
+void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope);
+#endif //cl_khr_fp64
+void __ovld atomic_store(volatile atomic_long *object, long desired);
+void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope);
+void __ovld atomic_store(volatile atomic_ulong *object, ulong desired);
+void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope);
+#endif
+
+// atomic_load()
+
+int __ovld atomic_load(volatile atomic_int *object);
+int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order);
+int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order, memory_scope scope);
+uint __ovld atomic_load(volatile atomic_uint *object);
+uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order);
+uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order, memory_scope scope);
+float __ovld atomic_load(volatile atomic_float *object);
+float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order);
+float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order, memory_scope scope);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#ifdef cl_khr_fp64
+double __ovld atomic_load(volatile atomic_double *object);
+double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order);
+double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order, memory_scope scope);
+#endif //cl_khr_fp64
+long __ovld atomic_load(volatile atomic_long *object);
+long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order);
+long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order, memory_scope scope);
+ulong __ovld atomic_load(volatile atomic_ulong *object);
+ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order);
+ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order, memory_scope scope);
+#endif
+
+// atomic_exchange()
+
+int __ovld atomic_exchange(volatile atomic_int *object, int desired);
+int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order);
+int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope);
+uint __ovld atomic_exchange(volatile atomic_uint *object, uint desired);
+uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order);
+uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope);
+float __ovld atomic_exchange(volatile atomic_float *object, float desired);
+float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order);
+float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#ifdef cl_khr_fp64
+double __ovld atomic_exchange(volatile atomic_double *object, double desired);
+double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order);
+double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope);
+#endif //cl_khr_fp64
+long __ovld atomic_exchange(volatile atomic_long *object, long desired);
+long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order);
+long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope);
+ulong __ovld atomic_exchange(volatile atomic_ulong *object, ulong desired);
+ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order);
+ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope);
+#endif
+
+// atomic_compare_exchange_strong() and atomic_compare_exchange_weak()
+
+bool __ovld atomic_compare_exchange_strong(volatile atomic_int *object, int *expected, int desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *object, int *expected,
+                                                                                 int desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *object, int *expected,
+                                                                                 int desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_strong(volatile atomic_uint *object, uint *expected, uint desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *object, uint *expected,
+                                                                                 uint desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *object, uint *expected,
+                                                                                 uint desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_int *object, int *expected, int desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *object, int *expected,
+                                                                                 int desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *object, int *expected,
+                                                                                 int desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_uint *object, uint *expected, uint desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *object, uint *expected,
+                                                                                 uint desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *object, uint *expected,
+                                                                                 uint desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_strong(volatile atomic_float *object, float *expected, float desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *object, float *expected,
+                                                                                 float desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *object, float *expected,
+                                                                                 float desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_float *object, float *expected, float desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *object, float *expected,
+                                                                                 float desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *object, float *expected,
+                                                                                 float desired, memory_order success, memory_order failure, memory_scope scope);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#ifdef cl_khr_fp64
+bool __ovld atomic_compare_exchange_strong(volatile atomic_double *object, double *expected, double desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_double *object, double *expected, double desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure, memory_scope scope);
+#endif //cl_khr_fp64
+bool __ovld atomic_compare_exchange_strong(volatile atomic_long *object, long *expected, long desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_long *object, long *expected, long desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_strong(volatile atomic_ulong *object, ulong *expected, ulong desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_ulong *object, ulong *expected, ulong desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure, memory_scope scope);
+#endif
+
+// atomic_flag_test_and_set() and atomic_flag_clear()
+
+bool __ovld atomic_flag_test_and_set(volatile atomic_flag *object);
+bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order);
+bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+void __ovld atomic_flag_clear(volatile atomic_flag *object);
+void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order);
+void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.12, v1.2 s6.12.12, v2.0 s6.13.12 - Miscellaneous Vector Functions
+
+/**
+ * The shuffle and shuffle2 built-in functions construct
+ * a permutation of elements from one or two input
+ * vectors respectively that are of the same type,
+ * returning a vector with the same element type as the
+ * input and length that is the same as the shuffle mask.
+ * The size of each element in the mask must match the
+ * size of each element in the result. For shuffle, only
+ * the ilogb(2m-1) least significant bits of each mask
+ * element are considered. For shuffle2, only the
+ * ilogb(2m-1)+1 least significant bits of each mask
+ * element are considered. Other bits in the mask shall
+ * be ignored.
+ * The elements of the input vectors are numbered from
+ * left to right across one or both of the vectors. For this
+ * purpose, the number of elements in a vector is given
+ * by vec_step(gentypem). The shuffle mask operand
+ * specifies, for each element of the result vector, which
+ * element of the one or two input vectors the result
+ * element gets.
+ * Examples:
+ * uint4 mask = (uint4)(3, 2,
+ * 1, 0);
+ * float4 a;
+ * float4 r = shuffle(a, mask);
+ * // r.s0123 = a.wzyx
+ * uint8 mask = (uint8)(0, 1, 2, 3,
+ * 4, 5, 6, 7);
+ * float4 a, b;
+ * float8 r = shuffle2(a, b, mask);
+ * // r.s0123 = a.xyzw
+ * // r.s4567 = b.xyzw
+ * uint4 mask;
+ * float8 a;
+ * float4 b;
+ * b = shuffle(a, mask);
+ * Examples that are not valid are:
+ * uint8 mask;
+ * short16 a;
+ * short8 b;
+ * b = shuffle(a, mask); <- not valid
+ */
+char2 __ovld __cnfn shuffle(char2 x, uchar2 mask);
+char2 __ovld __cnfn shuffle(char4 x, uchar2 mask);
+char2 __ovld __cnfn shuffle(char8 x, uchar2 mask);
+char2 __ovld __cnfn shuffle(char16 x, uchar2 mask);
+
+uchar2 __ovld __cnfn shuffle(uchar2 x, uchar2 mask);
+uchar2 __ovld __cnfn shuffle(uchar4 x, uchar2 mask);
+uchar2 __ovld __cnfn shuffle(uchar8 x, uchar2 mask);
+uchar2 __ovld __cnfn shuffle(uchar16 x, uchar2 mask);
+
+short2 __ovld __cnfn shuffle(short2 x, ushort2 mask);
+short2 __ovld __cnfn shuffle(short4 x, ushort2 mask);
+short2 __ovld __cnfn shuffle(short8 x, ushort2 mask);
+short2 __ovld __cnfn shuffle(short16 x, ushort2 mask);
+
+ushort2 __ovld __cnfn shuffle(ushort2 x, ushort2 mask);
+ushort2 __ovld __cnfn shuffle(ushort4 x, ushort2 mask);
+ushort2 __ovld __cnfn shuffle(ushort8 x, ushort2 mask);
+ushort2 __ovld __cnfn shuffle(ushort16 x, ushort2 mask);
+
+int2 __ovld __cnfn shuffle(int2 x, uint2 mask);
+int2 __ovld __cnfn shuffle(int4 x, uint2 mask);
+int2 __ovld __cnfn shuffle(int8 x, uint2 mask);
+int2 __ovld __cnfn shuffle(int16 x, uint2 mask);
+
+uint2 __ovld __cnfn shuffle(uint2 x, uint2 mask);
+uint2 __ovld __cnfn shuffle(uint4 x, uint2 mask);
+uint2 __ovld __cnfn shuffle(uint8 x, uint2 mask);
+uint2 __ovld __cnfn shuffle(uint16 x, uint2 mask);
+
+long2 __ovld __cnfn shuffle(long2 x, ulong2 mask);
+long2 __ovld __cnfn shuffle(long4 x, ulong2 mask);
+long2 __ovld __cnfn shuffle(long8 x, ulong2 mask);
+long2 __ovld __cnfn shuffle(long16 x, ulong2 mask);
+
+ulong2 __ovld __cnfn shuffle(ulong2 x, ulong2 mask);
+ulong2 __ovld __cnfn shuffle(ulong4 x, ulong2 mask);
+ulong2 __ovld __cnfn shuffle(ulong8 x, ulong2 mask);
+ulong2 __ovld __cnfn shuffle(ulong16 x, ulong2 mask);
+
+float2 __ovld __cnfn shuffle(float2 x, uint2 mask);
+float2 __ovld __cnfn shuffle(float4 x, uint2 mask);
+float2 __ovld __cnfn shuffle(float8 x, uint2 mask);
+float2 __ovld __cnfn shuffle(float16 x, uint2 mask);
+
+char4 __ovld __cnfn shuffle(char2 x, uchar4 mask);
+char4 __ovld __cnfn shuffle(char4 x, uchar4 mask);
+char4 __ovld __cnfn shuffle(char8 x, uchar4 mask);
+char4 __ovld __cnfn shuffle(char16 x, uchar4 mask);
+
+uchar4 __ovld __cnfn shuffle(uchar2 x, uchar4 mask);
+uchar4 __ovld __cnfn shuffle(uchar4 x, uchar4 mask);
+uchar4 __ovld __cnfn shuffle(uchar8 x, uchar4 mask);
+uchar4 __ovld __cnfn shuffle(uchar16 x, uchar4 mask);
+
+short4 __ovld __cnfn shuffle(short2 x, ushort4 mask);
+short4 __ovld __cnfn shuffle(short4 x, ushort4 mask);
+short4 __ovld __cnfn shuffle(short8 x, ushort4 mask);
+short4 __ovld __cnfn shuffle(short16 x, ushort4 mask);
+
+ushort4 __ovld __cnfn shuffle(ushort2 x, ushort4 mask);
+ushort4 __ovld __cnfn shuffle(ushort4 x, ushort4 mask);
+ushort4 __ovld __cnfn shuffle(ushort8 x, ushort4 mask);
+ushort4 __ovld __cnfn shuffle(ushort16 x, ushort4 mask);
+
+int4 __ovld __cnfn shuffle(int2 x, uint4 mask);
+int4 __ovld __cnfn shuffle(int4 x, uint4 mask);
+int4 __ovld __cnfn shuffle(int8 x, uint4 mask);
+int4 __ovld __cnfn shuffle(int16 x, uint4 mask);
+
+uint4 __ovld __cnfn shuffle(uint2 x, uint4 mask);
+uint4 __ovld __cnfn shuffle(uint4 x, uint4 mask);
+uint4 __ovld __cnfn shuffle(uint8 x, uint4 mask);
+uint4 __ovld __cnfn shuffle(uint16 x, uint4 mask);
+
+long4 __ovld __cnfn shuffle(long2 x, ulong4 mask);
+long4 __ovld __cnfn shuffle(long4 x, ulong4 mask);
+long4 __ovld __cnfn shuffle(long8 x, ulong4 mask);
+long4 __ovld __cnfn shuffle(long16 x, ulong4 mask);
+
+ulong4 __ovld __cnfn shuffle(ulong2 x, ulong4 mask);
+ulong4 __ovld __cnfn shuffle(ulong4 x, ulong4 mask);
+ulong4 __ovld __cnfn shuffle(ulong8 x, ulong4 mask);
+ulong4 __ovld __cnfn shuffle(ulong16 x, ulong4 mask);
+
+float4 __ovld __cnfn shuffle(float2 x, uint4 mask);
+float4 __ovld __cnfn shuffle(float4 x, uint4 mask);
+float4 __ovld __cnfn shuffle(float8 x, uint4 mask);
+float4 __ovld __cnfn shuffle(float16 x, uint4 mask);
+
+char8 __ovld __cnfn shuffle(char2 x, uchar8 mask);
+char8 __ovld __cnfn shuffle(char4 x, uchar8 mask);
+char8 __ovld __cnfn shuffle(char8 x, uchar8 mask);
+char8 __ovld __cnfn shuffle(char16 x, uchar8 mask);
+
+uchar8 __ovld __cnfn shuffle(uchar2 x, uchar8 mask);
+uchar8 __ovld __cnfn shuffle(uchar4 x, uchar8 mask);
+uchar8 __ovld __cnfn shuffle(uchar8 x, uchar8 mask);
+uchar8 __ovld __cnfn shuffle(uchar16 x, uchar8 mask);
+
+short8 __ovld __cnfn shuffle(short2 x, ushort8 mask);
+short8 __ovld __cnfn shuffle(short4 x, ushort8 mask);
+short8 __ovld __cnfn shuffle(short8 x, ushort8 mask);
+short8 __ovld __cnfn shuffle(short16 x, ushort8 mask);
+
+ushort8 __ovld __cnfn shuffle(ushort2 x, ushort8 mask);
+ushort8 __ovld __cnfn shuffle(ushort4 x, ushort8 mask);
+ushort8 __ovld __cnfn shuffle(ushort8 x, ushort8 mask);
+ushort8 __ovld __cnfn shuffle(ushort16 x, ushort8 mask);
+
+int8 __ovld __cnfn shuffle(int2 x, uint8 mask);
+int8 __ovld __cnfn shuffle(int4 x, uint8 mask);
+int8 __ovld __cnfn shuffle(int8 x, uint8 mask);
+int8 __ovld __cnfn shuffle(int16 x, uint8 mask);
+
+uint8 __ovld __cnfn shuffle(uint2 x, uint8 mask);
+uint8 __ovld __cnfn shuffle(uint4 x, uint8 mask);
+uint8 __ovld __cnfn shuffle(uint8 x, uint8 mask);
+uint8 __ovld __cnfn shuffle(uint16 x, uint8 mask);
+
+long8 __ovld __cnfn shuffle(long2 x, ulong8 mask);
+long8 __ovld __cnfn shuffle(long4 x, ulong8 mask);
+long8 __ovld __cnfn shuffle(long8 x, ulong8 mask);
+long8 __ovld __cnfn shuffle(long16 x, ulong8 mask);
+
+ulong8 __ovld __cnfn shuffle(ulong2 x, ulong8 mask);
+ulong8 __ovld __cnfn shuffle(ulong4 x, ulong8 mask);
+ulong8 __ovld __cnfn shuffle(ulong8 x, ulong8 mask);
+ulong8 __ovld __cnfn shuffle(ulong16 x, ulong8 mask);
+
+float8 __ovld __cnfn shuffle(float2 x, uint8 mask);
+float8 __ovld __cnfn shuffle(float4 x, uint8 mask);
+float8 __ovld __cnfn shuffle(float8 x, uint8 mask);
+float8 __ovld __cnfn shuffle(float16 x, uint8 mask);
+
+char16 __ovld __cnfn shuffle(char2 x, uchar16 mask);
+char16 __ovld __cnfn shuffle(char4 x, uchar16 mask);
+char16 __ovld __cnfn shuffle(char8 x, uchar16 mask);
+char16 __ovld __cnfn shuffle(char16 x, uchar16 mask);
+
+uchar16 __ovld __cnfn shuffle(uchar2 x, uchar16 mask);
+uchar16 __ovld __cnfn shuffle(uchar4 x, uchar16 mask);
+uchar16 __ovld __cnfn shuffle(uchar8 x, uchar16 mask);
+uchar16 __ovld __cnfn shuffle(uchar16 x, uchar16 mask);
+
+short16 __ovld __cnfn shuffle(short2 x, ushort16 mask);
+short16 __ovld __cnfn shuffle(short4 x, ushort16 mask);
+short16 __ovld __cnfn shuffle(short8 x, ushort16 mask);
+short16 __ovld __cnfn shuffle(short16 x, ushort16 mask);
+
+ushort16 __ovld __cnfn shuffle(ushort2 x, ushort16 mask);
+ushort16 __ovld __cnfn shuffle(ushort4 x, ushort16 mask);
+ushort16 __ovld __cnfn shuffle(ushort8 x, ushort16 mask);
+ushort16 __ovld __cnfn shuffle(ushort16 x, ushort16 mask);
+
+int16 __ovld __cnfn shuffle(int2 x, uint16 mask);
+int16 __ovld __cnfn shuffle(int4 x, uint16 mask);
+int16 __ovld __cnfn shuffle(int8 x, uint16 mask);
+int16 __ovld __cnfn shuffle(int16 x, uint16 mask);
+
+uint16 __ovld __cnfn shuffle(uint2 x, uint16 mask);
+uint16 __ovld __cnfn shuffle(uint4 x, uint16 mask);
+uint16 __ovld __cnfn shuffle(uint8 x, uint16 mask);
+uint16 __ovld __cnfn shuffle(uint16 x, uint16 mask);
+
+long16 __ovld __cnfn shuffle(long2 x, ulong16 mask);
+long16 __ovld __cnfn shuffle(long4 x, ulong16 mask);
+long16 __ovld __cnfn shuffle(long8 x, ulong16 mask);
+long16 __ovld __cnfn shuffle(long16 x, ulong16 mask);
+
+ulong16 __ovld __cnfn shuffle(ulong2 x, ulong16 mask);
+ulong16 __ovld __cnfn shuffle(ulong4 x, ulong16 mask);
+ulong16 __ovld __cnfn shuffle(ulong8 x, ulong16 mask);
+ulong16 __ovld __cnfn shuffle(ulong16 x, ulong16 mask);
+
+float16 __ovld __cnfn shuffle(float2 x, uint16 mask);
+float16 __ovld __cnfn shuffle(float4 x, uint16 mask);
+float16 __ovld __cnfn shuffle(float8 x, uint16 mask);
+float16 __ovld __cnfn shuffle(float16 x, uint16 mask);
+
+#ifdef cl_khr_fp64
+double2 __ovld __cnfn shuffle(double2 x, ulong2 mask);
+double2 __ovld __cnfn shuffle(double4 x, ulong2 mask);
+double2 __ovld __cnfn shuffle(double8 x, ulong2 mask);
+double2 __ovld __cnfn shuffle(double16 x, ulong2 mask);
+
+double4 __ovld __cnfn shuffle(double2 x, ulong4 mask);
+double4 __ovld __cnfn shuffle(double4 x, ulong4 mask);
+double4 __ovld __cnfn shuffle(double8 x, ulong4 mask);
+double4 __ovld __cnfn shuffle(double16 x, ulong4 mask);
+
+double8 __ovld __cnfn shuffle(double2 x, ulong8 mask);
+double8 __ovld __cnfn shuffle(double4 x, ulong8 mask);
+double8 __ovld __cnfn shuffle(double8 x, ulong8 mask);
+double8 __ovld __cnfn shuffle(double16 x, ulong8 mask);
+
+double16 __ovld __cnfn shuffle(double2 x, ulong16 mask);
+double16 __ovld __cnfn shuffle(double4 x, ulong16 mask);
+double16 __ovld __cnfn shuffle(double8 x, ulong16 mask);
+double16 __ovld __cnfn shuffle(double16 x, ulong16 mask);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half2 __ovld __cnfn shuffle(half2 x, ushort2 mask);
+half2 __ovld __cnfn shuffle(half4 x, ushort2 mask);
+half2 __ovld __cnfn shuffle(half8 x, ushort2 mask);
+half2 __ovld __cnfn shuffle(half16 x, ushort2 mask);
+
+half4 __ovld __cnfn shuffle(half2 x, ushort4 mask);
+half4 __ovld __cnfn shuffle(half4 x, ushort4 mask);
+half4 __ovld __cnfn shuffle(half8 x, ushort4 mask);
+half4 __ovld __cnfn shuffle(half16 x, ushort4 mask);
+
+half8 __ovld __cnfn shuffle(half2 x, ushort8 mask);
+half8 __ovld __cnfn shuffle(half4 x, ushort8 mask);
+half8 __ovld __cnfn shuffle(half8 x, ushort8 mask);
+half8 __ovld __cnfn shuffle(half16 x, ushort8 mask);
+
+half16 __ovld __cnfn shuffle(half2 x, ushort16 mask);
+half16 __ovld __cnfn shuffle(half4 x, ushort16 mask);
+half16 __ovld __cnfn shuffle(half8 x, ushort16 mask);
+half16 __ovld __cnfn shuffle(half16 x, ushort16 mask);
+#endif //cl_khr_fp16
+
+char2 __ovld __cnfn shuffle2(char2 x, char2 y, uchar2 mask);
+char2 __ovld __cnfn shuffle2(char4 x, char4 y, uchar2 mask);
+char2 __ovld __cnfn shuffle2(char8 x, char8 y, uchar2 mask);
+char2 __ovld __cnfn shuffle2(char16 x, char16 y, uchar2 mask);
+
+uchar2 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar2 mask);
+uchar2 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar2 mask);
+uchar2 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar2 mask);
+uchar2 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar2 mask);
+
+short2 __ovld __cnfn shuffle2(short2 x, short2 y, ushort2 mask);
+short2 __ovld __cnfn shuffle2(short4 x, short4 y, ushort2 mask);
+short2 __ovld __cnfn shuffle2(short8 x, short8 y, ushort2 mask);
+short2 __ovld __cnfn shuffle2(short16 x, short16 y, ushort2 mask);
+
+ushort2 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort2 mask);
+ushort2 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort2 mask);
+ushort2 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort2 mask);
+ushort2 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort2 mask);
+
+int2 __ovld __cnfn shuffle2(int2 x, int2 y, uint2 mask);
+int2 __ovld __cnfn shuffle2(int4 x, int4 y, uint2 mask);
+int2 __ovld __cnfn shuffle2(int8 x, int8 y, uint2 mask);
+int2 __ovld __cnfn shuffle2(int16 x, int16 y, uint2 mask);
+
+uint2 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint2 mask);
+uint2 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint2 mask);
+uint2 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint2 mask);
+uint2 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint2 mask);
+
+long2 __ovld __cnfn shuffle2(long2 x, long2 y, ulong2 mask);
+long2 __ovld __cnfn shuffle2(long4 x, long4 y, ulong2 mask);
+long2 __ovld __cnfn shuffle2(long8 x, long8 y, ulong2 mask);
+long2 __ovld __cnfn shuffle2(long16 x, long16 y, ulong2 mask);
+
+ulong2 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong2 mask);
+ulong2 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong2 mask);
+ulong2 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong2 mask);
+ulong2 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong2 mask);
+
+float2 __ovld __cnfn shuffle2(float2 x, float2 y, uint2 mask);
+float2 __ovld __cnfn shuffle2(float4 x, float4 y, uint2 mask);
+float2 __ovld __cnfn shuffle2(float8 x, float8 y, uint2 mask);
+float2 __ovld __cnfn shuffle2(float16 x, float16 y, uint2 mask);
+
+char4 __ovld __cnfn shuffle2(char2 x, char2 y, uchar4 mask);
+char4 __ovld __cnfn shuffle2(char4 x, char4 y, uchar4 mask);
+char4 __ovld __cnfn shuffle2(char8 x, char8 y, uchar4 mask);
+char4 __ovld __cnfn shuffle2(char16 x, char16 y, uchar4 mask);
+
+uchar4 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar4 mask);
+uchar4 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar4 mask);
+uchar4 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar4 mask);
+uchar4 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar4 mask);
+
+short4 __ovld __cnfn shuffle2(short2 x, short2 y, ushort4 mask);
+short4 __ovld __cnfn shuffle2(short4 x, short4 y, ushort4 mask);
+short4 __ovld __cnfn shuffle2(short8 x, short8 y, ushort4 mask);
+short4 __ovld __cnfn shuffle2(short16 x, short16 y, ushort4 mask);
+
+ushort4 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort4 mask);
+ushort4 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort4 mask);
+ushort4 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort4 mask);
+ushort4 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort4 mask);
+
+int4 __ovld __cnfn shuffle2(int2 x, int2 y, uint4 mask);
+int4 __ovld __cnfn shuffle2(int4 x, int4 y, uint4 mask);
+int4 __ovld __cnfn shuffle2(int8 x, int8 y, uint4 mask);
+int4 __ovld __cnfn shuffle2(int16 x, int16 y, uint4 mask);
+
+uint4 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint4 mask);
+uint4 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint4 mask);
+uint4 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint4 mask);
+uint4 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint4 mask);
+
+long4 __ovld __cnfn shuffle2(long2 x, long2 y, ulong4 mask);
+long4 __ovld __cnfn shuffle2(long4 x, long4 y, ulong4 mask);
+long4 __ovld __cnfn shuffle2(long8 x, long8 y, ulong4 mask);
+long4 __ovld __cnfn shuffle2(long16 x, long16 y, ulong4 mask);
+
+ulong4 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong4 mask);
+ulong4 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong4 mask);
+ulong4 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong4 mask);
+ulong4 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong4 mask);
+
+float4 __ovld __cnfn shuffle2(float2 x, float2 y, uint4 mask);
+float4 __ovld __cnfn shuffle2(float4 x, float4 y, uint4 mask);
+float4 __ovld __cnfn shuffle2(float8 x, float8 y, uint4 mask);
+float4 __ovld __cnfn shuffle2(float16 x, float16 y, uint4 mask);
+
+char8 __ovld __cnfn shuffle2(char2 x, char2 y, uchar8 mask);
+char8 __ovld __cnfn shuffle2(char4 x, char4 y, uchar8 mask);
+char8 __ovld __cnfn shuffle2(char8 x, char8 y, uchar8 mask);
+char8 __ovld __cnfn shuffle2(char16 x, char16 y, uchar8 mask);
+
+uchar8 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar8 mask);
+uchar8 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar8 mask);
+uchar8 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar8 mask);
+uchar8 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar8 mask);
+
+short8 __ovld __cnfn shuffle2(short2 x, short2 y, ushort8 mask);
+short8 __ovld __cnfn shuffle2(short4 x, short4 y, ushort8 mask);
+short8 __ovld __cnfn shuffle2(short8 x, short8 y, ushort8 mask);
+short8 __ovld __cnfn shuffle2(short16 x, short16 y, ushort8 mask);
+
+ushort8 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort8 mask);
+ushort8 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort8 mask);
+ushort8 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort8 mask);
+ushort8 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort8 mask);
+
+int8 __ovld __cnfn shuffle2(int2 x, int2 y, uint8 mask);
+int8 __ovld __cnfn shuffle2(int4 x, int4 y, uint8 mask);
+int8 __ovld __cnfn shuffle2(int8 x, int8 y, uint8 mask);
+int8 __ovld __cnfn shuffle2(int16 x, int16 y, uint8 mask);
+
+uint8 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint8 mask);
+uint8 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint8 mask);
+uint8 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint8 mask);
+uint8 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint8 mask);
+
+long8 __ovld __cnfn shuffle2(long2 x, long2 y, ulong8 mask);
+long8 __ovld __cnfn shuffle2(long4 x, long4 y, ulong8 mask);
+long8 __ovld __cnfn shuffle2(long8 x, long8 y, ulong8 mask);
+long8 __ovld __cnfn shuffle2(long16 x, long16 y, ulong8 mask);
+
+ulong8 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong8 mask);
+ulong8 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong8 mask);
+ulong8 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong8 mask);
+ulong8 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong8 mask);
+
+float8 __ovld __cnfn shuffle2(float2 x, float2 y, uint8 mask);
+float8 __ovld __cnfn shuffle2(float4 x, float4 y, uint8 mask);
+float8 __ovld __cnfn shuffle2(float8 x, float8 y, uint8 mask);
+float8 __ovld __cnfn shuffle2(float16 x, float16 y, uint8 mask);
+
+char16 __ovld __cnfn shuffle2(char2 x, char2 y, uchar16 mask);
+char16 __ovld __cnfn shuffle2(char4 x, char4 y, uchar16 mask);
+char16 __ovld __cnfn shuffle2(char8 x, char8 y, uchar16 mask);
+char16 __ovld __cnfn shuffle2(char16 x, char16 y, uchar16 mask);
+
+uchar16 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar16 mask);
+uchar16 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar16 mask);
+uchar16 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar16 mask);
+uchar16 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar16 mask);
+
+short16 __ovld __cnfn shuffle2(short2 x, short2 y, ushort16 mask);
+short16 __ovld __cnfn shuffle2(short4 x, short4 y, ushort16 mask);
+short16 __ovld __cnfn shuffle2(short8 x, short8 y, ushort16 mask);
+short16 __ovld __cnfn shuffle2(short16 x, short16 y, ushort16 mask);
+
+ushort16 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort16 mask);
+ushort16 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort16 mask);
+ushort16 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort16 mask);
+ushort16 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort16 mask);
+
+int16 __ovld __cnfn shuffle2(int2 x, int2 y, uint16 mask);
+int16 __ovld __cnfn shuffle2(int4 x, int4 y, uint16 mask);
+int16 __ovld __cnfn shuffle2(int8 x, int8 y, uint16 mask);
+int16 __ovld __cnfn shuffle2(int16 x, int16 y, uint16 mask);
+
+uint16 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint16 mask);
+uint16 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint16 mask);
+uint16 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint16 mask);
+uint16 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint16 mask);
+
+long16 __ovld __cnfn shuffle2(long2 x, long2 y, ulong16 mask);
+long16 __ovld __cnfn shuffle2(long4 x, long4 y, ulong16 mask);
+long16 __ovld __cnfn shuffle2(long8 x, long8 y, ulong16 mask);
+long16 __ovld __cnfn shuffle2(long16 x, long16 y, ulong16 mask);
+
+ulong16 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong16 mask);
+ulong16 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong16 mask);
+ulong16 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong16 mask);
+ulong16 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong16 mask);
+
+float16 __ovld __cnfn shuffle2(float2 x, float2 y, uint16 mask);
+float16 __ovld __cnfn shuffle2(float4 x, float4 y, uint16 mask);
+float16 __ovld __cnfn shuffle2(float8 x, float8 y, uint16 mask);
+float16 __ovld __cnfn shuffle2(float16 x, float16 y, uint16 mask);
+
+#ifdef cl_khr_fp64
+double2 __ovld __cnfn shuffle2(double2 x, double2 y, ulong2 mask);
+double2 __ovld __cnfn shuffle2(double4 x, double4 y, ulong2 mask);
+double2 __ovld __cnfn shuffle2(double8 x, double8 y, ulong2 mask);
+double2 __ovld __cnfn shuffle2(double16 x, double16 y, ulong2 mask);
+
+double4 __ovld __cnfn shuffle2(double2 x, double2 y, ulong4 mask);
+double4 __ovld __cnfn shuffle2(double4 x, double4 y, ulong4 mask);
+double4 __ovld __cnfn shuffle2(double8 x, double8 y, ulong4 mask);
+double4 __ovld __cnfn shuffle2(double16 x, double16 y, ulong4 mask);
+
+double8 __ovld __cnfn shuffle2(double2 x, double2 y, ulong8 mask);
+double8 __ovld __cnfn shuffle2(double4 x, double4 y, ulong8 mask);
+double8 __ovld __cnfn shuffle2(double8 x, double8 y, ulong8 mask);
+double8 __ovld __cnfn shuffle2(double16 x, double16 y, ulong8 mask);
+
+double16 __ovld __cnfn shuffle2(double2 x, double2 y, ulong16 mask);
+double16 __ovld __cnfn shuffle2(double4 x, double4 y, ulong16 mask);
+double16 __ovld __cnfn shuffle2(double8 x, double8 y, ulong16 mask);
+double16 __ovld __cnfn shuffle2(double16 x, double16 y, ulong16 mask);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half2 __ovld __cnfn shuffle2(half2 x, half2 y, ushort2 mask);
+half2 __ovld __cnfn shuffle2(half4 x, half4 y, ushort2 mask);
+half2 __ovld __cnfn shuffle2(half8 x, half8 y, ushort2 mask);
+half2 __ovld __cnfn shuffle2(half16 x, half16 y, ushort2 mask);
+
+half4 __ovld __cnfn shuffle2(half2 x, half2 y, ushort4 mask);
+half4 __ovld __cnfn shuffle2(half4 x, half4 y, ushort4 mask);
+half4 __ovld __cnfn shuffle2(half8 x, half8 y, ushort4 mask);
+half4 __ovld __cnfn shuffle2(half16 x, half16 y, ushort4 mask);
+
+half8 __ovld __cnfn shuffle2(half2 x, half2 y, ushort8 mask);
+half8 __ovld __cnfn shuffle2(half4 x, half4 y, ushort8 mask);
+half8 __ovld __cnfn shuffle2(half8 x, half8 y, ushort8 mask);
+half8 __ovld __cnfn shuffle2(half16 x, half16 y, ushort8 mask);
+
+half16 __ovld __cnfn shuffle2(half2 x, half2 y, ushort16 mask);
+half16 __ovld __cnfn shuffle2(half4 x, half4 y, ushort16 mask);
+half16 __ovld __cnfn shuffle2(half8 x, half8 y, ushort16 mask);
+half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask);
+#endif //cl_khr_fp16
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
+// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
+
+int printf(__constant const char* st, ...);
+#endif
+
+// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
+
+// These values need to match the runtime equivalent
+//
+// Addressing Mode.
+//
+#define CLK_ADDRESS_NONE                0
+#define CLK_ADDRESS_CLAMP_TO_EDGE       2
+#define CLK_ADDRESS_CLAMP               4
+#define CLK_ADDRESS_REPEAT              6
+#define CLK_ADDRESS_MIRRORED_REPEAT     8
+
+//
+// Coordination Normalization
+//
+#define CLK_NORMALIZED_COORDS_FALSE     0
+#define CLK_NORMALIZED_COORDS_TRUE      1
+
+//
+// Filtering Mode.
+//
+#define CLK_FILTER_NEAREST              0x10
+#define CLK_FILTER_LINEAR               0x20
+
+#ifdef cl_khr_gl_msaa_sharing
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
+#endif //cl_khr_gl_msaa_sharing
+
+/**
+ * Use the coordinate (coord.xy) to do an element lookup in
+ * the 2D image object specified by image.
+ *
+ * Use the coordinate (coord.x, coord.y, coord.z) to do
+ * an element lookup in the 3D image object specified
+ * by image. coord.w is ignored.
+ *
+ * Use the coordinate (coord.z) to index into the
+ * 2D image array object specified by image_array
+ * and (coord.x, coord.y) to do an element lookup in
+ * the 2D image object specified by image.
+ *
+ * Use the coordinate (x) to do an element lookup in
+ * the 1D image object specified by image.
+ *
+ * Use the coordinate (coord.y) to index into the
+ * 1D image array object specified by image_array
+ * and (coord.x) to do an element lookup in
+ * the 1D image object specified by image.
+ *
+ * Use the coordinate (cood.xy) and sample to do an
+ * element lookup in the 2D multi-sample image specified
+ * by image.
+ *
+ * Use coord.xy and sample to do an element
+ * lookup in the 2D multi-sample image layer
+ * identified by index coord.z in the 2D multi-sample
+ * image array specified by image.
+ *
+ * For mipmap images, use the mip-level specified by
+ * the Level-of-Detail (lod) or use gradients for LOD
+ * computation.
+ *
+ * read_imagef returns floating-point values in the
+ * range [0.0 ... 1.0] for image objects created with
+ * image_channel_data_type set to one of the predefined
+ * packed formats or CL_UNORM_INT8, or
+ * CL_UNORM_INT16.
+ *
+ * read_imagef returns floating-point values in the
+ * range [-1.0 ... 1.0] for image objects created with
+ * image_channel_data_type set to CL_SNORM_INT8,
+ * or CL_SNORM_INT16.
+ *
+ * read_imagef returns floating-point values for image
+ * objects created with image_channel_data_type set to
+ * CL_HALF_FLOAT or CL_FLOAT.
+ *
+ * read_imagei and read_imageui return
+ * unnormalized signed integer and unsigned integer
+ * values respectively. Each channel will be stored in a
+ * 32-bit integer.
+ *
+ * read_imagei can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_SIGNED_INT8,
+ * CL_SIGNED_INT16 and
+ * CL_SIGNED_INT32.
+ * If the image_channel_data_type is not one of the
+ * above values, the values returned by read_imagei
+ * are undefined.
+ *
+ * read_imageui can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_UNSIGNED_INT8,
+ * CL_UNSIGNED_INT16 and
+ * CL_UNSIGNED_INT32.
+ * If the image_channel_data_type is not one of the
+ * above values, the values returned by read_imageui
+ * are undefined.
+ *
+ * The read_image{i|ui} calls support a nearest filter
+ * only. The filter_mode specified in sampler
+ * must be set to CLK_FILTER_NEAREST; otherwise
+ * the values returned are undefined.
+ 
+ * The read_image{f|i|ui} calls that take
+ * integer coordinates must use a sampler with
+ * normalized coordinates set to
+ * CLK_NORMALIZED_COORDS_FALSE and
+ * addressing mode set to
+ * CLK_ADDRESS_CLAMP_TO_EDGE,
+ * CLK_ADDRESS_CLAMP or CLK_ADDRESS_NONE;
+ * otherwise the values returned are undefined.
+ *
+ * Values returned by read_imagef for image objects
+ * with image_channel_data_type values not specified
+ * in the description above are undefined.
+ */
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, int2 coord);
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord);
+
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, int4 coord);
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord);
+
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
+
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, int coord);
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord);
+
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, int coord);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, int coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
+
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
+
+#ifdef cl_khr_depth_images
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord);
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, int2 coord);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord);
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, int4 coord);
+#endif //cl_khr_depth_images
+
+#if defined(cl_khr_gl_msaa_sharing)
+float4 __purefn __ovld read_imagef(read_only image2d_msaa_t image, int2 coord, int sample);
+int4 __purefn __ovld read_imagei(read_only image2d_msaa_t image, int2 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_only image2d_msaa_t image, int2 coord, int sample);
+
+float __purefn __ovld read_imagef(read_only image2d_msaa_depth_t image, int2 coord, int sample);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_msaa_t image, int4 coord, int sample);
+int4 __purefn __ovld read_imagei(read_only image2d_array_msaa_t image, int4 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_msaa_t image, int4 coord, int sample);
+
+float __purefn __ovld read_imagef(read_only image2d_array_msaa_depth_t image, int4 coord, int sample);
+#endif //cl_khr_gl_msaa_sharing
+
+// OpenCL Extension v2.0 s9.18 - Mipmaps
+#ifdef cl_khr_mipmap_image
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+
+#endif //cl_khr_mipmap_image
+
+/**
+* Sampler-less Image Access
+*/
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, int coord);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_buffer_t image, int coord);
+int4 __purefn __ovld read_imagei(read_only image1d_buffer_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_buffer_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image, int4 coord);
+
+#ifdef cl_khr_depth_images
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, int2 coord);
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, int4 coord);
+#endif //cl_khr_depth_images
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, int4 coord);
+
+// Image read functions returning half4 type
+#ifdef cl_khr_fp16
+half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, int coord);
+half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, float coord);
+half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, float2 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, float2 coord);
+half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, float4 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, float4 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_t image, int coord);
+half4 __purefn __ovld read_imageh(read_only image2d_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image3d_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_array_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_array_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_buffer_t image, int coord);
+#endif //cl_khr_fp16
+
+// Image read functions for read_write images
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float4 __purefn __ovld read_imagef(read_write image1d_t image, int coord);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_write image1d_buffer_t image, int coord);
+int4 __purefn __ovld read_imagei(read_write image1d_buffer_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_write image1d_buffer_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image, int4 coord);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, int4 coord);
+
+#ifdef cl_khr_depth_images
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, int2 coord);
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, int4 coord);
+#endif //cl_khr_depth_images
+
+#if cl_khr_gl_msaa_sharing
+float4 __purefn __ovld read_imagef(read_write image2d_msaa_t image, int2 coord, int sample);
+int4 __purefn __ovld read_imagei(read_write image2d_msaa_t image, int2 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_write image2d_msaa_t image, int2 coord, int sample);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_msaa_t image, int4 coord, int sample);
+int4 __purefn __ovld read_imagei(read_write image2d_array_msaa_t image, int4 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_msaa_t image, int4 coord, int sample);
+
+float __purefn __ovld read_imagef(read_write image2d_msaa_depth_t image, int2 coord, int sample);
+float __purefn __ovld read_imagef(read_write image2d_array_msaa_depth_t image, int4 coord, int sample);
+#endif //cl_khr_gl_msaa_sharing
+
+#ifdef cl_khr_mipmap_image
+float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+#endif //cl_khr_mipmap_image
+
+// Image read functions returning half4 type
+#ifdef cl_khr_fp16
+half4 __purefn __ovld read_imageh(read_write image1d_t image, int coord);
+half4 __purefn __ovld read_imageh(read_write image2d_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_write image3d_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_write image1d_array_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_write image2d_array_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_write image1d_buffer_t image, int coord);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Write color value to location specified by coordinate
+ * (coord.x, coord.y) in the 2D image object specified by image.
+ * (coord.x, coord.y) are considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1, and 0
+ * ... image height - 1.
+
+ * Write color value to location specified by coordinate
+ * (coord.x, coord.y) in the 2D image object specified by index
+ * (coord.z) of the 2D image array object image_array.
+ * (coord.x, coord.y) are considered to be unnormalized
+ * coordinates and must be in the range 0 ... image width
+ * - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord) in the 1D image (buffer) object specified by image.
+ * coord is considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord.x) in the 1D image object specified by index
+ * (coord.y) of the 1D image array object image_array.
+ * x is considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord.x, coord.y, coord.z) in the 3D image object specified by image.
+ * coord.x & coord.y are considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1, and 0
+ * ... image height - 1.
+ *
+ * For mipmap images, use mip-level specified by lod.
+ *
+ * Appropriate data format conversion to the specified
+ * image format is done before writing the color value.
+ *
+ * write_imagef can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the pre-defined packed formats or set to
+ * CL_SNORM_INT8, CL_UNORM_INT8,
+ * CL_SNORM_INT16, CL_UNORM_INT16,
+ * CL_HALF_FLOAT or CL_FLOAT. Appropriate data
+ * format conversion will be done to convert channel
+ * data from a floating-point value to actual data format
+ * in which the channels are stored.
+ *
+ * write_imagei can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_SIGNED_INT8,
+ * CL_SIGNED_INT16 and
+ * CL_SIGNED_INT32.
+ *
+ * write_imageui can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_UNSIGNED_INT8,
+ * CL_UNSIGNED_INT16 and
+ * CL_UNSIGNED_INT32.
+ *
+ * The behavior of write_imagef, write_imagei and
+ * write_imageui for image objects created with
+ * image_channel_data_type values not specified in
+ * the description above or with (x, y) coordinate
+ * values that are not in the range (0 ... image width -1,
+ * 0 ... image height - 1), respectively, is undefined.
+ */
+void __ovld write_imagef(write_only image2d_t image, int2 coord, float4 color);
+void __ovld write_imagei(write_only image2d_t image, int2 coord, int4 color);
+void __ovld write_imageui(write_only image2d_t image, int2 coord, uint4 color);
+
+void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, float4 color);
+void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int4 color);
+void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, uint4 color);
+
+void __ovld write_imagef(write_only image1d_t image, int coord, float4 color);
+void __ovld write_imagei(write_only image1d_t image, int coord, int4 color);
+void __ovld write_imageui(write_only image1d_t image, int coord, uint4 color);
+
+void __ovld write_imagef(write_only image1d_buffer_t image, int coord, float4 color);
+void __ovld write_imagei(write_only image1d_buffer_t image, int coord, int4 color);
+void __ovld write_imageui(write_only image1d_buffer_t image, int coord, uint4 color);
+
+void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, float4 color);
+void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int4 color);
+void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, uint4 color);
+
+void __ovld write_imagef(write_only image3d_t image, int4 coord, float4 color);
+void __ovld write_imagei(write_only image3d_t image, int4 coord, int4 color);
+void __ovld write_imageui(write_only image3d_t image, int4 coord, uint4 color);
+
+#ifdef cl_khr_depth_images
+void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, float color);
+void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, float color);
+#endif //cl_khr_depth_images
+
+// OpenCL Extension v2.0 s9.18 - Mipmaps
+#ifdef cl_khr_mipmap_image
+void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color);
+void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color);
+void __ovld write_imageui(write_only image1d_t image, int coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image2d_t image, int2 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image2d_t image, int2 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image2d_t image, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color);
+void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color);
+
+void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color);
+#endif //cl_khr_mipmap_image
+
+// Image write functions for half4 type
+#ifdef cl_khr_fp16
+void __ovld write_imageh(write_only image1d_t image, int coord, half4 color);
+void __ovld write_imageh(write_only image2d_t image, int2 coord, half4 color);
+void __ovld write_imageh(write_only image3d_t image, int4 coord, half4 color);
+void __ovld write_imageh(write_only image1d_array_t image, int2 coord, half4 color);
+void __ovld write_imageh(write_only image2d_array_t image, int4 coord, half4 color);
+void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 color);
+#endif //cl_khr_fp16
+
+// Image write functions for read_write images
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld write_imagef(read_write image2d_t image, int2 coord, float4 color);
+void __ovld write_imagei(read_write image2d_t image, int2 coord, int4 color);
+void __ovld write_imageui(read_write image2d_t image, int2 coord, uint4 color);
+
+void __ovld write_imagef(read_write image2d_array_t image_array, int4 coord, float4 color);
+void __ovld write_imagei(read_write image2d_array_t image_array, int4 coord, int4 color);
+void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, uint4 color);
+
+void __ovld write_imagef(read_write image1d_t image, int coord, float4 color);
+void __ovld write_imagei(read_write image1d_t image, int coord, int4 color);
+void __ovld write_imageui(read_write image1d_t image, int coord, uint4 color);
+
+void __ovld write_imagef(read_write image1d_buffer_t image, int coord, float4 color);
+void __ovld write_imagei(read_write image1d_buffer_t image, int coord, int4 color);
+void __ovld write_imageui(read_write image1d_buffer_t image, int coord, uint4 color);
+
+void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, float4 color);
+void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int4 color);
+void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, uint4 color);
+
+void __ovld write_imagef(read_write image3d_t image, int4 coord, float4 color);
+void __ovld write_imagei(read_write image3d_t image, int4 coord, int4 color);
+void __ovld write_imageui(read_write image3d_t image, int4 coord, uint4 color);
+
+#ifdef cl_khr_depth_images
+void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, float color);
+void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, float color);
+#endif //cl_khr_depth_images
+
+#ifdef cl_khr_mipmap_image
+void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color);
+void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color);
+void __ovld write_imageui(read_write image1d_t image, int coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image2d_t image, int2 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image2d_t image, int2 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image2d_t image, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image2d_array_t image_array, int4 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image2d_array_t image_array, int4 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, int lod, float color);
+void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int lod, float color);
+
+void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color);
+#endif //cl_khr_mipmap_image
+
+// Image write functions for half4 type
+#ifdef cl_khr_fp16
+void __ovld write_imageh(read_write image1d_t image, int coord, half4 color);
+void __ovld write_imageh(read_write image2d_t image, int2 coord, half4 color);
+void __ovld write_imageh(read_write image3d_t image, int4 coord, half4 color);
+void __ovld write_imageh(read_write image1d_array_t image, int2 coord, half4 color);
+void __ovld write_imageh(read_write image2d_array_t image, int4 coord, half4 color);
+void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 color);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// Note: In OpenCL v1.0/1.1/1.2, image argument of image query builtin functions does not have
+// access qualifier, which by default assume read_only access qualifier. Image query builtin
+// functions with write_only image argument should also be declared.
+
+/**
+ * Return the image width in pixels.
+ *
+  */
+int __ovld __cnfn get_image_width(read_only image1d_t image);
+int __ovld __cnfn get_image_width(read_only image1d_buffer_t image);
+int __ovld __cnfn get_image_width(read_only image2d_t image);
+int __ovld __cnfn get_image_width(read_only image3d_t image);
+int __ovld __cnfn get_image_width(read_only image1d_array_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_width(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_width(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_width(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_width(write_only image1d_t image);
+int __ovld __cnfn get_image_width(write_only image1d_buffer_t image);
+int __ovld __cnfn get_image_width(write_only image2d_t image);
+int __ovld __cnfn get_image_width(write_only image3d_t image);
+int __ovld __cnfn get_image_width(write_only image1d_array_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_width(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_width(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_width(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_width(read_write image1d_t image);
+int __ovld __cnfn get_image_width(read_write image1d_buffer_t image);
+int __ovld __cnfn get_image_width(read_write image2d_t image);
+int __ovld __cnfn get_image_width(read_write image3d_t image);
+int __ovld __cnfn get_image_width(read_write image1d_array_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_width(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_width(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_width(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the image height in pixels.
+ */
+int __ovld __cnfn get_image_height(read_only image2d_t image);
+int __ovld __cnfn get_image_height(read_only image3d_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_height(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_height(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_height(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_height(write_only image2d_t image);
+int __ovld __cnfn get_image_height(write_only image3d_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_height(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_height(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_height(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_height(read_write image2d_t image);
+int __ovld __cnfn get_image_height(read_write image3d_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_height(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_height(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_height(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the image depth in pixels.
+ */
+int __ovld __cnfn get_image_depth(read_only image3d_t image);
+
+int __ovld __cnfn get_image_depth(write_only image3d_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_depth(read_write image3d_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL Extension v2.0 s9.18 - Mipmaps
+#ifdef cl_khr_mipmap_image
+/**
+ * Return the image miplevels.
+ */
+
+int __ovld get_image_num_mip_levels(read_only image1d_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_t image);
+int __ovld get_image_num_mip_levels(read_only image3d_t image);
+
+int __ovld get_image_num_mip_levels(write_only image1d_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_t image);
+int __ovld get_image_num_mip_levels(write_only image3d_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld get_image_num_mip_levels(read_write image1d_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_t image);
+int __ovld get_image_num_mip_levels(read_write image3d_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+int __ovld get_image_num_mip_levels(read_only image1d_array_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_array_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_array_depth_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_depth_t image);
+
+int __ovld get_image_num_mip_levels(write_only image1d_array_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_array_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_depth_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld get_image_num_mip_levels(read_write image1d_array_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_array_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_depth_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#endif //cl_khr_mipmap_image
+
+/**
+ * Return the channel data type. Valid values are:
+ * CLK_SNORM_INT8
+ * CLK_SNORM_INT16
+ * CLK_UNORM_INT8
+ * CLK_UNORM_INT16
+ * CLK_UNORM_SHORT_565
+ * CLK_UNORM_SHORT_555
+ * CLK_UNORM_SHORT_101010
+ * CLK_SIGNED_INT8
+ * CLK_SIGNED_INT16
+ * CLK_SIGNED_INT32
+ * CLK_UNSIGNED_INT8
+ * CLK_UNSIGNED_INT16
+ * CLK_UNSIGNED_INT32
+ * CLK_HALF_FLOAT
+ * CLK_FLOAT
+ */
+
+//
+// Channel Datatype.
+//
+#define CLK_SNORM_INT8        0x10D0
+#define CLK_SNORM_INT16       0x10D1
+#define CLK_UNORM_INT8        0x10D2
+#define CLK_UNORM_INT16       0x10D3
+#define CLK_UNORM_SHORT_565   0x10D4
+#define CLK_UNORM_SHORT_555   0x10D5
+#define CLK_UNORM_INT_101010  0x10D6
+#define CLK_SIGNED_INT8       0x10D7
+#define CLK_SIGNED_INT16      0x10D8
+#define CLK_SIGNED_INT32      0x10D9
+#define CLK_UNSIGNED_INT8     0x10DA
+#define CLK_UNSIGNED_INT16    0x10DB
+#define CLK_UNSIGNED_INT32    0x10DC
+#define CLK_HALF_FLOAT        0x10DD
+#define CLK_FLOAT             0x10DE
+#define CLK_UNORM_INT24       0x10DF
+
+int __ovld __cnfn get_image_channel_data_type(read_only image1d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image3d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_channel_data_type(write_only image1d_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image3d_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_channel_data_type(read_write image1d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image3d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image1d_array_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the image channel order. Valid values are:
+ * CLK_A
+ * CLK_R
+ * CLK_Rx
+ * CLK_RG
+ * CLK_RGx
+ * CLK_RA
+ * CLK_RGB
+ * CLK_RGBx
+ * CLK_RGBA
+ * CLK_ARGB
+ * CLK_BGRA
+ * CLK_INTENSITY
+ * CLK_LUMINANCE
+ */
+// Channel order, numbering must be aligned with cl_channel_order in cl.h
+//
+#define CLK_R         0x10B0
+#define CLK_A         0x10B1
+#define CLK_RG        0x10B2
+#define CLK_RA        0x10B3
+#define CLK_RGB       0x10B4
+#define CLK_RGBA      0x10B5
+#define CLK_BGRA      0x10B6
+#define CLK_ARGB      0x10B7
+#define CLK_INTENSITY 0x10B8
+#define CLK_LUMINANCE 0x10B9
+#define CLK_Rx                0x10BA
+#define CLK_RGx               0x10BB
+#define CLK_RGBx              0x10BC
+#define CLK_DEPTH             0x10BD
+#define CLK_DEPTH_STENCIL     0x10BE
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#define CLK_sRGB              0x10BF
+#define CLK_sRGBA             0x10C1
+#define CLK_sRGBx             0x10C0
+#define CLK_sBGRA             0x10C2
+#define CLK_ABGR              0x10C3
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+int __ovld __cnfn get_image_channel_order(read_only image1d_t image);
+int __ovld __cnfn get_image_channel_order(read_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_t image);
+int __ovld __cnfn get_image_channel_order(read_only image3d_t image);
+int __ovld __cnfn get_image_channel_order(read_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_order(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_order(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_channel_order(write_only image1d_t image);
+int __ovld __cnfn get_image_channel_order(write_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_t image);
+int __ovld __cnfn get_image_channel_order(write_only image3d_t image);
+int __ovld __cnfn get_image_channel_order(write_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_order(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_order(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_channel_order(read_write image1d_t image);
+int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_t image);
+int __ovld __cnfn get_image_channel_order(read_write image3d_t image);
+int __ovld __cnfn get_image_channel_order(read_write image1d_array_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_order(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the 2D image width and height as an int2
+ * type. The width is returned in the x component, and
+ * the height in the y component.
+ */
+int2 __ovld __cnfn get_image_dim(read_only image2d_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int2 __ovld __cnfn get_image_dim(read_only image2d_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_msaa_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int2 __ovld __cnfn get_image_dim(write_only image2d_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_depth_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int2 __ovld __cnfn get_image_dim(write_only image2d_msaa_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_msaa_depth_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int2 __ovld __cnfn get_image_dim(read_write image2d_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the 3D image width, height, and depth as an
+ * int4 type. The width is returned in the x
+ * component, height in the y component, depth in the z
+ * component and the w component is 0.
+ */
+int4 __ovld __cnfn get_image_dim(read_only image3d_t image);
+int4 __ovld __cnfn get_image_dim(write_only image3d_t image);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int4 __ovld __cnfn get_image_dim(read_write image3d_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the image array size.
+ */
+
+size_t __ovld __cnfn get_image_array_size(read_only image1d_array_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_t image_array);
+#ifdef cl_khr_depth_images
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_depth_t image_array);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_msaa_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_msaa_depth_t image_array);
+#endif //cl_khr_gl_msaa_sharing
+
+size_t __ovld __cnfn get_image_array_size(write_only image1d_array_t image_array);
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_t image_array);
+#ifdef cl_khr_depth_images
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_depth_t image_array);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_t image_array);
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_depth_t image_array);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+size_t __ovld __cnfn get_image_array_size(read_write image1d_array_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_t image_array);
+#ifdef cl_khr_depth_images
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_depth_t image_array);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_depth_t image_array);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+* Return the number of samples associated with image
+*/
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld get_image_num_samples(read_only image2d_msaa_t image);
+int __ovld get_image_num_samples(read_only image2d_msaa_depth_t image);
+int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image);
+int __ovld get_image_num_samples(read_only image2d_array_msaa_t image);
+int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image);
+
+int __ovld get_image_num_samples(write_only image2d_msaa_t image);
+int __ovld get_image_num_samples(write_only image2d_msaa_depth_t image);
+int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image);
+int __ovld get_image_num_samples(write_only image2d_array_msaa_t image);
+int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld get_image_num_samples(read_write image2d_msaa_t image);
+int __ovld get_image_num_samples(read_write image2d_msaa_depth_t image);
+int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
+int __ovld get_image_num_samples(read_write image2d_array_msaa_t image);
+int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#endif
+
+// OpenCL v2.0 s6.13.15 - Work-group Functions
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __conv work_group_all(int predicate);
+int __ovld __conv work_group_any(int predicate);
+
+#ifdef cl_khr_fp16
+half __ovld __conv work_group_broadcast(half a, size_t local_id);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y, size_t z);
+#endif
+int __ovld __conv work_group_broadcast(int a, size_t local_id);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y, size_t z);
+uint __ovld __conv work_group_broadcast(uint a, size_t local_id);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y, size_t z);
+long __ovld __conv work_group_broadcast(long a, size_t local_id);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y, size_t z);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t local_id);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
+float __ovld __conv work_group_broadcast(float a, size_t local_id);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y, size_t z);
+#ifdef cl_khr_fp64
+double __ovld __conv work_group_broadcast(double a, size_t local_id);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y, size_t z);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half __ovld __conv work_group_reduce_add(half x);
+half __ovld __conv work_group_reduce_min(half x);
+half __ovld __conv work_group_reduce_max(half x);
+half __ovld __conv work_group_scan_exclusive_add(half x);
+half __ovld __conv work_group_scan_exclusive_min(half x);
+half __ovld __conv work_group_scan_exclusive_max(half x);
+half __ovld __conv work_group_scan_inclusive_add(half x);
+half __ovld __conv work_group_scan_inclusive_min(half x);
+half __ovld __conv work_group_scan_inclusive_max(half x);
+#endif
+int __ovld __conv work_group_reduce_add(int x);
+int __ovld __conv work_group_reduce_min(int x);
+int __ovld __conv work_group_reduce_max(int x);
+int __ovld __conv work_group_scan_exclusive_add(int x);
+int __ovld __conv work_group_scan_exclusive_min(int x);
+int __ovld __conv work_group_scan_exclusive_max(int x);
+int __ovld __conv work_group_scan_inclusive_add(int x);
+int __ovld __conv work_group_scan_inclusive_min(int x);
+int __ovld __conv work_group_scan_inclusive_max(int x);
+uint __ovld __conv work_group_reduce_add(uint x);
+uint __ovld __conv work_group_reduce_min(uint x);
+uint __ovld __conv work_group_reduce_max(uint x);
+uint __ovld __conv work_group_scan_exclusive_add(uint x);
+uint __ovld __conv work_group_scan_exclusive_min(uint x);
+uint __ovld __conv work_group_scan_exclusive_max(uint x);
+uint __ovld __conv work_group_scan_inclusive_add(uint x);
+uint __ovld __conv work_group_scan_inclusive_min(uint x);
+uint __ovld __conv work_group_scan_inclusive_max(uint x);
+long __ovld __conv work_group_reduce_add(long x);
+long __ovld __conv work_group_reduce_min(long x);
+long __ovld __conv work_group_reduce_max(long x);
+long __ovld __conv work_group_scan_exclusive_add(long x);
+long __ovld __conv work_group_scan_exclusive_min(long x);
+long __ovld __conv work_group_scan_exclusive_max(long x);
+long __ovld __conv work_group_scan_inclusive_add(long x);
+long __ovld __conv work_group_scan_inclusive_min(long x);
+long __ovld __conv work_group_scan_inclusive_max(long x);
+ulong __ovld __conv work_group_reduce_add(ulong x);
+ulong __ovld __conv work_group_reduce_min(ulong x);
+ulong __ovld __conv work_group_reduce_max(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_max(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_max(ulong x);
+float __ovld __conv work_group_reduce_add(float x);
+float __ovld __conv work_group_reduce_min(float x);
+float __ovld __conv work_group_reduce_max(float x);
+float __ovld __conv work_group_scan_exclusive_add(float x);
+float __ovld __conv work_group_scan_exclusive_min(float x);
+float __ovld __conv work_group_scan_exclusive_max(float x);
+float __ovld __conv work_group_scan_inclusive_add(float x);
+float __ovld __conv work_group_scan_inclusive_min(float x);
+float __ovld __conv work_group_scan_inclusive_max(float x);
+#ifdef cl_khr_fp64
+double __ovld __conv work_group_reduce_add(double x);
+double __ovld __conv work_group_reduce_min(double x);
+double __ovld __conv work_group_reduce_max(double x);
+double __ovld __conv work_group_scan_exclusive_add(double x);
+double __ovld __conv work_group_scan_exclusive_min(double x);
+double __ovld __conv work_group_scan_exclusive_max(double x);
+double __ovld __conv work_group_scan_inclusive_add(double x);
+double __ovld __conv work_group_scan_inclusive_min(double x);
+double __ovld __conv work_group_scan_inclusive_max(double x);
+#endif //cl_khr_fp64
+
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v2.0 s6.13.16 - Pipe Functions
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#define PIPE_RESERVE_ID_VALID_BIT (1U << 30)
+#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t))
+bool __ovld is_valid_reserve_id(reserve_id_t reserve_id);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+
+// OpenCL v2.0 s6.13.17 - Enqueue Kernels
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#define CL_COMPLETE                                 0x0
+#define CL_RUNNING                                  0x1
+#define CL_SUBMITTED                                0x2
+#define CL_QUEUED                                   0x3
+
+#define CLK_SUCCESS                                 0
+#define CLK_ENQUEUE_FAILURE                         -101
+#define CLK_INVALID_QUEUE                           -102
+#define CLK_INVALID_NDRANGE                         -160
+#define CLK_INVALID_EVENT_WAIT_LIST                 -57
+#define CLK_DEVICE_QUEUE_FULL                       -161
+#define CLK_INVALID_ARG_SIZE                        -51
+#define CLK_EVENT_ALLOCATION_FAILURE                -100
+#define CLK_OUT_OF_RESOURCES                        -5
+
+#define CLK_NULL_QUEUE                              0
+#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t))
+
+// execution model related definitions
+#define CLK_ENQUEUE_FLAGS_NO_WAIT                   0x0
+#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL               0x1
+#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP           0x2
+
+typedef int kernel_enqueue_flags_t;
+typedef int clk_profiling_info;
+
+// Profiling info name (see capture_event_profiling_info)
+#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1
+
+#define MAX_WORK_DIM        3
+
+// ToDo: Remove definition of ndrange_t in Clang as an opaque type and add back
+// the following ndrange_t definition.
+#if 0
+typedef struct {
+    unsigned int workDimension;
+    size_t globalWorkOffset[MAX_WORK_DIM];
+    size_t globalWorkSize[MAX_WORK_DIM];
+    size_t localWorkSize[MAX_WORK_DIM];
+} ndrange_t;
+#endif
+
+ndrange_t __ovld ndrange_1D(size_t);
+ndrange_t __ovld ndrange_1D(size_t, size_t);
+ndrange_t __ovld ndrange_1D(size_t, size_t, size_t);
+
+ndrange_t __ovld ndrange_2D(const size_t[2]);
+ndrange_t __ovld ndrange_2D(const size_t[2], const size_t[2]);
+ndrange_t __ovld ndrange_2D(const size_t[2], const size_t[2], const size_t[2]);
+
+ndrange_t __ovld ndrange_3D(const size_t[3]);
+ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3]);
+ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3], const size_t[3]);
+
+int __ovld enqueue_marker(queue_t, uint, const __private clk_event_t*, __private clk_event_t*);
+
+void __ovld retain_event(clk_event_t);
+
+void __ovld release_event(clk_event_t);
+
+clk_event_t __ovld create_user_event(void);
+
+void __ovld set_user_event_status(clk_event_t e, int state);
+
+bool __ovld is_valid_event (clk_event_t event);
+
+void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value);
+
+queue_t __ovld get_default_queue(void);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL Extension v2.0 s9.17 - Sub-groups
+
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+// Shared Sub Group Functions
+uint    __ovld get_sub_group_size(void);
+uint    __ovld get_max_sub_group_size(void);
+uint    __ovld get_num_sub_groups(void);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+uint    __ovld get_enqueued_num_sub_groups(void);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+uint    __ovld get_sub_group_id(void);
+uint    __ovld get_sub_group_local_id(void);
+
+void    __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void    __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+int     __ovld __conv sub_group_all(int predicate);
+int     __ovld __conv sub_group_any(int predicate);
+
+int     __ovld __conv sub_group_broadcast(int   x, uint sub_group_local_id);
+uint    __ovld __conv sub_group_broadcast(uint  x, uint sub_group_local_id);
+long    __ovld __conv sub_group_broadcast(long  x, uint sub_group_local_id);
+ulong   __ovld __conv sub_group_broadcast(ulong x, uint sub_group_local_id);
+float   __ovld __conv sub_group_broadcast(float x, uint sub_group_local_id);
+
+int     __ovld __conv sub_group_reduce_add(int   x);
+uint    __ovld __conv sub_group_reduce_add(uint  x);
+long    __ovld __conv sub_group_reduce_add(long  x);
+ulong   __ovld __conv sub_group_reduce_add(ulong x);
+float   __ovld __conv sub_group_reduce_add(float x);
+int     __ovld __conv sub_group_reduce_min(int   x);
+uint    __ovld __conv sub_group_reduce_min(uint  x);
+long    __ovld __conv sub_group_reduce_min(long  x);
+ulong   __ovld __conv sub_group_reduce_min(ulong x);
+float   __ovld __conv sub_group_reduce_min(float x);
+int     __ovld __conv sub_group_reduce_max(int   x);
+uint    __ovld __conv sub_group_reduce_max(uint  x);
+long    __ovld __conv sub_group_reduce_max(long  x);
+ulong   __ovld __conv sub_group_reduce_max(ulong x);
+float   __ovld __conv sub_group_reduce_max(float x);
+
+int     __ovld __conv sub_group_scan_exclusive_add(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_add(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_add(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_add(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_add(float x);
+int     __ovld __conv sub_group_scan_exclusive_min(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_min(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_min(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_min(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_min(float x);
+int     __ovld __conv sub_group_scan_exclusive_max(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_max(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_max(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_max(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_max(float x);
+
+int     __ovld __conv sub_group_scan_inclusive_add(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_add(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_add(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_add(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_add(float x);
+int     __ovld __conv sub_group_scan_inclusive_min(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_min(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_min(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_min(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_min(float x);
+int     __ovld __conv sub_group_scan_inclusive_max(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_max(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_max(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_max(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_max(float x);
+
+#ifdef cl_khr_fp16
+half    __ovld __conv sub_group_broadcast(half x, uint sub_group_local_id);
+half    __ovld __conv sub_group_reduce_add(half x);
+half    __ovld __conv sub_group_reduce_min(half x);
+half    __ovld __conv sub_group_reduce_max(half x);
+half    __ovld __conv sub_group_scan_exclusive_add(half x);
+half    __ovld __conv sub_group_scan_exclusive_min(half x);
+half    __ovld __conv sub_group_scan_exclusive_max(half x);
+half    __ovld __conv sub_group_scan_inclusive_add(half x);
+half    __ovld __conv sub_group_scan_inclusive_min(half x);
+half    __ovld __conv sub_group_scan_inclusive_max(half x);
+#endif //cl_khr_fp16
+
+#ifdef cl_khr_fp64
+double  __ovld __conv sub_group_broadcast(double x, uint sub_group_local_id);
+double  __ovld __conv sub_group_reduce_add(double x);
+double  __ovld __conv sub_group_reduce_min(double x);
+double  __ovld __conv sub_group_reduce_max(double x);
+double  __ovld __conv sub_group_scan_exclusive_add(double x);
+double  __ovld __conv sub_group_scan_exclusive_min(double x);
+double  __ovld __conv sub_group_scan_exclusive_max(double x);
+double  __ovld __conv sub_group_scan_inclusive_add(double x);
+double  __ovld __conv sub_group_scan_inclusive_min(double x);
+double  __ovld __conv sub_group_scan_inclusive_max(double x);
+#endif //cl_khr_fp64
+
+#endif //cl_khr_subgroups cl_intel_subgroups
+
+#ifdef cl_amd_media_ops
+uint __ovld amd_bitalign(uint a, uint b, uint c);
+uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bitalign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bitalign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bitalign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bitalign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_bytealign(uint a, uint b, uint c);
+uint2 __ovld amd_bytealign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bytealign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bytealign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bytealign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bytealign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_lerp(uint a, uint b, uint c);
+uint2 __ovld amd_lerp(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_lerp(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_lerp(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_lerp(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_lerp(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_pack(float4 v);
+
+uint __ovld amd_sad4(uint4 x, uint4 y, uint z);
+
+uint __ovld amd_sadhi(uint a, uint b, uint c);
+uint2 __ovld amd_sadhi(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sadhi(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sadhi(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sadhi(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sadhi(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_sad(uint a, uint b, uint c);
+uint2 __ovld amd_sad(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sad(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sad(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sad(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sad(uint16 a, uint16 b, uint16 c);
+
+float __ovld amd_unpack0(uint a);
+float2 __ovld amd_unpack0(uint2 a);
+float3 __ovld amd_unpack0(uint3 a);
+float4 __ovld amd_unpack0(uint4 a);
+float8 __ovld amd_unpack0(uint8 a);
+float16 __ovld amd_unpack0(uint16 a);
+
+float __ovld amd_unpack1(uint a);
+float2 __ovld amd_unpack1(uint2 a);
+float3 __ovld amd_unpack1(uint3 a);
+float4 __ovld amd_unpack1(uint4 a);
+float8 __ovld amd_unpack1(uint8 a);
+float16 __ovld amd_unpack1(uint16 a);
+
+float __ovld amd_unpack2(uint a);
+float2 __ovld amd_unpack2(uint2 a);
+float3 __ovld amd_unpack2(uint3 a);
+float4 __ovld amd_unpack2(uint4 a);
+float8 __ovld amd_unpack2(uint8 a);
+float16 __ovld amd_unpack2(uint16 a);
+
+float __ovld amd_unpack3(uint a);
+float2 __ovld amd_unpack3(uint2 a);
+float3 __ovld amd_unpack3(uint3 a);
+float4 __ovld amd_unpack3(uint4 a);
+float8 __ovld amd_unpack3(uint8 a);
+float16 __ovld amd_unpack3(uint16 a);
+#endif // cl_amd_media_ops
+
+#ifdef cl_amd_media_ops2
+int __ovld amd_bfe(int src0, uint src1, uint src2);
+int2 __ovld amd_bfe(int2 src0, uint2 src1, uint2 src2);
+int3 __ovld amd_bfe(int3 src0, uint3 src1, uint3 src2);
+int4 __ovld amd_bfe(int4 src0, uint4 src1, uint4 src2);
+int8 __ovld amd_bfe(int8 src0, uint8 src1, uint8 src2);
+int16 __ovld amd_bfe(int16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfe(uint src0, uint src1, uint src2);
+uint2 __ovld amd_bfe(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_bfe(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_bfe(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_bfe(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_bfe(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfm(uint src0, uint src1);
+uint2 __ovld amd_bfm(uint2 src0, uint2 src1);
+uint3 __ovld amd_bfm(uint3 src0, uint3 src1);
+uint4 __ovld amd_bfm(uint4 src0, uint4 src1);
+uint8 __ovld amd_bfm(uint8 src0, uint8 src1);
+uint16 __ovld amd_bfm(uint16 src0, uint16 src1);
+
+float __ovld amd_max3(float src0, float src1, float src2);
+float2 __ovld amd_max3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_max3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_max3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_max3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_max3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_max3(int src0, int src1, int src2);
+int2 __ovld amd_max3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_max3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_max3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_max3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_max3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_max3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_max3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_max3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_max3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_max3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_max3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_median3(float src0, float src1, float src2);
+float2 __ovld amd_median3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_median3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_median3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_median3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_median3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_median3(int src0, int src1, int src2);
+int2 __ovld amd_median3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_median3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_median3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_median3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_median3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_median3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_median3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_median3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_median3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_median3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_median3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_min3(float src0, float src1, float src);
+float2 __ovld amd_min3(float2 src0, float2 src1, float2 src);
+float3 __ovld amd_min3(float3 src0, float3 src1, float3 src);
+float4 __ovld amd_min3(float4 src0, float4 src1, float4 src);
+float8 __ovld amd_min3(float8 src0, float8 src1, float8 src);
+float16 __ovld amd_min3(float16 src0, float16 src1, float16 src);
+
+int __ovld amd_min3(int src0, int src1, int src2);
+int2 __ovld amd_min3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_min3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_min3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_min3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_min3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_min3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_min3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_min3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_min3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_min3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_min3(uint16 src0, uint16 src1, uint16 src2);
+
+ulong __ovld amd_mqsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_mqsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_mqsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_mqsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_mqsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_mqsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+ulong __ovld amd_qsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_qsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_qsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_qsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_qsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_qsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+uint __ovld amd_msad(uint src0, uint src1, uint src2);
+uint2 __ovld amd_msad(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_msad(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_msad(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_msad(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_msad(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadd(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadd(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadd(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadd(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadd(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadd(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadw(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadw(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadw(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadw(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2);
+#endif // cl_amd_media_ops2
+
+// Disable any extensions we may have enabled previously.
+#pragma OPENCL EXTENSION all : disable
+
+#undef __cnfn
+#undef __ovld
+#endif //_OPENCL_H_
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/pkuintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/pkuintrin.h
new file mode 100644
index 000000000..9e5459450
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/pkuintrin.h
@@ -0,0 +1,48 @@
+/*===------------- pkuintrin.h - PKU intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __PKUINTRIN_H
+#define __PKUINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rdpkru_u32(void)
+{
+  return __builtin_ia32_rdpkru();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_wrpkru(unsigned int __val)
+{
+  return __builtin_ia32_wrpkru(__val);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/pmmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/pmmintrin.h
new file mode 100644
index 000000000..d4f6487af
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/pmmintrin.h
@@ -0,0 +1,310 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+
+/// \brief Loads data from an unaligned memory location to elements in a 128-bit
+///    vector. If the address of the data is not 16-byte aligned, the
+///    instruction may read two adjacent aligned blocks of memory to retrieve
+///    the requested data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit integer vector containing integer values.
+/// \returns A 128-bit vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lddqu_si128(__m128i const *__p)
+{
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the right source operand.
+/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
+///    differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the lower
+///    bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the upper
+///    bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal
+///    differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
+///    vector of [4 x float] to float values stored in a 128-bit vector of
+///    [4 x float]. 
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehdup_ps(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
+}
+
+/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
+///    [4 x float] to float values stored in a 128-bit vector of [4 x float]. 
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] \n
+///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_moveldup_ps(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the alternating sums
+///    and differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Horizontally adds the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal
+///    differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Moves and duplicates one double-precision value to double-precision
+///    values stored in a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_loaddup_pd(double const * dp);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param dp
+///    A pointer to a double-precision value to be moved and duplicated.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
+#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
+
+/// \brief Moves and duplicates the double-precision value in the lower bits of
+///    a 128-bit vector of [2 x double] to double-precision values stored in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
+///    [127:64] and [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_movedup_pd(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+}
+
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+/// \brief Establishes a linear address memory range to be monitored and puts
+///    the processor in the monitor event pending state. Data stored in the
+///    monitored address range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
+///
+/// \param __p
+///    The memory range to be monitored. The size of the range is determined by
+///    CPUID function 0000_0005h.
+/// \param __extensions
+///    Optional extensions for the monitoring state.
+/// \param __hints
+///    Optional hints for the monitoring state.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+}
+
+/// \brief Used with the MONITOR instruction to wait while the processor is in
+///    the monitor event pending state. Data stored in the monitored address
+///    range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
+///
+/// \param __extensions
+///    Optional extensions for the monitoring state, which may vary by
+///    processor.
+/// \param __hints
+///    Optional hints for the monitoring state, which may vary by processor.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PMMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/popcntintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/popcntintrin.h
new file mode 100644
index 000000000..0b4793e58
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/popcntintrin.h
@@ -0,0 +1,98 @@
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _POPCNTINTRIN_H
+#define _POPCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    An unsigned 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_popcnt_u32(unsigned int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    A signed 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ int __DEFAULT_FN_ATTRS
+_popcnt32(int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    An unsigned 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_popcnt_u64(unsigned long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    A signed 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_popcnt64(long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _POPCNTINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/prfchwintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/prfchwintrin.h
new file mode 100644
index 000000000..ba0285751
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/prfchwintrin.h
@@ -0,0 +1,45 @@
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+#if defined(__PRFCHW__) || defined(__3dNOW__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetch(void *__P)
+{
+  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+#endif
+
+#endif /* __PRFCHWINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/rdseedintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/rdseedintrin.h
new file mode 100644
index 000000000..421f4ea48
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/rdseedintrin.h
@@ -0,0 +1,56 @@
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RDSEEDINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/rtmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/rtmintrin.h
new file mode 100644
index 000000000..e6a58d743
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/rtmintrin.h
@@ -0,0 +1,59 @@
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED   (~0u)
+#define _XABORT_EXPLICIT  (1 << 0)
+#define _XABORT_RETRY     (1 << 1)
+#define _XABORT_CONFLICT  (1 << 2)
+#define _XABORT_CAPACITY  (1 << 3)
+#define _XABORT_DEBUG     (1 << 4)
+#define _XABORT_NESTED    (1 << 5)
+#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_xbegin(void)
+{
+  return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xend(void)
+{
+  __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RTMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/s390intrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/s390intrin.h
new file mode 100644
index 000000000..d51274c07
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/s390intrin.h
@@ -0,0 +1,39 @@
+/*===---- s390intrin.h - SystemZ intrinsics --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __S390INTRIN_H
+#define __S390INTRIN_H
+
+#ifndef __s390__
+#error "<s390intrin.h> is for s390 only"
+#endif
+
+#ifdef __HTM__
+#include <htmintrin.h>
+#endif
+
+#ifdef __VEC__
+#include <vecintrin.h>
+#endif
+
+#endif /* __S390INTRIN_H*/
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/shaintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/shaintrin.h
new file mode 100644
index 000000000..9b5d21800
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/shaintrin.h
@@ -0,0 +1,75 @@
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
+  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __SHAINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/smmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/smmintrin.h
new file mode 100644
index 000000000..e48ab034f
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/smmintrin.h
@@ -0,0 +1,507 @@
+/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _SMMINTRIN_H
+#define _SMMINTRIN_H
+
+#include <tmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
+
+/* SSE4 Rounding macros. */
+#define _MM_FROUND_TO_NEAREST_INT    0x00
+#define _MM_FROUND_TO_NEG_INF        0x01
+#define _MM_FROUND_TO_POS_INF        0x02
+#define _MM_FROUND_TO_ZERO           0x03
+#define _MM_FROUND_CUR_DIRECTION     0x04
+
+#define _MM_FROUND_RAISE_EXC         0x00
+#define _MM_FROUND_NO_EXC            0x08
+
+#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
+#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
+#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
+#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
+#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
+
+#define _mm_round_ps(X, M) __extension__ ({ \
+  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
+
+#define _mm_round_ss(X, Y, M) __extension__ ({ \
+  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
+                                 (__v4sf)(__m128)(Y), (M)); })
+
+#define _mm_round_pd(X, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
+
+#define _mm_round_sd(X, Y, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
+                                  (__v2df)(__m128d)(Y), (M)); })
+
+/* SSE4 Packed Blending Intrinsics.  */
+#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
+                                   (__v2df)(__m128d)(V2), \
+                                   (((M) & 0x01) ? 2 : 0), \
+                                   (((M) & 0x02) ? 3 : 1)); })
+
+#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
+                                  (((M) & 0x01) ? 4 : 0), \
+                                  (((M) & 0x02) ? 5 : 1), \
+                                  (((M) & 0x04) ? 6 : 2), \
+                                  (((M) & 0x08) ? 7 : 3)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
+                                            (__v2df)__M);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
+                                           (__v4sf)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
+                                               (__v16qi)__M);
+}
+
+#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
+                                   (__v8hi)(__m128i)(V2), \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+/* SSE4 Dword Multiply Instructions.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Floating Point Dot Product Instructions.  */
+#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
+                               (__v4sf)(__m128)(Y), (M)); })
+
+#define _mm_dp_pd(X, Y, M) __extension__ ({\
+  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
+                                (__v2df)(__m128d)(Y), (M)); })
+
+/* SSE4 Streaming Load Hint Instruction.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_stream_load_si128 (__m128i const *__V)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
+}
+
+/* SSE4 Packed Integer Min/Max Instructions.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
+}
+
+/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+#define _mm_extract_ps(X, N) (__extension__                      \
+                              ({ union { int __i; float __f; } __t;  \
+                                 __v4sf __a = (__v4sf)(__m128)(X);       \
+                                 __t.__f = __a[(N) & 3];                 \
+                                 __t.__i;}))
+
+/* Miscellaneous insert and extract macros.  */
+/* Extract a single-precision float from X at index N into D.  */
+#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
+                                                    (D) = __a[N]; }))
+
+/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
+   an index suitable for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
+
+/* Extract a float from X at index N into the first index of the return.  */
+#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
+                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
+
+/* Insert int into packed integer array at index.  */
+#define _mm_insert_epi8(X, I, N) (__extension__                           \
+                                  ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                     __a[(N) & 15] = (I);                 \
+                                     (__m128i)__a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__                         \
+                                   ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                      __a[(N) & 3] = (I);                \
+                                      (__m128i)__a;}))
+#ifdef __x86_64__
+#define _mm_insert_epi64(X, I, N) (__extension__                         \
+                                   ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                      __a[(N) & 1] = (I);                \
+                                      (__m128i)__a;}))
+#endif /* __x86_64__ */
+
+/* Extract int from packed integer array at index.  This returns the element
+ * as a zero extended value, so it is unsigned.
+ */
+#define _mm_extract_epi8(X, N) (__extension__                           \
+                                ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                   (int)(unsigned char) __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__                         \
+                                 ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                    (int)__a[(N) & 3];}))
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) (__extension__                         \
+                                 ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                    (long long)__a[(N) & 1];}))
+#endif /* __x86_64 */
+
+/* SSE4 128-bit Packed Integer Comparisons.  */
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testz_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testnzc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
+}
+
+#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+/* SSE4 64-bit Packed Integer Comparisons.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
+}
+
+/* SSE4 Packed Integer Sign-Extension.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi16(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi32(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi64(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi32(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
+}
+
+/* SSE4 Packed Integer Zero-Extension.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi16(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi32(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi32(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu32_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
+}
+
+/* SSE4 Pack with Unsigned Saturation.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi32(__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
+                                      (__v16qi)(__m128i)(Y), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_minpos_epu16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+}
+
+/* Handle the sse4.2 definitions here. */
+
+/* These definitions are normally in nmmintrin.h, but gcc puts them in here
+   so we'll do the same.  */
+
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+
+/* These specify the type of data that we're comparing.  */
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_UWORD_OPS                 0x01
+#define _SIDD_SBYTE_OPS                 0x02
+#define _SIDD_SWORD_OPS                 0x03
+
+/* These specify the type of comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_EACH            0x08
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+
+/* These macros specify the polarity of the operation.  */
+#define _SIDD_POSITIVE_POLARITY         0x00
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_LEAST_SIGNIFICANT         0x00
+#define _SIDD_MOST_SIGNIFICANT          0x40
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_BIT_MASK                  0x00
+#define _SIDD_UNIT_MASK                 0x40
+
+/* SSE4.2 Packed Comparison Intrinsics.  */
+#define _mm_cmpistrm(A, B, M) \
+  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+                                       (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistri(A, B, M) \
+  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+                                   (__v16qi)(__m128i)(B), (int)(M))
+
+#define _mm_cmpestrm(A, LA, B, LB, M) \
+  (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+                                       (__v16qi)(__m128i)(B), (int)(LB), \
+                                       (int)(M))
+#define _mm_cmpestri(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+                                   (__v16qi)(__m128i)(B), (int)(LB), \
+                                   (int)(M))
+
+/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+#define _mm_cmpistra(A, B, M) \
+  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrc(A, B, M) \
+  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistro(A, B, M) \
+  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrs(A, B, M) \
+  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrz(A, B, M) \
+  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+
+#define _mm_cmpestra(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrc(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestro(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrs(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrz(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+
+/* SSE4.2 Compare Packed Data -- Greater Than.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
+}
+
+/* SSE4.2 Accumulate CRC32.  */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u8(unsigned int __C, unsigned char __D)
+{
+  return __builtin_ia32_crc32qi(__C, __D);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u16(unsigned int __C, unsigned short __D)
+{
+  return __builtin_ia32_crc32hi(__C, __D);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u32(unsigned int __C, unsigned int __D)
+{
+  return __builtin_ia32_crc32si(__C, __D);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
+{
+  return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#endif /* _SMMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdalign.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdalign.h
new file mode 100644
index 000000000..3738d1284
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdalign.h
@@ -0,0 +1,35 @@
+/*===---- stdalign.h - Standard header for alignment ------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDALIGN_H
+#define __STDALIGN_H
+
+#ifndef __cplusplus
+#define alignas _Alignas
+#define alignof _Alignof
+#endif
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif /* __STDALIGN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdarg.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdarg.h
new file mode 100644
index 000000000..a57e18364
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdarg.h
@@ -0,0 +1,52 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+/* Hack required to make standard headers work, at least on Ubuntu */
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+#endif
+typedef __builtin_va_list __gnuc_va_list;
+
+#endif /* __STDARG_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdatomic.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdatomic.h
new file mode 100644
index 000000000..23bb3a357
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdatomic.h
@@ -0,0 +1,190 @@
+/*===---- stdatomic.h - Standard header for atomic types and operations -----===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_STDATOMIC_H
+#define __CLANG_STDATOMIC_H
+
+/* If we're hosted, fall back to the system's stdatomic.h. FreeBSD, for
+ * example, already has a Clang-compatible stdatomic.h header.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>)
+# include_next <stdatomic.h>
+#else
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 7.17.1 Introduction */
+
+#define ATOMIC_BOOL_LOCK_FREE       __GCC_ATOMIC_BOOL_LOCK_FREE
+#define ATOMIC_CHAR_LOCK_FREE       __GCC_ATOMIC_CHAR_LOCK_FREE
+#define ATOMIC_CHAR16_T_LOCK_FREE   __GCC_ATOMIC_CHAR16_T_LOCK_FREE
+#define ATOMIC_CHAR32_T_LOCK_FREE   __GCC_ATOMIC_CHAR32_T_LOCK_FREE
+#define ATOMIC_WCHAR_T_LOCK_FREE    __GCC_ATOMIC_WCHAR_T_LOCK_FREE
+#define ATOMIC_SHORT_LOCK_FREE      __GCC_ATOMIC_SHORT_LOCK_FREE
+#define ATOMIC_INT_LOCK_FREE        __GCC_ATOMIC_INT_LOCK_FREE
+#define ATOMIC_LONG_LOCK_FREE       __GCC_ATOMIC_LONG_LOCK_FREE
+#define ATOMIC_LLONG_LOCK_FREE      __GCC_ATOMIC_LLONG_LOCK_FREE
+#define ATOMIC_POINTER_LOCK_FREE    __GCC_ATOMIC_POINTER_LOCK_FREE
+
+/* 7.17.2 Initialization */
+
+#define ATOMIC_VAR_INIT(value) (value)
+#define atomic_init __c11_atomic_init
+
+/* 7.17.3 Order and consistency */
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_consume = __ATOMIC_CONSUME,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+#define kill_dependency(y) (y)
+
+/* 7.17.4 Fences */
+
+/* These should be provided by the libc implementation. */
+void atomic_thread_fence(memory_order);
+void atomic_signal_fence(memory_order);
+
+#define atomic_thread_fence(order) __c11_atomic_thread_fence(order)
+#define atomic_signal_fence(order) __c11_atomic_signal_fence(order)
+
+/* 7.17.5 Lock-free property */
+
+#define atomic_is_lock_free(obj) __c11_atomic_is_lock_free(sizeof(*(obj)))
+
+/* 7.17.6 Atomic integer types */
+
+#ifdef __cplusplus
+typedef _Atomic(bool)               atomic_bool;
+#else
+typedef _Atomic(_Bool)              atomic_bool;
+#endif
+typedef _Atomic(char)               atomic_char;
+typedef _Atomic(signed char)        atomic_schar;
+typedef _Atomic(unsigned char)      atomic_uchar;
+typedef _Atomic(short)              atomic_short;
+typedef _Atomic(unsigned short)     atomic_ushort;
+typedef _Atomic(int)                atomic_int;
+typedef _Atomic(unsigned int)       atomic_uint;
+typedef _Atomic(long)               atomic_long;
+typedef _Atomic(unsigned long)      atomic_ulong;
+typedef _Atomic(long long)          atomic_llong;
+typedef _Atomic(unsigned long long) atomic_ullong;
+typedef _Atomic(uint_least16_t)     atomic_char16_t;
+typedef _Atomic(uint_least32_t)     atomic_char32_t;
+typedef _Atomic(wchar_t)            atomic_wchar_t;
+typedef _Atomic(int_least8_t)       atomic_int_least8_t;
+typedef _Atomic(uint_least8_t)      atomic_uint_least8_t;
+typedef _Atomic(int_least16_t)      atomic_int_least16_t;
+typedef _Atomic(uint_least16_t)     atomic_uint_least16_t;
+typedef _Atomic(int_least32_t)      atomic_int_least32_t;
+typedef _Atomic(uint_least32_t)     atomic_uint_least32_t;
+typedef _Atomic(int_least64_t)      atomic_int_least64_t;
+typedef _Atomic(uint_least64_t)     atomic_uint_least64_t;
+typedef _Atomic(int_fast8_t)        atomic_int_fast8_t;
+typedef _Atomic(uint_fast8_t)       atomic_uint_fast8_t;
+typedef _Atomic(int_fast16_t)       atomic_int_fast16_t;
+typedef _Atomic(uint_fast16_t)      atomic_uint_fast16_t;
+typedef _Atomic(int_fast32_t)       atomic_int_fast32_t;
+typedef _Atomic(uint_fast32_t)      atomic_uint_fast32_t;
+typedef _Atomic(int_fast64_t)       atomic_int_fast64_t;
+typedef _Atomic(uint_fast64_t)      atomic_uint_fast64_t;
+typedef _Atomic(intptr_t)           atomic_intptr_t;
+typedef _Atomic(uintptr_t)          atomic_uintptr_t;
+typedef _Atomic(size_t)             atomic_size_t;
+typedef _Atomic(ptrdiff_t)          atomic_ptrdiff_t;
+typedef _Atomic(intmax_t)           atomic_intmax_t;
+typedef _Atomic(uintmax_t)          atomic_uintmax_t;
+
+/* 7.17.7 Operations on atomic types */
+
+#define atomic_store(object, desired) __c11_atomic_store(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_store_explicit __c11_atomic_store
+
+#define atomic_load(object) __c11_atomic_load(object, __ATOMIC_SEQ_CST)
+#define atomic_load_explicit __c11_atomic_load
+
+#define atomic_exchange(object, desired) __c11_atomic_exchange(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_exchange_explicit __c11_atomic_exchange
+
+#define atomic_compare_exchange_strong(object, expected, desired) __c11_atomic_compare_exchange_strong(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_strong_explicit __c11_atomic_compare_exchange_strong
+
+#define atomic_compare_exchange_weak(object, expected, desired) __c11_atomic_compare_exchange_weak(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_weak_explicit __c11_atomic_compare_exchange_weak
+
+#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_add_explicit __c11_atomic_fetch_add
+
+#define atomic_fetch_sub(object, operand) __c11_atomic_fetch_sub(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub_explicit __c11_atomic_fetch_sub
+
+#define atomic_fetch_or(object, operand) __c11_atomic_fetch_or(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or_explicit __c11_atomic_fetch_or
+
+#define atomic_fetch_xor(object, operand) __c11_atomic_fetch_xor(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_xor_explicit __c11_atomic_fetch_xor
+
+#define atomic_fetch_and(object, operand) __c11_atomic_fetch_and(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_and_explicit __c11_atomic_fetch_and
+
+/* 7.17.8 Atomic flag type and operations */
+
+typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
+
+#define ATOMIC_FLAG_INIT { 0 }
+
+/* These should be provided by the libc implementation. */
+#ifdef __cplusplus
+bool atomic_flag_test_and_set(volatile atomic_flag *);
+bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#else
+_Bool atomic_flag_test_and_set(volatile atomic_flag *);
+_Bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#endif
+void atomic_flag_clear(volatile atomic_flag *);
+void atomic_flag_clear_explicit(volatile atomic_flag *, memory_order);
+
+#define atomic_flag_test_and_set(object) __c11_atomic_exchange(&(object)->_Value, 1, __ATOMIC_SEQ_CST)
+#define atomic_flag_test_and_set_explicit(object, order) __c11_atomic_exchange(&(object)->_Value, 1, order)
+
+#define atomic_flag_clear(object) __c11_atomic_store(&(object)->_Value, 0, __ATOMIC_SEQ_CST)
+#define atomic_flag_clear_explicit(object, order) __c11_atomic_store(&(object)->_Value, 0, order)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDATOMIC_H */
+
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdbool.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdbool.h
new file mode 100644
index 000000000..0467893f3
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdbool.h
@@ -0,0 +1,44 @@
+/*===---- stdbool.h - Standard header for booleans -------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDBOOL_H
+#define __STDBOOL_H
+
+/* Don't define bool, true, and false in C++, except as a GNU extension. */
+#ifndef __cplusplus
+#define bool _Bool
+#define true 1
+#define false 0
+#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Define _Bool, bool, false, true as a GNU extension. */
+#define _Bool bool
+#define bool  bool
+#define false false
+#define true  true
+#endif
+
+#define __bool_true_false_are_defined 1
+
+#endif /* __STDBOOL_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stddef.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stddef.h
new file mode 100644
index 000000000..735499671
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stddef.h
@@ -0,0 +1,137 @@
+/*===---- stddef.h - Basic type definitions --------------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__STDDEF_H) || defined(__need_ptrdiff_t) ||                       \
+    defined(__need_size_t) || defined(__need_wchar_t) ||                       \
+    defined(__need_NULL) || defined(__need_wint_t)
+
+#if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
+    !defined(__need_wchar_t) && !defined(__need_NULL) &&                       \
+    !defined(__need_wint_t)
+/* Always define miscellaneous pieces when modules are available. */
+#if !__has_feature(modules)
+#define __STDDEF_H
+#endif
+#define __need_ptrdiff_t
+#define __need_size_t
+#define __need_wchar_t
+#define __need_NULL
+#define __need_STDDEF_H_misc
+/* __need_wint_t is intentionally not defined here. */
+#endif
+
+#if defined(__need_ptrdiff_t)
+#if !defined(_PTRDIFF_T) || __has_feature(modules)
+/* Always define ptrdiff_t when modules are available. */
+#if !__has_feature(modules)
+#define _PTRDIFF_T
+#endif
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+#endif
+#undef __need_ptrdiff_t
+#endif /* defined(__need_ptrdiff_t) */
+
+#if defined(__need_size_t)
+#if !defined(_SIZE_T) || __has_feature(modules)
+/* Always define size_t when modules are available. */
+#if !__has_feature(modules)
+#define _SIZE_T
+#endif
+typedef __SIZE_TYPE__ size_t;
+#endif
+#undef __need_size_t
+#endif /*defined(__need_size_t) */
+
+#if defined(__need_STDDEF_H_misc)
+/* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
+ * enabled. */
+#if (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 && \
+     !defined(_RSIZE_T)) || __has_feature(modules)
+/* Always define rsize_t when modules are available. */
+#if !__has_feature(modules)
+#define _RSIZE_T
+#endif
+typedef __SIZE_TYPE__ rsize_t;
+#endif
+#endif /* defined(__need_STDDEF_H_misc) */
+
+#if defined(__need_wchar_t)
+#ifndef __cplusplus
+/* Always define wchar_t when modules are available. */
+#if !defined(_WCHAR_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WCHAR_T
+#if defined(_MSC_EXTENSIONS)
+#define _WCHAR_T_DEFINED
+#endif
+#endif
+typedef __WCHAR_TYPE__ wchar_t;
+#endif
+#endif
+#undef __need_wchar_t
+#endif /* defined(__need_wchar_t) */
+
+#if defined(__need_NULL)
+#undef NULL
+#ifdef __cplusplus
+#  if !defined(__MINGW32__) && !defined(_MSC_VER)
+#    define NULL __null
+#  else
+#    define NULL 0
+#  endif
+#else
+#  define NULL ((void*)0)
+#endif
+#ifdef __cplusplus
+#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
+namespace std { typedef decltype(nullptr) nullptr_t; }
+using ::std::nullptr_t;
+#endif
+#endif
+#undef __need_NULL
+#endif /* defined(__need_NULL) */
+
+#if defined(__need_STDDEF_H_misc)
+#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
+#include "__stddef_max_align_t.h"
+#endif
+#define offsetof(t, d) __builtin_offsetof(t, d)
+#undef __need_STDDEF_H_misc
+#endif  /* defined(__need_STDDEF_H_misc) */
+
+/* Some C libraries expect to see a wint_t here. Others (notably MinGW) will use
+__WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
+#if defined(__need_wint_t)
+/* Always define wint_t when modules are available. */
+#if !defined(_WINT_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WINT_T
+#endif
+typedef __WINT_TYPE__ wint_t;
+#endif
+#undef __need_wint_t
+#endif /* __need_wint_t */
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdint.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdint.h
new file mode 100644
index 000000000..3f2fcbc57
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdint.h
@@ -0,0 +1,707 @@
+/*===---- stdint.h - Standard header for sized integer types --------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_STDINT_H
+#define __CLANG_STDINT_H
+
+/* If we're hosted, fall back to the system's stdint.h, which might have
+ * additional definitions.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdint.h>)
+
+// C99 7.18.3 Limits of other integer types
+//
+//  Footnote 219, 220: C++ implementations should define these macros only when
+//  __STDC_LIMIT_MACROS is defined before <stdint.h> is included.
+//
+//  Footnote 222: C++ implementations should define these macros only when
+//  __STDC_CONSTANT_MACROS is defined before <stdint.h> is included.
+//
+// C++11 [cstdint.syn]p2:
+//
+//  The macros defined by <cstdint> are provided unconditionally. In particular,
+//  the symbols __STDC_LIMIT_MACROS and __STDC_CONSTANT_MACROS (mentioned in
+//  footnotes 219, 220, and 222 in the C standard) play no role in C++.
+//
+// C11 removed the problematic footnotes.
+//
+// Work around this inconsistency by always defining those macros in C++ mode,
+// so that a C library implementation which follows the C99 standard can be
+// used in C++.
+# ifdef __cplusplus
+#  if !defined(__STDC_LIMIT_MACROS)
+#   define __STDC_LIMIT_MACROS
+#   define __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  endif
+#  if !defined(__STDC_CONSTANT_MACROS)
+#   define __STDC_CONSTANT_MACROS
+#   define __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  endif
+# endif
+
+# include_next <stdint.h>
+
+# ifdef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_LIMIT_MACROS
+#  undef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+# endif
+# ifdef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_CONSTANT_MACROS
+#  undef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+# endif
+
+#else
+
+/* C99 7.18.1.1 Exact-width integer types.
+ * C99 7.18.1.2 Minimum-width integer types.
+ * C99 7.18.1.3 Fastest minimum-width integer types.
+ *
+ * The standard requires that exact-width type be defined for 8-, 16-, 32-, and
+ * 64-bit types if they are implemented. Other exact width types are optional.
+ * This implementation defines an exact-width types for every integer width
+ * that is represented in the standard integer types.
+ *
+ * The standard also requires minimum-width types be defined for 8-, 16-, 32-,
+ * and 64-bit widths regardless of whether there are corresponding exact-width
+ * types.
+ *
+ * To accommodate targets that are missing types that are exactly 8, 16, 32, or
+ * 64 bits wide, this implementation takes an approach of cascading
+ * redefintions, redefining __int_leastN_t to successively smaller exact-width
+ * types. It is therefore important that the types are defined in order of
+ * descending widths.
+ *
+ * We currently assume that the minimum-width types and the fastest
+ * minimum-width types are the same. This is allowed by the standard, but is
+ * suboptimal.
+ *
+ * In violation of the standard, some targets do not implement a type that is
+ * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
+ * To accommodate these targets, a required minimum-width type is only
+ * defined if there exists an exact-width type of equal or greater width.
+ */
+
+#ifdef __INT64_TYPE__
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/
+typedef __INT64_TYPE__ int64_t;
+# endif /* __int8_t_defined */
+typedef __UINT64_TYPE__ uint64_t;
+# define __int_least64_t int64_t
+# define __uint_least64_t uint64_t
+# define __int_least32_t int64_t
+# define __uint_least32_t uint64_t
+# define __int_least16_t int64_t
+# define __uint_least16_t uint64_t
+# define __int_least8_t int64_t
+# define __uint_least8_t uint64_t
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+typedef __int_least64_t int_least64_t;
+typedef __uint_least64_t uint_least64_t;
+typedef __int_least64_t int_fast64_t;
+typedef __uint_least64_t uint_fast64_t;
+#endif /* __int_least64_t */
+
+#ifdef __INT56_TYPE__
+typedef __INT56_TYPE__ int56_t;
+typedef __UINT56_TYPE__ uint56_t;
+typedef int56_t int_least56_t;
+typedef uint56_t uint_least56_t;
+typedef int56_t int_fast56_t;
+typedef uint56_t uint_fast56_t;
+# define __int_least32_t int56_t
+# define __uint_least32_t uint56_t
+# define __int_least16_t int56_t
+# define __uint_least16_t uint56_t
+# define __int_least8_t int56_t
+# define __uint_least8_t uint56_t
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+typedef __INT48_TYPE__ int48_t;
+typedef __UINT48_TYPE__ uint48_t;
+typedef int48_t int_least48_t;
+typedef uint48_t uint_least48_t;
+typedef int48_t int_fast48_t;
+typedef uint48_t uint_fast48_t;
+# define __int_least32_t int48_t
+# define __uint_least32_t uint48_t
+# define __int_least16_t int48_t
+# define __uint_least16_t uint48_t
+# define __int_least8_t int48_t
+# define __uint_least8_t uint48_t
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+typedef __INT40_TYPE__ int40_t;
+typedef __UINT40_TYPE__ uint40_t;
+typedef int40_t int_least40_t;
+typedef uint40_t uint_least40_t;
+typedef int40_t int_fast40_t;
+typedef uint40_t uint_fast40_t;
+# define __int_least32_t int40_t
+# define __uint_least32_t uint40_t
+# define __int_least16_t int40_t
+# define __uint_least16_t uint40_t
+# define __int_least8_t int40_t
+# define __uint_least8_t uint40_t
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/
+typedef __INT32_TYPE__ int32_t;
+# endif /* __int8_t_defined */
+
+# ifndef __uint32_t_defined  /* more glibc compatibility */
+# define __uint32_t_defined
+typedef __UINT32_TYPE__ uint32_t;
+# endif /* __uint32_t_defined */
+
+# define __int_least32_t int32_t
+# define __uint_least32_t uint32_t
+# define __int_least16_t int32_t
+# define __uint_least16_t uint32_t
+# define __int_least8_t int32_t
+# define __uint_least8_t uint32_t
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+typedef __int_least32_t int_least32_t;
+typedef __uint_least32_t uint_least32_t;
+typedef __int_least32_t int_fast32_t;
+typedef __uint_least32_t uint_fast32_t;
+#endif /* __int_least32_t */
+
+#ifdef __INT24_TYPE__
+typedef __INT24_TYPE__ int24_t;
+typedef __UINT24_TYPE__ uint24_t;
+typedef int24_t int_least24_t;
+typedef uint24_t uint_least24_t;
+typedef int24_t int_fast24_t;
+typedef uint24_t uint_fast24_t;
+# define __int_least16_t int24_t
+# define __uint_least16_t uint24_t
+# define __int_least8_t int24_t
+# define __uint_least8_t uint24_t
+#endif /* __INT24_TYPE__ */
+
+#ifdef __INT16_TYPE__
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int16_t*/
+typedef __INT16_TYPE__ int16_t;
+#endif /* __int8_t_defined */
+typedef __UINT16_TYPE__ uint16_t;
+# define __int_least16_t int16_t
+# define __uint_least16_t uint16_t
+# define __int_least8_t int16_t
+# define __uint_least8_t uint16_t
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+typedef __int_least16_t int_least16_t;
+typedef __uint_least16_t uint_least16_t;
+typedef __int_least16_t int_fast16_t;
+typedef __uint_least16_t uint_fast16_t;
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+#ifndef __int8_t_defined  /* glibc sys/types.h also defines int8_t*/
+typedef __INT8_TYPE__ int8_t;
+#endif /* __int8_t_defined */
+typedef __UINT8_TYPE__ uint8_t;
+# define __int_least8_t int8_t
+# define __uint_least8_t uint8_t
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+typedef __int_least8_t int_least8_t;
+typedef __uint_least8_t uint_least8_t;
+typedef __int_least8_t int_fast8_t;
+typedef __uint_least8_t uint_fast8_t;
+#endif /* __int_least8_t */
+
+/* prevent glibc sys/types.h from defining conflicting types */
+#ifndef __int8_t_defined
+# define __int8_t_defined
+#endif /* __int8_t_defined */
+
+/* C99 7.18.1.4 Integer types capable of holding object pointers.
+ */
+#define __stdint_join3(a,b,c) a ## b ## c
+
+#define  __intn_t(n) __stdint_join3( int, n, _t)
+#define __uintn_t(n) __stdint_join3(uint, n, _t)
+
+#ifndef _INTPTR_T
+#ifndef __intptr_t_defined
+typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+#define __intptr_t_defined
+#define _INTPTR_T
+#endif
+#endif
+
+#ifndef _UINTPTR_T
+typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+#define _UINTPTR_T
+#endif
+
+/* C99 7.18.1.5 Greatest-width integer types.
+ */
+typedef __INTMAX_TYPE__  intmax_t;
+typedef __UINTMAX_TYPE__ uintmax_t;
+
+/* C99 7.18.4 Macros for minimum-width integer constants.
+ *
+ * The standard requires that integer constant macros be defined for all the
+ * minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
+ * types are required, the corresponding integer constant macros are defined
+ * here. This implementation also defines minimum-width types for every other
+ * integer width that the target implements, so corresponding macros are
+ * defined below, too.
+ *
+ * These macros are defined using the same successive-shrinking approach as
+ * the type definitions above. It is likewise important that macros are defined
+ * in order of decending width.
+ *
+ * Note that C++ should not check __STDC_CONSTANT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#define __int_c_join(a, b) a ## b
+#define __int_c(v, suffix) __int_c_join(v, suffix)
+#define __uint_c(v, suffix) __int_c_join(v##U, suffix)
+
+
+#ifdef __INT64_TYPE__
+# ifdef __INT64_C_SUFFIX__
+#  define __int64_c_suffix __INT64_C_SUFFIX__
+#  define __int32_c_suffix __INT64_C_SUFFIX__
+#  define __int16_c_suffix __INT64_C_SUFFIX__
+#  define  __int8_c_suffix __INT64_C_SUFFIX__
+# else
+#  undef __int64_c_suffix
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT64_C_SUFFIX__ */
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+# ifdef __int64_c_suffix
+#  define INT64_C(v) __int_c(v, __int64_c_suffix)
+#  define UINT64_C(v) __uint_c(v, __int64_c_suffix)
+# else
+#  define INT64_C(v) v
+#  define UINT64_C(v) v ## U
+# endif /* __int64_c_suffix */
+#endif /* __int_least64_t */
+
+
+#ifdef __INT56_TYPE__
+# ifdef __INT56_C_SUFFIX__
+#  define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
+#  define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
+#  define __int32_c_suffix __INT56_C_SUFFIX__
+#  define __int16_c_suffix __INT56_C_SUFFIX__
+#  define __int8_c_suffix  __INT56_C_SUFFIX__
+# else
+#  define INT56_C(v) v
+#  define UINT56_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT56_C_SUFFIX__ */
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# ifdef __INT48_C_SUFFIX__
+#  define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
+#  define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
+#  define __int32_c_suffix __INT48_C_SUFFIX__
+#  define __int16_c_suffix __INT48_C_SUFFIX__
+#  define __int8_c_suffix  __INT48_C_SUFFIX__
+# else
+#  define INT48_C(v) v
+#  define UINT48_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT48_C_SUFFIX__ */
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# ifdef __INT40_C_SUFFIX__
+#  define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
+#  define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
+#  define __int32_c_suffix __INT40_C_SUFFIX__
+#  define __int16_c_suffix __INT40_C_SUFFIX__
+#  define __int8_c_suffix  __INT40_C_SUFFIX__
+# else
+#  define INT40_C(v) v
+#  define UINT40_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT40_C_SUFFIX__ */
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# ifdef __INT32_C_SUFFIX__
+#  define __int32_c_suffix __INT32_C_SUFFIX__
+#  define __int16_c_suffix __INT32_C_SUFFIX__
+#  define __int8_c_suffix  __INT32_C_SUFFIX__
+#else
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT32_C_SUFFIX__ */
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+# ifdef __int32_c_suffix
+#  define INT32_C(v) __int_c(v, __int32_c_suffix)
+#  define UINT32_C(v) __uint_c(v, __int32_c_suffix)
+# else
+#  define INT32_C(v) v
+#  define UINT32_C(v) v ## U
+# endif /* __int32_c_suffix */
+#endif /* __int_least32_t */
+
+
+#ifdef __INT24_TYPE__
+# ifdef __INT24_C_SUFFIX__
+#  define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
+#  define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
+#  define __int16_c_suffix __INT24_C_SUFFIX__
+#  define __int8_c_suffix  __INT24_C_SUFFIX__
+# else
+#  define INT24_C(v) v
+#  define UINT24_C(v) v ## U
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT24_C_SUFFIX__ */
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+# ifdef __INT16_C_SUFFIX__
+#  define __int16_c_suffix __INT16_C_SUFFIX__
+#  define __int8_c_suffix  __INT16_C_SUFFIX__
+#else
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT16_C_SUFFIX__ */
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+# ifdef __int16_c_suffix
+#  define INT16_C(v) __int_c(v, __int16_c_suffix)
+#  define UINT16_C(v) __uint_c(v, __int16_c_suffix)
+# else
+#  define INT16_C(v) v
+#  define UINT16_C(v) v ## U
+# endif /* __int16_c_suffix */
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+# ifdef __INT8_C_SUFFIX__
+#  define __int8_c_suffix __INT8_C_SUFFIX__
+#else
+#  undef  __int8_c_suffix
+# endif /* __INT8_C_SUFFIX__ */
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+# ifdef __int8_c_suffix
+#  define INT8_C(v) __int_c(v, __int8_c_suffix)
+#  define UINT8_C(v) __uint_c(v, __int8_c_suffix)
+# else
+#  define INT8_C(v) v
+#  define UINT8_C(v) v ## U
+# endif /* __int8_c_suffix */
+#endif /* __int_least8_t */
+
+
+/* C99 7.18.2.1 Limits of exact-width integer types.
+ * C99 7.18.2.2 Limits of minimum-width integer types.
+ * C99 7.18.2.3 Limits of fastest minimum-width integer types.
+ *
+ * The presence of limit macros are completely optional in C99.  This
+ * implementation defines limits for all of the types (exact- and
+ * minimum-width) that it defines above, using the limits of the minimum-width
+ * type for any types that do not have exact-width representations.
+ *
+ * As in the type definitions, this section takes an approach of
+ * successive-shrinking to determine which limits to use for the standard (8,
+ * 16, 32, 64) bit widths when they don't have exact representations. It is
+ * therefore important that the defintions be kept in order of decending
+ * widths.
+ *
+ * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#ifdef __INT64_TYPE__
+# define INT64_MAX           INT64_C( 9223372036854775807)
+# define INT64_MIN         (-INT64_C( 9223372036854775807)-1)
+# define UINT64_MAX         UINT64_C(18446744073709551615)
+# define __INT_LEAST64_MIN   INT64_MIN
+# define __INT_LEAST64_MAX   INT64_MAX
+# define __UINT_LEAST64_MAX UINT64_MAX
+# define __INT_LEAST32_MIN   INT64_MIN
+# define __INT_LEAST32_MAX   INT64_MAX
+# define __UINT_LEAST32_MAX UINT64_MAX
+# define __INT_LEAST16_MIN   INT64_MIN
+# define __INT_LEAST16_MAX   INT64_MAX
+# define __UINT_LEAST16_MAX UINT64_MAX
+# define __INT_LEAST8_MIN    INT64_MIN
+# define __INT_LEAST8_MAX    INT64_MAX
+# define __UINT_LEAST8_MAX  UINT64_MAX
+#endif /* __INT64_TYPE__ */
+
+#ifdef __INT_LEAST64_MIN
+# define INT_LEAST64_MIN   __INT_LEAST64_MIN
+# define INT_LEAST64_MAX   __INT_LEAST64_MAX
+# define UINT_LEAST64_MAX __UINT_LEAST64_MAX
+# define INT_FAST64_MIN    __INT_LEAST64_MIN
+# define INT_FAST64_MAX    __INT_LEAST64_MAX
+# define UINT_FAST64_MAX  __UINT_LEAST64_MAX
+#endif /* __INT_LEAST64_MIN */
+
+
+#ifdef __INT56_TYPE__
+# define INT56_MAX           INT56_C(36028797018963967)
+# define INT56_MIN         (-INT56_C(36028797018963967)-1)
+# define UINT56_MAX         UINT56_C(72057594037927935)
+# define INT_LEAST56_MIN     INT56_MIN
+# define INT_LEAST56_MAX     INT56_MAX
+# define UINT_LEAST56_MAX   UINT56_MAX
+# define INT_FAST56_MIN      INT56_MIN
+# define INT_FAST56_MAX      INT56_MAX
+# define UINT_FAST56_MAX    UINT56_MAX
+# define __INT_LEAST32_MIN   INT56_MIN
+# define __INT_LEAST32_MAX   INT56_MAX
+# define __UINT_LEAST32_MAX UINT56_MAX
+# define __INT_LEAST16_MIN   INT56_MIN
+# define __INT_LEAST16_MAX   INT56_MAX
+# define __UINT_LEAST16_MAX UINT56_MAX
+# define __INT_LEAST8_MIN    INT56_MIN
+# define __INT_LEAST8_MAX    INT56_MAX
+# define __UINT_LEAST8_MAX  UINT56_MAX
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# define INT48_MAX           INT48_C(140737488355327)
+# define INT48_MIN         (-INT48_C(140737488355327)-1)
+# define UINT48_MAX         UINT48_C(281474976710655)
+# define INT_LEAST48_MIN     INT48_MIN
+# define INT_LEAST48_MAX     INT48_MAX
+# define UINT_LEAST48_MAX   UINT48_MAX
+# define INT_FAST48_MIN      INT48_MIN
+# define INT_FAST48_MAX      INT48_MAX
+# define UINT_FAST48_MAX    UINT48_MAX
+# define __INT_LEAST32_MIN   INT48_MIN
+# define __INT_LEAST32_MAX   INT48_MAX
+# define __UINT_LEAST32_MAX UINT48_MAX
+# define __INT_LEAST16_MIN   INT48_MIN
+# define __INT_LEAST16_MAX   INT48_MAX
+# define __UINT_LEAST16_MAX UINT48_MAX
+# define __INT_LEAST8_MIN    INT48_MIN
+# define __INT_LEAST8_MAX    INT48_MAX
+# define __UINT_LEAST8_MAX  UINT48_MAX
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# define INT40_MAX           INT40_C(549755813887)
+# define INT40_MIN         (-INT40_C(549755813887)-1)
+# define UINT40_MAX         UINT40_C(1099511627775)
+# define INT_LEAST40_MIN     INT40_MIN
+# define INT_LEAST40_MAX     INT40_MAX
+# define UINT_LEAST40_MAX   UINT40_MAX
+# define INT_FAST40_MIN      INT40_MIN
+# define INT_FAST40_MAX      INT40_MAX
+# define UINT_FAST40_MAX    UINT40_MAX
+# define __INT_LEAST32_MIN   INT40_MIN
+# define __INT_LEAST32_MAX   INT40_MAX
+# define __UINT_LEAST32_MAX UINT40_MAX
+# define __INT_LEAST16_MIN   INT40_MIN
+# define __INT_LEAST16_MAX   INT40_MAX
+# define __UINT_LEAST16_MAX UINT40_MAX
+# define __INT_LEAST8_MIN    INT40_MIN
+# define __INT_LEAST8_MAX    INT40_MAX
+# define __UINT_LEAST8_MAX  UINT40_MAX
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# define INT32_MAX           INT32_C(2147483647)
+# define INT32_MIN         (-INT32_C(2147483647)-1)
+# define UINT32_MAX         UINT32_C(4294967295)
+# define __INT_LEAST32_MIN   INT32_MIN
+# define __INT_LEAST32_MAX   INT32_MAX
+# define __UINT_LEAST32_MAX UINT32_MAX
+# define __INT_LEAST16_MIN   INT32_MIN
+# define __INT_LEAST16_MAX   INT32_MAX
+# define __UINT_LEAST16_MAX UINT32_MAX
+# define __INT_LEAST8_MIN    INT32_MIN
+# define __INT_LEAST8_MAX    INT32_MAX
+# define __UINT_LEAST8_MAX  UINT32_MAX
+#endif /* __INT32_TYPE__ */
+
+#ifdef __INT_LEAST32_MIN
+# define INT_LEAST32_MIN   __INT_LEAST32_MIN
+# define INT_LEAST32_MAX   __INT_LEAST32_MAX
+# define UINT_LEAST32_MAX __UINT_LEAST32_MAX
+# define INT_FAST32_MIN    __INT_LEAST32_MIN
+# define INT_FAST32_MAX    __INT_LEAST32_MAX
+# define UINT_FAST32_MAX  __UINT_LEAST32_MAX
+#endif /* __INT_LEAST32_MIN */
+
+
+#ifdef __INT24_TYPE__
+# define INT24_MAX           INT24_C(8388607)
+# define INT24_MIN         (-INT24_C(8388607)-1)
+# define UINT24_MAX         UINT24_C(16777215)
+# define INT_LEAST24_MIN     INT24_MIN
+# define INT_LEAST24_MAX     INT24_MAX
+# define UINT_LEAST24_MAX   UINT24_MAX
+# define INT_FAST24_MIN      INT24_MIN
+# define INT_FAST24_MAX      INT24_MAX
+# define UINT_FAST24_MAX    UINT24_MAX
+# define __INT_LEAST16_MIN   INT24_MIN
+# define __INT_LEAST16_MAX   INT24_MAX
+# define __UINT_LEAST16_MAX UINT24_MAX
+# define __INT_LEAST8_MIN    INT24_MIN
+# define __INT_LEAST8_MAX    INT24_MAX
+# define __UINT_LEAST8_MAX  UINT24_MAX
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+#define INT16_MAX            INT16_C(32767)
+#define INT16_MIN          (-INT16_C(32767)-1)
+#define UINT16_MAX          UINT16_C(65535)
+# define __INT_LEAST16_MIN   INT16_MIN
+# define __INT_LEAST16_MAX   INT16_MAX
+# define __UINT_LEAST16_MAX UINT16_MAX
+# define __INT_LEAST8_MIN    INT16_MIN
+# define __INT_LEAST8_MAX    INT16_MAX
+# define __UINT_LEAST8_MAX  UINT16_MAX
+#endif /* __INT16_TYPE__ */
+
+#ifdef __INT_LEAST16_MIN
+# define INT_LEAST16_MIN   __INT_LEAST16_MIN
+# define INT_LEAST16_MAX   __INT_LEAST16_MAX
+# define UINT_LEAST16_MAX __UINT_LEAST16_MAX
+# define INT_FAST16_MIN    __INT_LEAST16_MIN
+# define INT_FAST16_MAX    __INT_LEAST16_MAX
+# define UINT_FAST16_MAX  __UINT_LEAST16_MAX
+#endif /* __INT_LEAST16_MIN */
+
+
+#ifdef __INT8_TYPE__
+# define INT8_MAX            INT8_C(127)
+# define INT8_MIN          (-INT8_C(127)-1)
+# define UINT8_MAX          UINT8_C(255)
+# define __INT_LEAST8_MIN    INT8_MIN
+# define __INT_LEAST8_MAX    INT8_MAX
+# define __UINT_LEAST8_MAX  UINT8_MAX
+#endif /* __INT8_TYPE__ */
+
+#ifdef __INT_LEAST8_MIN
+# define INT_LEAST8_MIN   __INT_LEAST8_MIN
+# define INT_LEAST8_MAX   __INT_LEAST8_MAX
+# define UINT_LEAST8_MAX __UINT_LEAST8_MAX
+# define INT_FAST8_MIN    __INT_LEAST8_MIN
+# define INT_FAST8_MAX    __INT_LEAST8_MAX
+# define UINT_FAST8_MAX  __UINT_LEAST8_MAX
+#endif /* __INT_LEAST8_MIN */
+
+/* Some utility macros */
+#define  __INTN_MIN(n)  __stdint_join3( INT, n, _MIN)
+#define  __INTN_MAX(n)  __stdint_join3( INT, n, _MAX)
+#define __UINTN_MAX(n)  __stdint_join3(UINT, n, _MAX)
+#define  __INTN_C(n, v) __stdint_join3( INT, n, _C(v))
+#define __UINTN_C(n, v) __stdint_join3(UINT, n, _C(v))
+
+/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
+/* C99 7.18.3 Limits of other integer types. */
+
+#define  INTPTR_MIN  __INTN_MIN(__INTPTR_WIDTH__)
+#define  INTPTR_MAX  __INTN_MAX(__INTPTR_WIDTH__)
+#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
+#define PTRDIFF_MIN  __INTN_MIN(__PTRDIFF_WIDTH__)
+#define PTRDIFF_MAX  __INTN_MAX(__PTRDIFF_WIDTH__)
+#define    SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
+
+/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
+ * is enabled. */
+#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
+#define   RSIZE_MAX            (SIZE_MAX >> 1)
+#endif
+
+/* C99 7.18.2.5 Limits of greatest-width integer types. */
+#define INTMAX_MIN   __INTN_MIN(__INTMAX_WIDTH__)
+#define INTMAX_MAX   __INTN_MAX(__INTMAX_WIDTH__)
+#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
+
+/* C99 7.18.3 Limits of other integer types. */
+#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
+#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
+#ifdef __WINT_UNSIGNED__
+# define WINT_MIN       __UINTN_C(__WINT_WIDTH__, 0)
+# define WINT_MAX       __UINTN_MAX(__WINT_WIDTH__)
+#else
+# define WINT_MIN       __INTN_MIN(__WINT_WIDTH__)
+# define WINT_MAX       __INTN_MAX(__WINT_WIDTH__)
+#endif
+
+#ifndef WCHAR_MAX
+# define WCHAR_MAX __WCHAR_MAX__
+#endif
+#ifndef WCHAR_MIN
+# if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__)
+#  define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__)
+# else
+#  define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0)
+# endif
+#endif
+
+/* 7.18.4.2 Macros for greatest-width integer constants. */
+#define INTMAX_C(v)   __INTN_C(__INTMAX_WIDTH__, v)
+#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDINT_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdnoreturn.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdnoreturn.h
new file mode 100644
index 000000000..a7a301d7e
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/stdnoreturn.h
@@ -0,0 +1,30 @@
+/*===---- stdnoreturn.h - Standard header for noreturn macro ---------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDNORETURN_H
+#define __STDNORETURN_H
+
+#define noreturn _Noreturn
+#define __noreturn_is_defined 1
+
+#endif /* __STDNORETURN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/tbmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/tbmintrin.h
new file mode 100644
index 000000000..1d0d746a8
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/tbmintrin.h
@@ -0,0 +1,154 @@
+/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __TBMINTRIN_H
+#define __TBMINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
+
+#define __bextri_u32(a, b) \
+  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
+                                           (unsigned int)(b)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcfill_u32(unsigned int __a)
+{
+  return __a & (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blci_u32(unsigned int __a)
+{
+  return __a | ~(__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcic_u32(unsigned int __a)
+{
+  return ~__a & (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcmsk_u32(unsigned int __a)
+{
+  return __a ^ (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcs_u32(unsigned int __a)
+{
+  return __a | (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsfill_u32(unsigned int __a)
+{
+  return __a | (__a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsic_u32(unsigned int __a)
+{
+  return ~__a | (__a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__t1mskc_u32(unsigned int __a)
+{
+  return ~__a | (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__tzmsk_u32(unsigned int __a)
+{
+  return ~__a & (__a - 1);
+}
+
+#ifdef __x86_64__
+#define __bextri_u64(a, b) \
+  ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
+                                                 (unsigned long long)(b)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcfill_u64(unsigned long long __a)
+{
+  return __a & (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blci_u64(unsigned long long __a)
+{
+  return __a | ~(__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcic_u64(unsigned long long __a)
+{
+  return ~__a & (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcmsk_u64(unsigned long long __a)
+{
+  return __a ^ (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcs_u64(unsigned long long __a)
+{
+  return __a | (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsfill_u64(unsigned long long __a)
+{
+  return __a | (__a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsic_u64(unsigned long long __a)
+{
+  return ~__a | (__a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__t1mskc_u64(unsigned long long __a)
+{
+  return ~__a | (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__tzmsk_u64(unsigned long long __a)
+{
+  return ~__a & (__a - 1);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __TBMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/tgmath.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/tgmath.h
new file mode 100644
index 000000000..318e1185f
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/tgmath.h
@@ -0,0 +1,1374 @@
+/*===---- tgmath.h - Standard header for type generic math ----------------===*\
+ *
+ * Copyright (c) 2009 Howard Hinnant
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __TGMATH_H
+#define __TGMATH_H
+
+/* C99 7.22 Type-generic math <tgmath.h>. */
+#include <math.h>
+
+/* C++ handles type genericity with overloading in math.h. */
+#ifndef __cplusplus
+#include <complex.h>
+
+#define _TG_ATTRSp __attribute__((__overloadable__))
+#define _TG_ATTRS __attribute__((__overloadable__, __always_inline__))
+
+// promotion
+
+typedef void _Argument_type_is_not_arithmetic;
+static _Argument_type_is_not_arithmetic __tg_promote(...)
+  __attribute__((__unavailable__,__overloadable__));
+static double               _TG_ATTRSp __tg_promote(int);
+static double               _TG_ATTRSp __tg_promote(unsigned int);
+static double               _TG_ATTRSp __tg_promote(long);
+static double               _TG_ATTRSp __tg_promote(unsigned long);
+static double               _TG_ATTRSp __tg_promote(long long);
+static double               _TG_ATTRSp __tg_promote(unsigned long long);
+static float                _TG_ATTRSp __tg_promote(float);
+static double               _TG_ATTRSp __tg_promote(double);
+static long double          _TG_ATTRSp __tg_promote(long double);
+static float _Complex       _TG_ATTRSp __tg_promote(float _Complex);
+static double _Complex      _TG_ATTRSp __tg_promote(double _Complex);
+static long double _Complex _TG_ATTRSp __tg_promote(long double _Complex);
+
+#define __tg_promote1(__x)           (__typeof__(__tg_promote(__x)))
+#define __tg_promote2(__x, __y)      (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y)))
+#define __tg_promote3(__x, __y, __z) (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y) + \
+                                                 __tg_promote(__z)))
+
+// acos
+
+static float
+    _TG_ATTRS
+    __tg_acos(float __x) {return acosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acos(double __x) {return acos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acos(long double __x) {return acosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acos(float _Complex __x) {return cacosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acos(double _Complex __x) {return cacos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acos(long double _Complex __x) {return cacosl(__x);}
+
+#undef acos
+#define acos(__x) __tg_acos(__tg_promote1((__x))(__x))
+
+// asin
+
+static float
+    _TG_ATTRS
+    __tg_asin(float __x) {return asinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asin(double __x) {return asin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asin(long double __x) {return asinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asin(float _Complex __x) {return casinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asin(double _Complex __x) {return casin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asin(long double _Complex __x) {return casinl(__x);}
+
+#undef asin
+#define asin(__x) __tg_asin(__tg_promote1((__x))(__x))
+
+// atan
+
+static float
+    _TG_ATTRS
+    __tg_atan(float __x) {return atanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atan(double __x) {return atan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan(long double __x) {return atanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atan(float _Complex __x) {return catanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atan(double _Complex __x) {return catan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atan(long double _Complex __x) {return catanl(__x);}
+
+#undef atan
+#define atan(__x) __tg_atan(__tg_promote1((__x))(__x))
+
+// acosh
+
+static float
+    _TG_ATTRS
+    __tg_acosh(float __x) {return acoshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acosh(double __x) {return acosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acosh(long double __x) {return acoshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acosh(float _Complex __x) {return cacoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acosh(double _Complex __x) {return cacosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acosh(long double _Complex __x) {return cacoshl(__x);}
+
+#undef acosh
+#define acosh(__x) __tg_acosh(__tg_promote1((__x))(__x))
+
+// asinh
+
+static float
+    _TG_ATTRS
+    __tg_asinh(float __x) {return asinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asinh(double __x) {return asinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asinh(long double __x) {return asinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asinh(float _Complex __x) {return casinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asinh(double _Complex __x) {return casinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asinh(long double _Complex __x) {return casinhl(__x);}
+
+#undef asinh
+#define asinh(__x) __tg_asinh(__tg_promote1((__x))(__x))
+
+// atanh
+
+static float
+    _TG_ATTRS
+    __tg_atanh(float __x) {return atanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atanh(double __x) {return atanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atanh(long double __x) {return atanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atanh(float _Complex __x) {return catanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atanh(double _Complex __x) {return catanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atanh(long double _Complex __x) {return catanhl(__x);}
+
+#undef atanh
+#define atanh(__x) __tg_atanh(__tg_promote1((__x))(__x))
+
+// cos
+
+static float
+    _TG_ATTRS
+    __tg_cos(float __x) {return cosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cos(double __x) {return cos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cos(long double __x) {return cosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cos(float _Complex __x) {return ccosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cos(double _Complex __x) {return ccos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cos(long double _Complex __x) {return ccosl(__x);}
+
+#undef cos
+#define cos(__x) __tg_cos(__tg_promote1((__x))(__x))
+
+// sin
+
+static float
+    _TG_ATTRS
+    __tg_sin(float __x) {return sinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sin(double __x) {return sin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sin(long double __x) {return sinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sin(float _Complex __x) {return csinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sin(double _Complex __x) {return csin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sin(long double _Complex __x) {return csinl(__x);}
+
+#undef sin
+#define sin(__x) __tg_sin(__tg_promote1((__x))(__x))
+
+// tan
+
+static float
+    _TG_ATTRS
+    __tg_tan(float __x) {return tanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tan(double __x) {return tan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tan(long double __x) {return tanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tan(float _Complex __x) {return ctanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tan(double _Complex __x) {return ctan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tan(long double _Complex __x) {return ctanl(__x);}
+
+#undef tan
+#define tan(__x) __tg_tan(__tg_promote1((__x))(__x))
+
+// cosh
+
+static float
+    _TG_ATTRS
+    __tg_cosh(float __x) {return coshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cosh(double __x) {return cosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cosh(long double __x) {return coshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cosh(float _Complex __x) {return ccoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cosh(double _Complex __x) {return ccosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cosh(long double _Complex __x) {return ccoshl(__x);}
+
+#undef cosh
+#define cosh(__x) __tg_cosh(__tg_promote1((__x))(__x))
+
+// sinh
+
+static float
+    _TG_ATTRS
+    __tg_sinh(float __x) {return sinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sinh(double __x) {return sinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sinh(long double __x) {return sinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sinh(float _Complex __x) {return csinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sinh(double _Complex __x) {return csinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sinh(long double _Complex __x) {return csinhl(__x);}
+
+#undef sinh
+#define sinh(__x) __tg_sinh(__tg_promote1((__x))(__x))
+
+// tanh
+
+static float
+    _TG_ATTRS
+    __tg_tanh(float __x) {return tanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tanh(double __x) {return tanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tanh(long double __x) {return tanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tanh(float _Complex __x) {return ctanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tanh(double _Complex __x) {return ctanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tanh(long double _Complex __x) {return ctanhl(__x);}
+
+#undef tanh
+#define tanh(__x) __tg_tanh(__tg_promote1((__x))(__x))
+
+// exp
+
+static float
+    _TG_ATTRS
+    __tg_exp(float __x) {return expf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp(double __x) {return exp(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp(long double __x) {return expl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_exp(float _Complex __x) {return cexpf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_exp(double _Complex __x) {return cexp(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_exp(long double _Complex __x) {return cexpl(__x);}
+
+#undef exp
+#define exp(__x) __tg_exp(__tg_promote1((__x))(__x))
+
+// log
+
+static float
+    _TG_ATTRS
+    __tg_log(float __x) {return logf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log(double __x) {return log(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log(long double __x) {return logl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_log(float _Complex __x) {return clogf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_log(double _Complex __x) {return clog(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_log(long double _Complex __x) {return clogl(__x);}
+
+#undef log
+#define log(__x) __tg_log(__tg_promote1((__x))(__x))
+
+// pow
+
+static float
+    _TG_ATTRS
+    __tg_pow(float __x, float __y) {return powf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_pow(double __x, double __y) {return pow(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_pow(long double __x, long double __y) {return powl(__x, __y);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_pow(float _Complex __x, float _Complex __y) {return cpowf(__x, __y);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_pow(double _Complex __x, double _Complex __y) {return cpow(__x, __y);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_pow(long double _Complex __x, long double _Complex __y)
+    {return cpowl(__x, __y);}
+
+#undef pow
+#define pow(__x, __y) __tg_pow(__tg_promote2((__x), (__y))(__x), \
+                               __tg_promote2((__x), (__y))(__y))
+
+// sqrt
+
+static float
+    _TG_ATTRS
+    __tg_sqrt(float __x) {return sqrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sqrt(double __x) {return sqrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sqrt(long double __x) {return sqrtl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sqrt(float _Complex __x) {return csqrtf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sqrt(double _Complex __x) {return csqrt(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sqrt(long double _Complex __x) {return csqrtl(__x);}
+
+#undef sqrt
+#define sqrt(__x) __tg_sqrt(__tg_promote1((__x))(__x))
+
+// fabs
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float __x) {return fabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double __x) {return fabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double __x) {return fabsl(__x);}
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float _Complex __x) {return cabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double _Complex __x) {return cabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double _Complex __x) {return cabsl(__x);}
+
+#undef fabs
+#define fabs(__x) __tg_fabs(__tg_promote1((__x))(__x))
+
+// atan2
+
+static float
+    _TG_ATTRS
+    __tg_atan2(float __x, float __y) {return atan2f(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_atan2(double __x, double __y) {return atan2(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan2(long double __x, long double __y) {return atan2l(__x, __y);}
+
+#undef atan2
+#define atan2(__x, __y) __tg_atan2(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// cbrt
+
+static float
+    _TG_ATTRS
+    __tg_cbrt(float __x) {return cbrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cbrt(double __x) {return cbrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cbrt(long double __x) {return cbrtl(__x);}
+
+#undef cbrt
+#define cbrt(__x) __tg_cbrt(__tg_promote1((__x))(__x))
+
+// ceil
+
+static float
+    _TG_ATTRS
+    __tg_ceil(float __x) {return ceilf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_ceil(double __x) {return ceil(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_ceil(long double __x) {return ceill(__x);}
+
+#undef ceil
+#define ceil(__x) __tg_ceil(__tg_promote1((__x))(__x))
+
+// copysign
+
+static float
+    _TG_ATTRS
+    __tg_copysign(float __x, float __y) {return copysignf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_copysign(double __x, double __y) {return copysign(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_copysign(long double __x, long double __y) {return copysignl(__x, __y);}
+
+#undef copysign
+#define copysign(__x, __y) __tg_copysign(__tg_promote2((__x), (__y))(__x), \
+                                         __tg_promote2((__x), (__y))(__y))
+
+// erf
+
+static float
+    _TG_ATTRS
+    __tg_erf(float __x) {return erff(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erf(double __x) {return erf(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erf(long double __x) {return erfl(__x);}
+
+#undef erf
+#define erf(__x) __tg_erf(__tg_promote1((__x))(__x))
+
+// erfc
+
+static float
+    _TG_ATTRS
+    __tg_erfc(float __x) {return erfcf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erfc(double __x) {return erfc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erfc(long double __x) {return erfcl(__x);}
+
+#undef erfc
+#define erfc(__x) __tg_erfc(__tg_promote1((__x))(__x))
+
+// exp2
+
+static float
+    _TG_ATTRS
+    __tg_exp2(float __x) {return exp2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp2(double __x) {return exp2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp2(long double __x) {return exp2l(__x);}
+
+#undef exp2
+#define exp2(__x) __tg_exp2(__tg_promote1((__x))(__x))
+
+// expm1
+
+static float
+    _TG_ATTRS
+    __tg_expm1(float __x) {return expm1f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_expm1(double __x) {return expm1(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_expm1(long double __x) {return expm1l(__x);}
+
+#undef expm1
+#define expm1(__x) __tg_expm1(__tg_promote1((__x))(__x))
+
+// fdim
+
+static float
+    _TG_ATTRS
+    __tg_fdim(float __x, float __y) {return fdimf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fdim(double __x, double __y) {return fdim(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fdim(long double __x, long double __y) {return fdiml(__x, __y);}
+
+#undef fdim
+#define fdim(__x, __y) __tg_fdim(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// floor
+
+static float
+    _TG_ATTRS
+    __tg_floor(float __x) {return floorf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_floor(double __x) {return floor(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_floor(long double __x) {return floorl(__x);}
+
+#undef floor
+#define floor(__x) __tg_floor(__tg_promote1((__x))(__x))
+
+// fma
+
+static float
+    _TG_ATTRS
+    __tg_fma(float __x, float __y, float __z)
+    {return fmaf(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_fma(double __x, double __y, double __z)
+    {return fma(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_fma(long double __x,long double __y, long double __z)
+    {return fmal(__x, __y, __z);}
+
+#undef fma
+#define fma(__x, __y, __z)                                \
+        __tg_fma(__tg_promote3((__x), (__y), (__z))(__x), \
+                 __tg_promote3((__x), (__y), (__z))(__y), \
+                 __tg_promote3((__x), (__y), (__z))(__z))
+
+// fmax
+
+static float
+    _TG_ATTRS
+    __tg_fmax(float __x, float __y) {return fmaxf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmax(double __x, double __y) {return fmax(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmax(long double __x, long double __y) {return fmaxl(__x, __y);}
+
+#undef fmax
+#define fmax(__x, __y) __tg_fmax(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmin
+
+static float
+    _TG_ATTRS
+    __tg_fmin(float __x, float __y) {return fminf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmin(double __x, double __y) {return fmin(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmin(long double __x, long double __y) {return fminl(__x, __y);}
+
+#undef fmin
+#define fmin(__x, __y) __tg_fmin(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmod
+
+static float
+    _TG_ATTRS
+    __tg_fmod(float __x, float __y) {return fmodf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmod(double __x, double __y) {return fmod(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmod(long double __x, long double __y) {return fmodl(__x, __y);}
+
+#undef fmod
+#define fmod(__x, __y) __tg_fmod(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// frexp
+
+static float
+    _TG_ATTRS
+    __tg_frexp(float __x, int* __y) {return frexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_frexp(double __x, int* __y) {return frexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_frexp(long double __x, int* __y) {return frexpl(__x, __y);}
+
+#undef frexp
+#define frexp(__x, __y) __tg_frexp(__tg_promote1((__x))(__x), __y)
+
+// hypot
+
+static float
+    _TG_ATTRS
+    __tg_hypot(float __x, float __y) {return hypotf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_hypot(double __x, double __y) {return hypot(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_hypot(long double __x, long double __y) {return hypotl(__x, __y);}
+
+#undef hypot
+#define hypot(__x, __y) __tg_hypot(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// ilogb
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(float __x) {return ilogbf(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(double __x) {return ilogb(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(long double __x) {return ilogbl(__x);}
+
+#undef ilogb
+#define ilogb(__x) __tg_ilogb(__tg_promote1((__x))(__x))
+
+// ldexp
+
+static float
+    _TG_ATTRS
+    __tg_ldexp(float __x, int __y) {return ldexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_ldexp(double __x, int __y) {return ldexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_ldexp(long double __x, int __y) {return ldexpl(__x, __y);}
+
+#undef ldexp
+#define ldexp(__x, __y) __tg_ldexp(__tg_promote1((__x))(__x), __y)
+
+// lgamma
+
+static float
+    _TG_ATTRS
+    __tg_lgamma(float __x) {return lgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_lgamma(double __x) {return lgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_lgamma(long double __x) {return lgammal(__x);}
+
+#undef lgamma
+#define lgamma(__x) __tg_lgamma(__tg_promote1((__x))(__x))
+
+// llrint
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(float __x) {return llrintf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(double __x) {return llrint(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(long double __x) {return llrintl(__x);}
+
+#undef llrint
+#define llrint(__x) __tg_llrint(__tg_promote1((__x))(__x))
+
+// llround
+
+static long long
+    _TG_ATTRS
+    __tg_llround(float __x) {return llroundf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(double __x) {return llround(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(long double __x) {return llroundl(__x);}
+
+#undef llround
+#define llround(__x) __tg_llround(__tg_promote1((__x))(__x))
+
+// log10
+
+static float
+    _TG_ATTRS
+    __tg_log10(float __x) {return log10f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log10(double __x) {return log10(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log10(long double __x) {return log10l(__x);}
+
+#undef log10
+#define log10(__x) __tg_log10(__tg_promote1((__x))(__x))
+
+// log1p
+
+static float
+    _TG_ATTRS
+    __tg_log1p(float __x) {return log1pf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log1p(double __x) {return log1p(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log1p(long double __x) {return log1pl(__x);}
+
+#undef log1p
+#define log1p(__x) __tg_log1p(__tg_promote1((__x))(__x))
+
+// log2
+
+static float
+    _TG_ATTRS
+    __tg_log2(float __x) {return log2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log2(double __x) {return log2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log2(long double __x) {return log2l(__x);}
+
+#undef log2
+#define log2(__x) __tg_log2(__tg_promote1((__x))(__x))
+
+// logb
+
+static float
+    _TG_ATTRS
+    __tg_logb(float __x) {return logbf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_logb(double __x) {return logb(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_logb(long double __x) {return logbl(__x);}
+
+#undef logb
+#define logb(__x) __tg_logb(__tg_promote1((__x))(__x))
+
+// lrint
+
+static long
+    _TG_ATTRS
+    __tg_lrint(float __x) {return lrintf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(double __x) {return lrint(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(long double __x) {return lrintl(__x);}
+
+#undef lrint
+#define lrint(__x) __tg_lrint(__tg_promote1((__x))(__x))
+
+// lround
+
+static long
+    _TG_ATTRS
+    __tg_lround(float __x) {return lroundf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(double __x) {return lround(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(long double __x) {return lroundl(__x);}
+
+#undef lround
+#define lround(__x) __tg_lround(__tg_promote1((__x))(__x))
+
+// nearbyint
+
+static float
+    _TG_ATTRS
+    __tg_nearbyint(float __x) {return nearbyintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_nearbyint(double __x) {return nearbyint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_nearbyint(long double __x) {return nearbyintl(__x);}
+
+#undef nearbyint
+#define nearbyint(__x) __tg_nearbyint(__tg_promote1((__x))(__x))
+
+// nextafter
+
+static float
+    _TG_ATTRS
+    __tg_nextafter(float __x, float __y) {return nextafterf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nextafter(double __x, double __y) {return nextafter(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nextafter(long double __x, long double __y) {return nextafterl(__x, __y);}
+
+#undef nextafter
+#define nextafter(__x, __y) __tg_nextafter(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// nexttoward
+
+static float
+    _TG_ATTRS
+    __tg_nexttoward(float __x, long double __y) {return nexttowardf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nexttoward(double __x, long double __y) {return nexttoward(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nexttoward(long double __x, long double __y) {return nexttowardl(__x, __y);}
+
+#undef nexttoward
+#define nexttoward(__x, __y) __tg_nexttoward(__tg_promote1((__x))(__x), (__y))
+
+// remainder
+
+static float
+    _TG_ATTRS
+    __tg_remainder(float __x, float __y) {return remainderf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_remainder(double __x, double __y) {return remainder(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_remainder(long double __x, long double __y) {return remainderl(__x, __y);}
+
+#undef remainder
+#define remainder(__x, __y) __tg_remainder(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// remquo
+
+static float
+    _TG_ATTRS
+    __tg_remquo(float __x, float __y, int* __z)
+    {return remquof(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_remquo(double __x, double __y, int* __z)
+    {return remquo(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_remquo(long double __x,long double __y, int* __z)
+    {return remquol(__x, __y, __z);}
+
+#undef remquo
+#define remquo(__x, __y, __z)                         \
+        __tg_remquo(__tg_promote2((__x), (__y))(__x), \
+                    __tg_promote2((__x), (__y))(__y), \
+                    (__z))
+
+// rint
+
+static float
+    _TG_ATTRS
+    __tg_rint(float __x) {return rintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_rint(double __x) {return rint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_rint(long double __x) {return rintl(__x);}
+
+#undef rint
+#define rint(__x) __tg_rint(__tg_promote1((__x))(__x))
+
+// round
+
+static float
+    _TG_ATTRS
+    __tg_round(float __x) {return roundf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_round(double __x) {return round(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_round(long double __x) {return roundl(__x);}
+
+#undef round
+#define round(__x) __tg_round(__tg_promote1((__x))(__x))
+
+// scalbn
+
+static float
+    _TG_ATTRS
+    __tg_scalbn(float __x, int __y) {return scalbnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbn(double __x, int __y) {return scalbn(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbn(long double __x, int __y) {return scalbnl(__x, __y);}
+
+#undef scalbn
+#define scalbn(__x, __y) __tg_scalbn(__tg_promote1((__x))(__x), __y)
+
+// scalbln
+
+static float
+    _TG_ATTRS
+    __tg_scalbln(float __x, long __y) {return scalblnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbln(double __x, long __y) {return scalbln(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbln(long double __x, long __y) {return scalblnl(__x, __y);}
+
+#undef scalbln
+#define scalbln(__x, __y) __tg_scalbln(__tg_promote1((__x))(__x), __y)
+
+// tgamma
+
+static float
+    _TG_ATTRS
+    __tg_tgamma(float __x) {return tgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tgamma(double __x) {return tgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tgamma(long double __x) {return tgammal(__x);}
+
+#undef tgamma
+#define tgamma(__x) __tg_tgamma(__tg_promote1((__x))(__x))
+
+// trunc
+
+static float
+    _TG_ATTRS
+    __tg_trunc(float __x) {return truncf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_trunc(double __x) {return trunc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_trunc(long double __x) {return truncl(__x);}
+
+#undef trunc
+#define trunc(__x) __tg_trunc(__tg_promote1((__x))(__x))
+
+// carg
+
+static float
+    _TG_ATTRS
+    __tg_carg(float __x) {return atan2f(0.F, __x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double __x) {return atan2(0., __x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double __x) {return atan2l(0.L, __x);}
+
+static float
+    _TG_ATTRS
+    __tg_carg(float _Complex __x) {return cargf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double _Complex __x) {return carg(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double _Complex __x) {return cargl(__x);}
+
+#undef carg
+#define carg(__x) __tg_carg(__tg_promote1((__x))(__x))
+
+// cimag
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float __x) {return 0;}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double __x) {return 0;}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double __x) {return 0;}
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float _Complex __x) {return cimagf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double _Complex __x) {return cimag(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double _Complex __x) {return cimagl(__x);}
+
+#undef cimag
+#define cimag(__x) __tg_cimag(__tg_promote1((__x))(__x))
+
+// conj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float __x) {return __x;}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double __x) {return __x;}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double __x) {return __x;}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float _Complex __x) {return conjf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double _Complex __x) {return conj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double _Complex __x) {return conjl(__x);}
+
+#undef conj
+#define conj(__x) __tg_conj(__tg_promote1((__x))(__x))
+
+// cproj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double __x) {return cprojl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float _Complex __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double _Complex __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double _Complex __x) {return cprojl(__x);}
+
+#undef cproj
+#define cproj(__x) __tg_cproj(__tg_promote1((__x))(__x))
+
+// creal
+
+static float
+    _TG_ATTRS
+    __tg_creal(float __x) {return __x;}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double __x) {return __x;}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double __x) {return __x;}
+
+static float
+    _TG_ATTRS
+    __tg_creal(float _Complex __x) {return crealf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double _Complex __x) {return creal(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double _Complex __x) {return creall(__x);}
+
+#undef creal
+#define creal(__x) __tg_creal(__tg_promote1((__x))(__x))
+
+#undef _TG_ATTRSp
+#undef _TG_ATTRS
+
+#endif /* __cplusplus */
+#endif /* __TGMATH_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/tmmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/tmmintrin.h
new file mode 100644
index 000000000..80664043a
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/tmmintrin.h
@@ -0,0 +1,773 @@
+/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __TMMINTRIN_H
+#define __TMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
+
+/// \brief Computes the absolute value of each of the packed 8-bit signed
+///    integers in the source operand and stores the 8-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSB instruction.
+///
+/// \param __a
+///    A 64-bit vector of [8 x i8].
+/// \returns A 64-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi8(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 8-bit signed
+///    integers in the source operand and stores the 8-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSB instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi8(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 16-bit signed
+///    integers in the source operand and stores the 16-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16].
+/// \returns A 64-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi16(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 16-bit signed
+///    integers in the source operand and stores the 16-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi16(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 32-bit signed
+///    integers in the source operand and stores the 32-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32].
+/// \returns A 64-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi32(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 32-bit signed
+///    integers in the source operand and stores the 32-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi32(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
+}
+
+/// \brief Concatenates the two 128-bit integer vector operands, and
+///    right-shifts the result by the number of bytes specified in the immediate
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c PALIGNR instruction.
+///
+/// \param a
+///    A 128-bit vector of [16 x i8] containing one of the source operands.
+/// \param b
+///    A 128-bit vector of [16 x i8] containing one of the source operands.
+/// \param n
+///    An immediate operand specifying how many bytes to right-shift the result.
+/// \returns A 128-bit integer vector containing the concatenated right-shifted
+///    value.
+#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
+  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
+                                     (__v16qi)(__m128i)(b), (n)); })
+
+/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
+///    the result by the number of bytes specified in the immediate operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c PALIGNR instruction.
+///
+/// \param a
+///    A 64-bit vector of [8 x i8] containing one of the source operands.
+/// \param b
+///    A 64-bit vector of [8 x i8] containing one of the source operands.
+/// \param n
+///    An immediate operand specifying how many bytes to right-shift the result.
+/// \returns A 64-bit integer vector containing the concatenated right-shifted
+///    value.
+#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
+  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
+///    operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadd_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
+///    operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadd_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
+///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
+///    sums of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadds_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
+///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
+///    sums of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadds_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
+///    of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
+///    of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
+///    of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsub_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
+///    of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsub_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
+///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+///    saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
+///    differences of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
+///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+///    saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
+///    differences of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+///    values contained in the first source operand and packed 8-bit signed
+///    integer values contained in the second source operand, adds pairs of
+///    contiguous products with signed saturation, and writes the 16-bit sums to
+///    the corresponding bits in the destination. For example, bits [7:0] of
+///    both operands are multiplied, bits [15:8] of both operands are
+///    multiplied, and the sum of both results is written to bits [15:0] of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the first source operand.
+/// \param __b
+///    A 128-bit integer vector containing the second source operand.
+/// \returns A 128-bit integer vector containing the sums of products of both
+///    operands: \n
+///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
+///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
+///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
+///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
+///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+///    values contained in the first source operand and packed 8-bit signed
+///    integer values contained in the second source operand, adds pairs of
+///    contiguous products with signed saturation, and writes the 16-bit sums to
+///    the corresponding bits in the destination. For example, bits [7:0] of
+///    both operands are multiplied, bits [15:8] of both operands are
+///    multiplied, and the sum of both results is written to bits [15:0] of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMADDUBSW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the first source operand.
+/// \param __b
+///    A 64-bit integer vector containing the second source operand.
+/// \returns A 64-bit integer vector containing the sums of products of both
+///    operands: \n
+///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_maddubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+///    products to the 18 most significant bits by right-shifting, rounds the
+///    truncated value by adding 1, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMULHRSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands.
+/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
+///    products of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhrs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+///    products to the 18 most significant bits by right-shifting, rounds the
+///    truncated value by adding 1, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMULHRSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands.
+/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
+///    products of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhrs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
+///    destination or clears 8-bit values in the destination, as specified by
+///    the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSHUFB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control bytes corresponding to
+///    positions in the destination:
+///    Bit 7: \n
+///    1: Clear the corresponding byte in the destination. \n
+///    0: Copy the selected source byte to the corresponding byte in the
+///    destination. \n
+///    Bits [6:4] Reserved.  \n
+///    Bits [3:0] select the source byte to be copied.
+/// \returns A 128-bit integer vector containing the copied or cleared values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shuffle_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
+///    destination or clears 8-bit values in the destination, as specified by
+///    the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSHUFB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control bytes corresponding to
+///    positions in the destination:
+///    Bit 7: \n
+///    1: Clear the corresponding byte in the destination. \n
+///    0: Copy the selected source byte to the corresponding byte in the
+///    destination. \n
+///    Bits [3:0] select the source byte to be copied.
+/// \returns A 64-bit integer vector containing the copied or cleared values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_shuffle_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief For each 8-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    byte in the second source is negative, calculate the two's complement of
+///    the corresponding byte in the first source, and write that value to the
+///    destination. If the byte in the second source is positive, copy the
+///    corresponding byte from the first source to the destination. If the byte
+///    in the second source is zero, clear the corresponding byte in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGNB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control bytes corresponding to
+///    positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    word in the second source is negative, calculate the two's complement of
+///    the corresponding word in the first source, and write that value to the
+///    destination. If the word in the second source is positive, copy the
+///    corresponding word from the first source to the destination. If the word
+///    in the second source is zero, clear the corresponding word in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGNW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control words corresponding to
+///    positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    doubleword in the second source is negative, calculate the two's
+///    complement of the corresponding word in the first source, and write that
+///    value to the destination. If the doubleword in the second source is
+///    positive, copy the corresponding word from the first source to the
+///    destination. If the doubleword in the second source is zero, clear the
+///    corresponding word in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGND instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control doublewords corresponding to
+///    positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief For each 8-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    byte in the second source is negative, calculate the two's complement of
+///    the corresponding byte in the first source, and write that value to the
+///    destination. If the byte in the second source is positive, copy the
+///    corresponding byte from the first source to the destination. If the byte
+///    in the second source is zero, clear the corresponding byte in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGNB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control bytes corresponding to
+///    positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    word in the second source is negative, calculate the two's complement of
+///    the corresponding word in the first source, and write that value to the
+///    destination. If the word in the second source is positive, copy the
+///    corresponding word from the first source to the destination. If the word
+///    in the second source is zero, clear the corresponding word in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGNW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control words corresponding to
+///    positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    doubleword in the second source is negative, calculate the two's
+///    complement of the corresponding doubleword in the first source, and
+///    write that value to the destination. If the doubleword in the second
+///    source is positive, copy the corresponding doubleword from the first
+///    source to the destination. If the doubleword in the second source is
+///    zero, clear the corresponding doubleword in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGND instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing two control doublewords corresponding
+///    to positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __TMMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/unwind.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/unwind.h
new file mode 100644
index 000000000..4f74a3478
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/unwind.h
@@ -0,0 +1,299 @@
+/*===---- unwind.h - Stack unwinding ----------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* See "Data Definitions for libgcc_s" in the Linux Standard Base.*/
+
+#ifndef __CLANG_UNWIND_H
+#define __CLANG_UNWIND_H
+
+#if defined(__APPLE__) && __has_include_next(<unwind.h>)
+/* Darwin (from 11.x on) provide an unwind.h. If that's available,
+ * use it. libunwind wraps some of its definitions in #ifdef _GNU_SOURCE,
+ * so define that around the include.*/
+# ifndef _GNU_SOURCE
+#  define _SHOULD_UNDEFINE_GNU_SOURCE
+#  define _GNU_SOURCE
+# endif
+// libunwind's unwind.h reflects the current visibility.  However, Mozilla
+// builds with -fvisibility=hidden and relies on gcc's unwind.h to reset the
+// visibility to default and export its contents.  gcc also allows users to
+// override its override by #defining HIDE_EXPORTS (but note, this only obeys
+// the user's -fvisibility setting; it doesn't hide any exports on its own).  We
+// imitate gcc's header here:
+# ifdef HIDE_EXPORTS
+#  include_next <unwind.h>
+# else
+#  pragma GCC visibility push(default)
+#  include_next <unwind.h>
+#  pragma GCC visibility pop
+# endif
+# ifdef _SHOULD_UNDEFINE_GNU_SOURCE
+#  undef _GNU_SOURCE
+#  undef _SHOULD_UNDEFINE_GNU_SOURCE
+# endif
+#else
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* It is a bit strange for a header to play with the visibility of the
+   symbols it declares, but this matches gcc's behavior and some programs
+   depend on it */
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility push(default)
+#endif
+
+typedef uintptr_t _Unwind_Word;
+typedef intptr_t _Unwind_Sword;
+typedef uintptr_t _Unwind_Ptr;
+typedef uintptr_t _Unwind_Internal_Ptr;
+typedef uint64_t _Unwind_Exception_Class;
+
+typedef intptr_t _sleb128_t;
+typedef uintptr_t _uleb128_t;
+
+struct _Unwind_Context;
+struct _Unwind_Exception;
+typedef enum {
+  _URC_NO_REASON = 0,
+#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
+    !defined(__ARM_DWARF_EH__)
+  _URC_OK = 0, /* used by ARM EHABI */
+#endif
+  _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
+
+  _URC_FATAL_PHASE2_ERROR = 2,
+  _URC_FATAL_PHASE1_ERROR = 3,
+  _URC_NORMAL_STOP = 4,
+
+  _URC_END_OF_STACK = 5,
+  _URC_HANDLER_FOUND = 6,
+  _URC_INSTALL_CONTEXT = 7,
+  _URC_CONTINUE_UNWIND = 8,
+#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
+    !defined(__ARM_DWARF_EH__)
+  _URC_FAILURE = 9 /* used by ARM EHABI */
+#endif
+} _Unwind_Reason_Code;
+
+typedef enum {
+  _UA_SEARCH_PHASE = 1,
+  _UA_CLEANUP_PHASE = 2,
+
+  _UA_HANDLER_FRAME = 4,
+  _UA_FORCE_UNWIND = 8,
+  _UA_END_OF_STACK = 16 /* gcc extension to C++ ABI */
+} _Unwind_Action;
+
+typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
+                                             struct _Unwind_Exception *);
+
+struct _Unwind_Exception {
+  _Unwind_Exception_Class exception_class;
+  _Unwind_Exception_Cleanup_Fn exception_cleanup;
+  _Unwind_Word private_1;
+  _Unwind_Word private_2;
+  /* The Itanium ABI requires that _Unwind_Exception objects are "double-word
+   * aligned".  GCC has interpreted this to mean "use the maximum useful
+   * alignment for the target"; so do we. */
+} __attribute__((__aligned__));
+
+typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
+                                               _Unwind_Exception_Class,
+                                               struct _Unwind_Exception *,
+                                               struct _Unwind_Context *,
+                                               void *);
+
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
+    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
+    struct _Unwind_Context *);
+typedef _Unwind_Personality_Fn __personality_routine;
+
+typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
+                                                void *);
+
+#if defined(__arm__) && !defined(__APPLE__)
+
+typedef enum {
+  _UVRSC_CORE = 0,        /* integer register */
+  _UVRSC_VFP = 1,         /* vfp */
+  _UVRSC_WMMXD = 3,       /* Intel WMMX data register */
+  _UVRSC_WMMXC = 4        /* Intel WMMX control register */
+} _Unwind_VRS_RegClass;
+
+typedef enum {
+  _UVRSD_UINT32 = 0,
+  _UVRSD_VFPX = 1,
+  _UVRSD_UINT64 = 3,
+  _UVRSD_FLOAT = 4,
+  _UVRSD_DOUBLE = 5
+} _Unwind_VRS_DataRepresentation;
+
+typedef enum {
+  _UVRSR_OK = 0,
+  _UVRSR_NOT_IMPLEMENTED = 1,
+  _UVRSR_FAILED = 2
+} _Unwind_VRS_Result;
+
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__)
+typedef uint32_t _Unwind_State;
+#define _US_VIRTUAL_UNWIND_FRAME  ((_Unwind_State)0)
+#define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1)
+#define _US_UNWIND_FRAME_RESUME   ((_Unwind_State)2)
+#define _US_ACTION_MASK           ((_Unwind_State)3)
+#define _US_FORCE_UNWIND          ((_Unwind_State)8)
+#endif
+
+_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+_Unwind_VRS_Result _Unwind_VRS_Set(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+static __inline__
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *__context, int __index) {
+  _Unwind_Word __value;
+  _Unwind_VRS_Get(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+  return __value;
+}
+
+static __inline__
+void _Unwind_SetGR(struct _Unwind_Context *__context, int __index,
+                   _Unwind_Word __value) {
+  _Unwind_VRS_Set(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+}
+
+static __inline__
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *__context) {
+  _Unwind_Word __ip = _Unwind_GetGR(__context, 15);
+  return __ip & ~(_Unwind_Word)(0x1); /* Remove thumb mode bit. */
+}
+
+static __inline__
+void _Unwind_SetIP(struct _Unwind_Context *__context, _Unwind_Word __value) {
+  _Unwind_Word __thumb_mode_bit = _Unwind_GetGR(__context, 15) & 0x1;
+  _Unwind_SetGR(__context, 15, __value | __thumb_mode_bit);
+}
+#else
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *, int);
+void _Unwind_SetGR(struct _Unwind_Context *, int, _Unwind_Word);
+
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *);
+void _Unwind_SetIP(struct _Unwind_Context *, _Unwind_Word);
+#endif
+
+
+_Unwind_Word _Unwind_GetIPInfo(struct _Unwind_Context *, int *);
+
+_Unwind_Word _Unwind_GetCFA(struct _Unwind_Context *);
+
+_Unwind_Word _Unwind_GetBSP(struct _Unwind_Context *);
+
+void *_Unwind_GetLanguageSpecificData(struct _Unwind_Context *);
+
+_Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
+
+/* DWARF EH functions; currently not available on Darwin/ARM */
+#if !defined(__APPLE__) || !defined(__arm__)
+
+_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
+                                         _Unwind_Stop_Fn, void *);
+void _Unwind_DeleteException(struct _Unwind_Exception *);
+void _Unwind_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+#endif
+
+_Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn, void *);
+
+/* setjmp(3)/longjmp(3) stuff */
+typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
+
+void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
+void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+                                              _Unwind_Stop_Fn, void *);
+void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+void *_Unwind_FindEnclosingFunction(void *);
+
+#ifdef __APPLE__
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+
+/* Darwin-specific functions */
+void __register_frame(const void *);
+void __deregister_frame(const void *);
+
+struct dwarf_eh_bases {
+  uintptr_t tbase;
+  uintptr_t dbase;
+  uintptr_t func;
+};
+void *_Unwind_Find_FDE(const void *, struct dwarf_eh_bases *);
+
+void __register_frame_info_bases(const void *, void *, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info(const void *, void *) __attribute__((__unavailable__));
+void __register_frame_info_table_bases(const void *, void*, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info_table(const void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_table(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info_bases(const void *)__attribute__((__unavailable__));
+
+#else
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *);
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *);
+
+#endif
+
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif /* __CLANG_UNWIND_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/vadefs.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/vadefs.h
new file mode 100644
index 000000000..7fe9a74e3
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/vadefs.h
@@ -0,0 +1,65 @@
+/* ===-------- vadefs.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we are aiming for MSVC compatibility. */
+#ifndef _MSC_VER
+#include_next <vadefs.h>
+#else
+
+#ifndef __clang_vadefs_h
+#define __clang_vadefs_h
+
+#include_next <vadefs.h>
+
+/* Override macros from vadefs.h with definitions that work with Clang. */
+#ifdef _crt_va_start
+#undef _crt_va_start
+#define _crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef _crt_va_end
+#undef _crt_va_end
+#define _crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef _crt_va_arg
+#undef _crt_va_arg
+#define _crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+/* VS 2015 switched to double underscore names, which is an improvement, but now
+ * we have to intercept those names too.
+ */
+#ifdef __crt_va_start
+#undef __crt_va_start
+#define __crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef __crt_va_end
+#undef __crt_va_end
+#define __crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef __crt_va_arg
+#undef __crt_va_arg
+#define __crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+#endif
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/varargs.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/varargs.h
new file mode 100644
index 000000000..b5477d0a6
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/varargs.h
@@ -0,0 +1,26 @@
+/*===---- varargs.h - Variable argument handling -------------------------------------===
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+*===-----------------------------------------------------------------------===
+*/
+#ifndef __VARARGS_H
+#define __VARARGS_H
+  #error "Please use <stdarg.h> instead of <varargs.h>"
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/vecintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/vecintrin.h
new file mode 100644
index 000000000..ca7acb473
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/vecintrin.h
@@ -0,0 +1,8946 @@
+/*===---- vecintrin.h - Vector intrinsics ----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if defined(__s390x__) && defined(__VEC__)
+
+#define __ATTRS_ai __attribute__((__always_inline__))
+#define __ATTRS_o __attribute__((__overloadable__))
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+#define __constant(PARM) \
+  __attribute__((__enable_if__ ((PARM) == (PARM), \
+     "argument must be a constant integer")))
+#define __constant_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH), \
+     "argument must be a constant integer from " #LOW " to " #HIGH)))
+#define __constant_pow2_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH) && \
+                                ((PARM) & ((PARM) - 1)) == 0, \
+     "argument must be a constant power of 2 from " #LOW " to " #HIGH)))
+
+/*-- __lcbb -----------------------------------------------------------------*/
+
+extern __ATTRS_o unsigned int
+__lcbb(const void *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define __lcbb(X, Y) ((__typeof__((__lcbb)((X), (Y)))) \
+  __builtin_s390_lcbb((X), __builtin_constant_p((Y))? \
+                           ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : 0) : 0))
+
+/*-- vec_extract ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai signed char
+vec_extract(vector signed char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector bool char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector unsigned char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai signed short
+vec_extract(vector signed short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector bool short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector unsigned short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai signed int
+vec_extract(vector signed int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector bool int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector unsigned int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai signed long long
+vec_extract(vector signed long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector bool long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector unsigned long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai double
+vec_extract(vector double __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+/*-- vec_insert -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert(signed char __scalar, vector signed char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector bool char __vec, int __index) {
+  vector unsigned char __newvec = (vector unsigned char)__vec;
+  __newvec[__index & 15] = (unsigned char)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector unsigned char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert(signed short __scalar, vector signed short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector bool short __vec, int __index) {
+  vector unsigned short __newvec = (vector unsigned short)__vec;
+  __newvec[__index & 7] = (unsigned short)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector unsigned short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert(signed int __scalar, vector signed int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector bool int __vec, int __index) {
+  vector unsigned int __newvec = (vector unsigned int)__vec;
+  __newvec[__index & 3] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector unsigned int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert(signed long long __scalar, vector signed long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector bool long long __vec,
+           int __index) {
+  vector unsigned long long __newvec = (vector unsigned long long)__vec;
+  __newvec[__index & 1] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector unsigned long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert(double __scalar, vector double __vec, int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_promote ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_promote(signed char __scalar, int __index) {
+  const vector signed char __zero = (vector signed char)0;
+  vector signed char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_promote(unsigned char __scalar, int __index) {
+  const vector unsigned char __zero = (vector unsigned char)0;
+  vector unsigned char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_promote(signed short __scalar, int __index) {
+  const vector signed short __zero = (vector signed short)0;
+  vector signed short __vec = __builtin_shufflevector(__zero, __zero,
+                                -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_promote(unsigned short __scalar, int __index) {
+  const vector unsigned short __zero = (vector unsigned short)0;
+  vector unsigned short __vec = __builtin_shufflevector(__zero, __zero,
+                                  -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_promote(signed int __scalar, int __index) {
+  const vector signed int __zero = (vector signed int)0;
+  vector signed int __vec = __builtin_shufflevector(__zero, __zero,
+                                                    -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_promote(unsigned int __scalar, int __index) {
+  const vector unsigned int __zero = (vector unsigned int)0;
+  vector unsigned int __vec = __builtin_shufflevector(__zero, __zero,
+                                                      -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_promote(signed long long __scalar, int __index) {
+  const vector signed long long __zero = (vector signed long long)0;
+  vector signed long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                          -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_promote(unsigned long long __scalar, int __index) {
+  const vector unsigned long long __zero = (vector unsigned long long)0;
+  vector unsigned long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                            -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_promote(double __scalar, int __index) {
+  const vector double __zero = (vector double)0;
+  vector double __vec = __builtin_shufflevector(__zero, __zero, -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_insert_and_zero ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert_and_zero(const signed char *__ptr) {
+  vector signed char __vec = (vector signed char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert_and_zero(const unsigned char *__ptr) {
+  vector unsigned char __vec = (vector unsigned char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert_and_zero(const signed short *__ptr) {
+  vector signed short __vec = (vector signed short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert_and_zero(const unsigned short *__ptr) {
+  vector unsigned short __vec = (vector unsigned short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert_and_zero(const signed int *__ptr) {
+  vector signed int __vec = (vector signed int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert_and_zero(const unsigned int *__ptr) {
+  vector unsigned int __vec = (vector unsigned int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert_and_zero(const signed long long *__ptr) {
+  vector signed long long __vec = (vector signed long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert_and_zero(const unsigned long long *__ptr) {
+  vector unsigned long long __vec = (vector unsigned long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert_and_zero(const double *__ptr) {
+  vector double __vec = (vector double)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+/*-- vec_perm ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_perm(vector signed char __a, vector signed char __b,
+         vector unsigned char __c) {
+  return (vector signed char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
+  return (vector unsigned char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_perm(vector bool char __a, vector bool char __b,
+         vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_perm(vector signed short __a, vector signed short __b,
+         vector unsigned char __c) {
+  return (vector signed short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c) {
+  return (vector unsigned short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_perm(vector bool short __a, vector bool short __b,
+         vector unsigned char __c) {
+  return (vector bool short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_perm(vector signed int __a, vector signed int __b,
+         vector unsigned char __c) {
+  return (vector signed int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_perm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned char __c) {
+  return (vector unsigned int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_perm(vector bool int __a, vector bool int __b,
+         vector unsigned char __c) {
+  return (vector bool int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c) {
+  return (vector signed long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c) {
+  return (vector unsigned long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+  return (vector bool long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_perm(vector double __a, vector double __b,
+         vector unsigned char __c) {
+  return (vector double)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+/*-- vec_permi --------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed long long
+vec_permi(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_permi(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector bool long long
+vec_permi(vector bool long long __a, vector bool long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_permi(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
+#define vec_permi(X, Y, Z) ((__typeof__((vec_permi)((X), (Y), (Z)))) \
+  __builtin_s390_vpdi((vector unsigned long long)(X), \
+                      (vector unsigned long long)(Y), \
+                      (((Z) & 2) << 1) | ((Z) & 1)))
+
+/*-- vec_sel ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b,
+        vector unsigned char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return ((vector bool char)__c & __b) | (~(vector bool char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector bool char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector unsigned char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector bool char __c) {
+  return ((vector unsigned char)__c & __b) | (~(vector unsigned char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector unsigned short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector bool short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b,
+        vector unsigned short __c) {
+  return ((vector bool short)__c & __b) | (~(vector bool short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector unsigned short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector bool short __c) {
+  return (((vector unsigned short)__c & __b) |
+          (~(vector unsigned short)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b,
+        vector unsigned int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b, vector bool int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
+  return ((vector bool int)__c & __b) | (~(vector bool int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector bool int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b,
+        vector unsigned int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
+  return ((vector unsigned int)__c & __b) | (~(vector unsigned int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector unsigned long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector bool long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector unsigned long long __c) {
+  return (((vector bool long long)__c & __b) |
+          (~(vector bool long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector bool long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector unsigned long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector bool long long __c) {
+  return (((vector unsigned long long)__c & __b) |
+          (~(vector unsigned long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
+  return (vector double)((__c & (vector unsigned long long)__b) |
+                         (~__c & (vector unsigned long long)__a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector bool long long __c) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  vector unsigned long long __cc = (vector unsigned long long)__c;
+  return (vector double)((__cc & __bc) | (~__cc & __ac));
+}
+
+/*-- vec_gather_element -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed int
+vec_gather_element(vector signed int __vec, vector unsigned int __offset,
+                   const signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const signed int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_gather_element(vector bool int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gather_element(vector unsigned int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_gather_element(vector signed long long __vec,
+                   vector unsigned long long __offset,
+                   const signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const signed long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_gather_element(vector bool long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gather_element(vector unsigned long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_gather_element(vector double __vec, vector unsigned long long __offset,
+                   const double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const double *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+/*-- vec_scatter_element ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed int __vec, vector unsigned int __offset,
+                    signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(signed int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed long long __vec,
+                    vector unsigned long long __offset,
+                    signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(signed long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector double __vec, vector unsigned long long __offset,
+                    double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(double *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+/*-- vec_xld2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xld2(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xld2(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xld2(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xld2(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xld2(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xld2(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_xld2(long __offset, const signed long long *__ptr) {
+  return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xld2(long __offset, const unsigned long long *__ptr) {
+  return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_xld2(long __offset, const double *__ptr) {
+  return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xlw4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xlw4(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xlw4(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xlw4(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xlw4(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xlw4(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xlw4(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xstd2 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed long long __vec, long __offset,
+          signed long long *__ptr) {
+  *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned long long __vec, long __offset,
+          unsigned long long *__ptr) {
+  *(vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset) =
+    __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector double __vec, long __offset, double *__ptr) {
+  *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_xstw4 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_load_bndry ---------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_load_bndry(const signed char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned char
+vec_load_bndry(const unsigned char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed short
+vec_load_bndry(const signed short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned short
+vec_load_bndry(const unsigned short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed int
+vec_load_bndry(const signed int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned int
+vec_load_bndry(const unsigned int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed long long
+vec_load_bndry(const signed long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned long long
+vec_load_bndry(const unsigned long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector double
+vec_load_bndry(const double *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define vec_load_bndry(X, Y) ((__typeof__((vec_load_bndry)((X), (Y)))) \
+  __builtin_s390_vlbb((X), ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : -1)))
+
+/*-- vec_load_len -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_load_len(const signed char *__ptr, unsigned int __len) {
+  return (vector signed char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_load_len(const unsigned char *__ptr, unsigned int __len) {
+  return (vector unsigned char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_load_len(const signed short *__ptr, unsigned int __len) {
+  return (vector signed short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_load_len(const unsigned short *__ptr, unsigned int __len) {
+  return (vector unsigned short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_load_len(const signed int *__ptr, unsigned int __len) {
+  return (vector signed int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_load_len(const unsigned int *__ptr, unsigned int __len) {
+  return (vector unsigned int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_len(const signed long long *__ptr, unsigned int __len) {
+  return (vector signed long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_len(const unsigned long long *__ptr, unsigned int __len) {
+  return (vector unsigned long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_load_len(const double *__ptr, unsigned int __len) {
+  return (vector double)__builtin_s390_vll(__len, __ptr);
+}
+
+/*-- vec_store_len ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed char __vec, signed char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned char __vec, unsigned char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed short __vec, signed short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned short __vec, unsigned short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed int __vec, signed int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned int __vec, unsigned int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed long long __vec, signed long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned long long __vec, unsigned long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector double __vec, double *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+/*-- vec_load_pair ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_pair(signed long long __a, signed long long __b) {
+  return (vector signed long long)(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_pair(unsigned long long __a, unsigned long long __b) {
+  return (vector unsigned long long)(__a, __b);
+}
+
+/*-- vec_genmask ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmask(unsigned short __mask)
+  __constant(__mask) {
+  return (vector unsigned char)(
+    __mask & 0x8000 ? 0xff : 0,
+    __mask & 0x4000 ? 0xff : 0,
+    __mask & 0x2000 ? 0xff : 0,
+    __mask & 0x1000 ? 0xff : 0,
+    __mask & 0x0800 ? 0xff : 0,
+    __mask & 0x0400 ? 0xff : 0,
+    __mask & 0x0200 ? 0xff : 0,
+    __mask & 0x0100 ? 0xff : 0,
+    __mask & 0x0080 ? 0xff : 0,
+    __mask & 0x0040 ? 0xff : 0,
+    __mask & 0x0020 ? 0xff : 0,
+    __mask & 0x0010 ? 0xff : 0,
+    __mask & 0x0008 ? 0xff : 0,
+    __mask & 0x0004 ? 0xff : 0,
+    __mask & 0x0002 ? 0xff : 0,
+    __mask & 0x0001 ? 0xff : 0);
+}
+
+/*-- vec_genmasks_* ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmasks_8(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 7;
+  unsigned char __bit2 = __last & 7;
+  unsigned char __mask1 = (unsigned char)(1U << (7 - __bit1) << 1) - 1;
+  unsigned char __mask2 = (unsigned char)(1U << (7 - __bit2)) - 1;
+  unsigned char __value = (__bit1 <= __bit2 ?
+                           __mask1 & ~__mask2 :
+                           __mask1 | ~__mask2);
+  return (vector unsigned char)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_genmasks_16(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 15;
+  unsigned char __bit2 = __last & 15;
+  unsigned short __mask1 = (unsigned short)(1U << (15 - __bit1) << 1) - 1;
+  unsigned short __mask2 = (unsigned short)(1U << (15 - __bit2)) - 1;
+  unsigned short __value = (__bit1 <= __bit2 ?
+                            __mask1 & ~__mask2 :
+                            __mask1 | ~__mask2);
+  return (vector unsigned short)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_genmasks_32(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 31;
+  unsigned char __bit2 = __last & 31;
+  unsigned int __mask1 = (1U << (31 - __bit1) << 1) - 1;
+  unsigned int __mask2 = (1U << (31 - __bit2)) - 1;
+  unsigned int __value = (__bit1 <= __bit2 ?
+                          __mask1 & ~__mask2 :
+                          __mask1 | ~__mask2);
+  return (vector unsigned int)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_genmasks_64(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 63;
+  unsigned char __bit2 = __last & 63;
+  unsigned long long __mask1 = (1ULL << (63 - __bit1) << 1) - 1;
+  unsigned long long __mask2 = (1ULL << (63 - __bit2)) - 1;
+  unsigned long long __value = (__bit1 <= __bit2 ?
+                                __mask1 & ~__mask2 :
+                                __mask1 | ~__mask2);
+  return (vector unsigned long long)__value;
+}
+
+/*-- vec_splat --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splat(vector signed char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector signed char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_splat(vector bool char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector bool char)(vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splat(vector unsigned char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splat(vector signed short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector signed short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_splat(vector bool short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector bool short)(vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splat(vector unsigned short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splat(vector signed int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector signed int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_splat(vector bool int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector bool int)(vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splat(vector unsigned int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splat(vector signed long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector signed long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_splat(vector bool long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector bool long long)(vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splat(vector unsigned long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splat(vector double __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector double)__vec[__index];
+}
+
+/*-- vec_splat_s* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector signed char
+vec_splat_s8(signed char __scalar)
+  __constant(__scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_ai vector signed short
+vec_splat_s16(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_ai vector signed int
+vec_splat_s32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector signed long long
+vec_splat_s64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed long long)(signed long)__scalar;
+}
+
+/*-- vec_splat_u* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_splat_u8(unsigned char __scalar)
+  __constant(__scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned short
+vec_splat_u16(unsigned short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned int
+vec_splat_u32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned long long
+vec_splat_u64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned long long)(signed long long)__scalar;
+}
+
+/*-- vec_splats -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splats(signed char __scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splats(unsigned char __scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splats(signed short __scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splats(unsigned short __scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splats(signed int __scalar) {
+  return (vector signed int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splats(unsigned int __scalar) {
+  return (vector unsigned int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splats(signed long long __scalar) {
+  return (vector signed long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splats(unsigned long long __scalar) {
+  return (vector unsigned long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splats(double __scalar) {
+  return (vector double)__scalar;
+}
+
+/*-- vec_extend_s64 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed char __a) {
+  return (vector signed long long)(__a[7], __a[15]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed short __a) {
+  return (vector signed long long)(__a[3], __a[7]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed int __a) {
+  return (vector signed long long)(__a[1], __a[3]);
+}
+
+/*-- vec_mergeh -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergeh(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergeh(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergeh(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergeh(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergeh(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergeh(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergeh(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergeh(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergeh(vector double __a, vector double __b) {
+  return (vector double)(__a[0], __b[0]);
+}
+
+/*-- vec_mergel -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergel(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergel(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergel(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergel(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergel(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergel(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergel(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergel(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergel(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergel(vector double __a, vector double __b) {
+  return (vector double)(__a[1], __b[1]);
+}
+
+/*-- vec_pack ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_pack(vector signed short __a, vector signed short __b) {
+  vector signed char __ac = (vector signed char)__a;
+  vector signed char __bc = (vector signed char)__b;
+  return (vector signed char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_pack(vector bool short __a, vector bool short __b) {
+  vector bool char __ac = (vector bool char)__a;
+  vector bool char __bc = (vector bool char)__b;
+  return (vector bool char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_pack(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return (vector unsigned char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_pack(vector signed int __a, vector signed int __b) {
+  vector signed short __ac = (vector signed short)__a;
+  vector signed short __bc = (vector signed short)__b;
+  return (vector signed short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_pack(vector bool int __a, vector bool int __b) {
+  vector bool short __ac = (vector bool short)__a;
+  vector bool short __bc = (vector bool short)__b;
+  return (vector bool short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_pack(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return (vector unsigned short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_pack(vector signed long long __a, vector signed long long __b) {
+  vector signed int __ac = (vector signed int)__a;
+  vector signed int __bc = (vector signed int)__b;
+  return (vector signed int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_pack(vector bool long long __a, vector bool long long __b) {
+  vector bool int __ac = (vector bool int)__a;
+  vector bool int __bc = (vector bool int)__b;
+  return (vector bool int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return (vector unsigned int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+/*-- vec_packs --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vpksh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vpksf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vpksg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packs_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return __builtin_s390_vpkshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return __builtin_s390_vpksfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs_cc(vector signed long long __a, vector signed long long __b,
+             int *__cc) {
+  return __builtin_s390_vpksgs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs_cc(vector unsigned long long __a, vector unsigned long long __b,
+             int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_packsu -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector signed short __a, vector signed short __b) {
+  const vector signed short __zero = (vector signed short)0;
+  return __builtin_s390_vpklsh(
+    (vector unsigned short)(__a >= __zero) & (vector unsigned short)__a,
+    (vector unsigned short)(__b >= __zero) & (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector signed int __a, vector signed int __b) {
+  const vector signed int __zero = (vector signed int)0;
+  return __builtin_s390_vpklsf(
+    (vector unsigned int)(__a >= __zero) & (vector unsigned int)__a,
+    (vector unsigned int)(__b >= __zero) & (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector signed long long __a, vector signed long long __b) {
+  const vector signed long long __zero = (vector signed long long)0;
+  return __builtin_s390_vpklsg(
+    (vector unsigned long long)(__a >= __zero) &
+    (vector unsigned long long)__a,
+    (vector unsigned long long)(__b >= __zero) &
+    (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packsu_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu_cc(vector unsigned long long __a, vector unsigned long long __b,
+              int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_unpackh ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackh(vector signed char __a) {
+  return __builtin_s390_vuphb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackh(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuphb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackh(vector unsigned char __a) {
+  return __builtin_s390_vuplhb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackh(vector signed short __a) {
+  return __builtin_s390_vuphh(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackh(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuphh((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackh(vector unsigned short __a) {
+  return __builtin_s390_vuplhh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackh(vector signed int __a) {
+  return __builtin_s390_vuphf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackh(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuphf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackh(vector unsigned int __a) {
+  return __builtin_s390_vuplhf(__a);
+}
+
+/*-- vec_unpackl ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackl(vector signed char __a) {
+  return __builtin_s390_vuplb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackl(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuplb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackl(vector unsigned char __a) {
+  return __builtin_s390_vupllb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackl(vector signed short __a) {
+  return __builtin_s390_vuplhw(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackl(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuplhw((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackl(vector unsigned short __a) {
+  return __builtin_s390_vupllh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackl(vector signed int __a) {
+  return __builtin_s390_vuplf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackl(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuplf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackl(vector unsigned int __a) {
+  return __builtin_s390_vupllf(__a);
+}
+
+/*-- vec_cmpeq --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+/*-- vec_cmpge --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+/*-- vec_cmpgt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+/*-- vec_cmple --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector double __a, vector double __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+/*-- vec_cmplt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+/*-- vec_all_eq -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ge -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_le -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_nge ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nan ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_any_eq -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ge -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_le -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_nge ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nan ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 3;
+}
+
+/*-- vec_any_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_andc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_andc(vector bool char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector bool char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector bool char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_andc(vector bool short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector bool short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector bool short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_andc(vector bool int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector bool int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector bool int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_andc(vector bool long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector bool long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector bool long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector bool long long __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+/*-- vec_nor ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_nor(vector bool char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector bool char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector bool char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_nor(vector bool short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector bool short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector bool short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_nor(vector bool int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector bool int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector bool int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_nor(vector bool long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector bool long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector bool long long __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector bool long long __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+/*-- vec_cntlz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector signed char __a) {
+  return __builtin_s390_vclzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector unsigned char __a) {
+  return __builtin_s390_vclzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector signed short __a) {
+  return __builtin_s390_vclzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector unsigned short __a) {
+  return __builtin_s390_vclzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector signed int __a) {
+  return __builtin_s390_vclzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector unsigned int __a) {
+  return __builtin_s390_vclzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector signed long long __a) {
+  return __builtin_s390_vclzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_s390_vclzg(__a);
+}
+
+/*-- vec_cnttz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector signed char __a) {
+  return __builtin_s390_vctzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector unsigned char __a) {
+  return __builtin_s390_vctzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector signed short __a) {
+  return __builtin_s390_vctzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector unsigned short __a) {
+  return __builtin_s390_vctzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector signed int __a) {
+  return __builtin_s390_vctzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector unsigned int __a) {
+  return __builtin_s390_vctzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector signed long long __a) {
+  return __builtin_s390_vctzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector unsigned long long __a) {
+  return __builtin_s390_vctzg(__a);
+}
+
+/*-- vec_popcnt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector signed char __a) {
+  return __builtin_s390_vpopctb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector unsigned char __a) {
+  return __builtin_s390_vpopctb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector signed short __a) {
+  return __builtin_s390_vpopcth((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector unsigned short __a) {
+  return __builtin_s390_vpopcth(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector signed int __a) {
+  return __builtin_s390_vpopctf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector unsigned int __a) {
+  return __builtin_s390_vpopctf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector signed long long __a) {
+  return __builtin_s390_vpopctg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector unsigned long long __a) {
+  return __builtin_s390_vpopctg(__a);
+}
+
+/*-- vec_rl -----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_verllvb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_verllvb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_verllvh(
+    (vector unsigned short)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rl(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_verllvh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_verllvf(
+    (vector unsigned int)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rl(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_verllvf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rl(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_verllvg(
+    (vector unsigned long long)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_verllvg(__a, __b);
+}
+
+/*-- vec_rli ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rli(vector signed char __a, unsigned long __b) {
+  return (vector signed char)__builtin_s390_verllb(
+    (vector unsigned char)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rli(vector unsigned char __a, unsigned long __b) {
+  return __builtin_s390_verllb(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rli(vector signed short __a, unsigned long __b) {
+  return (vector signed short)__builtin_s390_verllh(
+    (vector unsigned short)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rli(vector unsigned short __a, unsigned long __b) {
+  return __builtin_s390_verllh(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rli(vector signed int __a, unsigned long __b) {
+  return (vector signed int)__builtin_s390_verllf(
+    (vector unsigned int)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rli(vector unsigned int __a, unsigned long __b) {
+  return __builtin_s390_verllf(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rli(vector signed long long __a, unsigned long __b) {
+  return (vector signed long long)__builtin_s390_verllg(
+    (vector unsigned long long)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rli(vector unsigned long long __a, unsigned long __b) {
+  return __builtin_s390_verllg(__a, (int)__b);
+}
+
+/*-- vec_rl_mask ------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_rl_mask(vector signed char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned char
+vec_rl_mask(vector unsigned char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed short
+vec_rl_mask(vector signed short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned short
+vec_rl_mask(vector unsigned short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed int
+vec_rl_mask(vector signed int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned int
+vec_rl_mask(vector unsigned int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed long long
+vec_rl_mask(vector signed long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned long long
+vec_rl_mask(vector unsigned long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
+#define vec_rl_mask(X, Y, Z) ((__typeof__((vec_rl_mask)((X), (Y), (Z)))) \
+  __extension__ ({ \
+    vector unsigned char __res; \
+    vector unsigned char __x = (vector unsigned char)(X); \
+    vector unsigned char __y = (vector unsigned char)(Y); \
+    switch (sizeof ((X)[0])) { \
+    case 1: __res = (vector unsigned char) __builtin_s390_verimb( \
+             (vector unsigned char)__x, (vector unsigned char)__x, \
+             (vector unsigned char)__y, (Z)); break; \
+    case 2: __res = (vector unsigned char) __builtin_s390_verimh( \
+             (vector unsigned short)__x, (vector unsigned short)__x, \
+             (vector unsigned short)__y, (Z)); break; \
+    case 4: __res = (vector unsigned char) __builtin_s390_verimf( \
+             (vector unsigned int)__x, (vector unsigned int)__x, \
+             (vector unsigned int)__y, (Z)); break; \
+    default: __res = (vector unsigned char) __builtin_s390_verimg( \
+             (vector unsigned long long)__x, (vector unsigned long long)__x, \
+             (vector unsigned long long)__y, (Z)); break; \
+    } __res; }))
+
+/*-- vec_sll ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_slb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vslb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vslb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_sld ----------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sld(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned char
+vec_sld(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed short
+vec_sld(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned short
+vec_sld(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed int
+vec_sld(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned int
+vec_sld(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed long long
+vec_sld(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned long long
+vec_sld(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector double
+vec_sld(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 15);
+
+#define vec_sld(X, Y, Z) ((__typeof__((vec_sld)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z)))
+
+/*-- vec_sldw ---------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sldw(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned char
+vec_sldw(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed short
+vec_sldw(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned short
+vec_sldw(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed int
+vec_sldw(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned int
+vec_sldw(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed long long
+vec_sldw(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_sldw(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
+#define vec_sldw(X, Y, Z) ((__typeof__((vec_sldw)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z) * 4))
+
+/*-- vec_sral ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsra(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srab ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrab(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrab(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srl ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrlb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrlb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_abs ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_abs(vector signed char __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed char)0));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_abs(vector signed short __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed short)0));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_abs(vector signed int __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed int)0));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_abs(vector signed long long __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed long long)0));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_abs(vector double __a) {
+  return __builtin_s390_vflpdb(__a);
+}
+
+/*-- vec_nabs ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_nabs(vector double __a) {
+  return __builtin_s390_vflndb(__a);
+}
+
+/*-- vec_max ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector signed char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector signed short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector signed int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_max(vector double __a, vector double __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+/*-- vec_min ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector signed char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector signed short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector signed int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_min(vector double __a, vector double __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+/*-- vec_add_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_add_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaq(__a, __b);
+}
+
+/*-- vec_addc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_addc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_addc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vacch(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_addc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vaccf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_addc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vaccg(__a, __b);
+}
+
+/*-- vec_addc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccq(__a, __b);
+}
+
+/*-- vec_adde_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_adde_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vacq(__a, __b, __c);
+}
+
+/*-- vec_addec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vacccq(__a, __b, __c);
+}
+
+/*-- vec_avg ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_avg(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vavgb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_avg(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vavgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_avg(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vavgf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_avg(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vavgg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_avg(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vavglb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_avg(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vavglh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_avg(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vavglf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_avg(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vavglg(__a, __b);
+}
+
+/*-- vec_checksum -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned int
+vec_checksum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vcksm(__a, __b);
+}
+
+/*-- vec_gfmsum -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vgfmb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vgfmh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vgfmf(__a, __b);
+}
+
+/*-- vec_gfmsum_128 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vgfmg(__a, __b);
+}
+
+/*-- vec_gfmsum_accum -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum_accum(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned short __c) {
+  return __builtin_s390_vgfmab(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum_accum(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned int __c) {
+  return __builtin_s390_vgfmah(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum_accum(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned long long __c) {
+  return __builtin_s390_vgfmaf(__a, __b, __c);
+}
+
+/*-- vec_gfmsum_accum_128 ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_accum_128(vector unsigned long long __a,
+                     vector unsigned long long __b,
+                     vector unsigned char __c) {
+  return __builtin_s390_vgfmag(__a, __b, __c);
+}
+
+/*-- vec_mladd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector unsigned char __a, vector signed char __b,
+          vector signed char __c) {
+  return (vector signed char)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * (vector signed char)__b + (vector signed char)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mladd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector unsigned short __a, vector signed short __b,
+          vector signed short __c) {
+  return (vector signed short)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * (vector signed short)__b + (vector signed short)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mladd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector unsigned int __a, vector signed int __b,
+          vector signed int __c) {
+  return (vector signed int)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * (vector signed int)__b + (vector signed int)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mladd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * __b + __c;
+}
+
+/*-- vec_mhadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mhadd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __builtin_s390_vmahb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mhadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __builtin_s390_vmalhb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mhadd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __builtin_s390_vmahh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mhadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalhh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mhadd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __builtin_s390_vmahf(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mhadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmalhf(__a, __b, __c);
+}
+
+/*-- vec_meadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_meadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaeb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_meadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmaleb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_meadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaeh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_meadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaleh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_meadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaef(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_meadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalef(__a, __b, __c);
+}
+
+/*-- vec_moadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_moadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_moadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_moadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaoh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_moadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaloh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_moadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaof(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_moadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalof(__a, __b, __c);
+}
+
+/*-- vec_mulh ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mulh(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mulh(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulh(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmlhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulh(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmhf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulh(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlhf(__a, __b);
+}
+
+/*-- vec_mule ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mule(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mule(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmleb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mule(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mule(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmleh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mule(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmef(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mule(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlef(__a, __b);
+}
+
+/*-- vec_mulo ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulo(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulo(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulo(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmoh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulo(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmloh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mulo(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmof(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mulo(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlof(__a, __b);
+}
+
+/*-- vec_sub_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sub_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsq(__a, __b);
+}
+
+/*-- vec_subc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_subc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbib(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_subc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vscbih(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_subc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vscbif(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_subc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vscbig(__a, __b);
+}
+
+/*-- vec_subc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbiq(__a, __b);
+}
+
+/*-- vec_sube_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sube_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vsbiq(__a, __b, __c);
+}
+
+/*-- vec_subec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vsbcbiq(__a, __b, __c);
+}
+
+/*-- vec_sum2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumgf(__a, __b);
+}
+
+/*-- vec_sum_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumqf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vsumqg(__a, __b);
+}
+
+/*-- vec_sum4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsumb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumh(__a, __b);
+}
+
+/*-- vec_test_mask ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm(__a, __b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector double __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+/*-- vec_madd ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_madd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmadb(__a, __b, __c);
+}
+
+/*-- vec_msub ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_msub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmsdb(__a, __b, __c);
+}
+
+/*-- vec_sqrt ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_sqrt(vector double __a) {
+  return __builtin_s390_vfsqdb(__a);
+}
+
+/*-- vec_ld2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ld2f(const float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  return __builtin_convertvector(*(const __v2f32 *)__ptr, vector double);
+}
+
+/*-- vec_st2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai void
+vec_st2f(vector double __a, float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  *(__v2f32 *)__ptr = __builtin_convertvector(__a, __v2f32);
+}
+
+/*-- vec_ctd ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector signed long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector unsigned long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+/*-- vec_ctsl ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_ctsl(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+/*-- vec_ctul ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_ctul(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+
+/*-- vec_roundp -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundp(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_ceil ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ceil(vector double __a) {
+  // On this platform, vec_ceil never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_roundm -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundm(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_floor --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_floor(vector double __a) {
+  // On this platform, vec_floor never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_roundz -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundz(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_trunc --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_trunc(vector double __a) {
+  // On this platform, vec_trunc never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_roundc -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundc(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 0);
+}
+
+/*-- vec_round --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_round(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 4);
+}
+
+/*-- vec_fp_test_data_class -------------------------------------------------*/
+
+#define vec_fp_test_data_class(X, Y, Z) \
+  ((vector bool long long)__builtin_s390_vftcidb((X), (Y), (Z)))
+
+/*-- vec_cp_until_zero ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero(vector signed char __a) {
+  return (vector signed char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero(vector bool char __a) {
+  return (vector bool char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero(vector unsigned char __a) {
+  return __builtin_s390_vistrb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero(vector signed short __a) {
+  return (vector signed short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero(vector bool short __a) {
+  return (vector bool short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero(vector unsigned short __a) {
+  return __builtin_s390_vistrh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero(vector signed int __a) {
+  return (vector signed int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero(vector bool int __a) {
+  return (vector bool int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero(vector unsigned int __a) {
+  return __builtin_s390_vistrf(__a);
+}
+
+/*-- vec_cp_until_zero_cc ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero_cc(vector signed char __a, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero_cc(vector bool char __a, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero_cc(vector unsigned char __a, int *__cc) {
+  return __builtin_s390_vistrbs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero_cc(vector signed short __a, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero_cc(vector bool short __a, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero_cc(vector unsigned short __a, int *__cc) {
+  return __builtin_s390_vistrhs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero_cc(vector signed int __a, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vistrfs((vector unsigned int)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero_cc(vector bool int __a, int *__cc) {
+  return (vector bool int)__builtin_s390_vistrfs((vector unsigned int)__a,
+                                                 __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero_cc(vector unsigned int __a, int *__cc) {
+  return __builtin_s390_vistrfs(__a, __cc);
+}
+
+/*-- vec_cmpeq_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeeb((vector unsigned char)__a,
+                         (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeeb((vector unsigned char)__a,
+                              (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeeh((vector unsigned short)__a,
+                         (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeeh((vector unsigned short)__a,
+                              (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeef((vector unsigned int)__a,
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeef((vector unsigned int)__a,
+                              (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeef(__a, __b);
+}
+
+/*-- vec_cmpeq_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfeebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfeehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfeefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpeq_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeezb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeezb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeezh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeezh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeezf((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeezf((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeezf(__a, __b);
+}
+
+/*-- vec_cmpeq_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezfs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeneb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeneb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeneb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeneh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeneh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeneh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenef((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenef((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenef(__a, __b);
+}
+
+/*-- vec_cmpne_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenebs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenebs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfenebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenehs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenehs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfenehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenefs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenefs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfenefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfenezb((vector unsigned char)__a,
+                           (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfenezb((vector unsigned char)__a,
+                                (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfenezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfenezh((vector unsigned short)__a,
+                           (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfenezh((vector unsigned short)__a,
+                                (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfenezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenezf((vector unsigned int)__a,
+                           (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenezf((vector unsigned int)__a,
+                                (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenezf(__a, __b);
+}
+
+/*-- vec_cmpne_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenezbs((vector unsigned char)__a,
+                            (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenezbs((vector unsigned char)__a,
+                                 (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenezhs((vector unsigned short)__a,
+                            (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenezhs((vector unsigned short)__a,
+                                 (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenezfs((vector unsigned int)__a,
+                            (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenezfs((vector unsigned int)__a,
+                                 (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezfs(__a, __b, __cc);
+}
+
+/*-- vec_cmprg --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 4);
+}
+
+/*-- vec_cmprg_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg_cc(vector unsigned char __a, vector unsigned char __b,
+             vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg_cc(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg_cc(vector unsigned int __a, vector unsigned int __b,
+             vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 4, __cc);
+}
+
+/*-- vec_cmprg_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmprg_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                   vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                   vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                   vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmpnrg -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg(vector unsigned char __a, vector unsigned char __b,
+           vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg(vector unsigned short __a, vector unsigned short __b,
+           vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg(vector unsigned int __a, vector unsigned int __b,
+           vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 12);
+}
+
+/*-- vec_cmpnrg_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg_cc(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg_cc(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg_cc(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 12, __cc);
+}
+
+/*-- vec_cmpnrg_idx ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx(vector unsigned short __a, vector unsigned short __b,
+               vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx(vector unsigned int __a, vector unsigned int __b,
+               vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_idx_cc ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                  vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                  vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                  vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 8, __cc);
+}
+
+/*-- vec_cmpnrg_or_0_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                    vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                    vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                    vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_or_0_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 8, __cc);
+}
+
+/*-- vec_find_any_eq --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 4);
+}
+
+/*-- vec_find_any_eq_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 4, __cc);
+}
+
+/*-- vec_find_any_eq_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_eq_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_ne --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 12);
+}
+
+/*-- vec_find_any_ne_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 12, __cc);
+}
+
+/*-- vec_find_any_ne_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 8, __cc);
+}
+
+/*-- vec_find_any_ne_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 8, __cc);
+}
+
+#undef __constant_pow2_range
+#undef __constant_range
+#undef __constant
+#undef __ATTRS_o
+#undef __ATTRS_o_ai
+#undef __ATTRS_ai
+
+#else
+
+#error "Use -fzvector to enable vector extensions"
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/wmmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/wmmintrin.h
new file mode 100644
index 000000000..a2d931010
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/wmmintrin.h
@@ -0,0 +1,33 @@
+/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _WMMINTRIN_H
+#define _WMMINTRIN_H
+
+#include <emmintrin.h>
+
+#include <__wmmintrin_aes.h>
+
+#include <__wmmintrin_pclmul.h>
+
+#endif /* _WMMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/x86intrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/x86intrin.h
new file mode 100644
index 000000000..81a404f55
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/x86intrin.h
@@ -0,0 +1,85 @@
+/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#define __X86INTRIN_H
+
+#include <ia32intrin.h>
+
+#include <immintrin.h>
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__3dNOW__)
+#include <mm3dnow.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
+#include <bmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
+#include <bmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
+#include <lzcntintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
+#include <popcntintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
+#include <rdseedintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
+#include <prfchwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE4A__)
+#include <ammintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA4__)
+#include <fma4intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XOP__)
+#include <xopintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__TBM__)
+#include <tbmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
+#include <f16cintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
+#include <mwaitxintrin.h>
+#endif
+
+/* FIXME: LWP */
+
+#endif /* __X86INTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xmmintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xmmintrin.h
new file mode 100644
index 000000000..dc31b85cf
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xmmintrin.h
@@ -0,0 +1,2972 @@
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __XMMINTRIN_H
+#define __XMMINTRIN_H
+
+#include <mmintrin.h>
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef float __m128 __attribute__((__vector_size__(16)));
+
+/* Unsigned types */
+typedef unsigned int __v4su __attribute__((__vector_size__(16)));
+
+/* This header should only be included in a hosted environment as it depends on
+ * a standard library to provide allocation routines. */
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
+
+/// \brief Adds the 32-bit float values in the low-order bits of the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
+///    of the lower 32 bits of both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ss(__m128 __a, __m128 __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
+///    the addition.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the sums of both
+///    operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4sf)__a + (__v4sf)__b);
+}
+
+/// \brief Subtracts the 32-bit float value in the low-order bits of the second
+///    operand from the corresponding value in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
+///    of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
+///    bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    difference of the lower 32 bits of both operands. The upper 96 bits are
+///    copied from the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ss(__m128 __a, __m128 __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+/// \brief Subtracts each of the values of the second operand from the first
+///    operand, both of which are 128-bit vectors of [4 x float] and returns
+///    the results of the subtraction.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the differences between
+///    both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4sf)__a - (__v4sf)__b);
+}
+
+/// \brief Multiplies two 32-bit float values in the low-order bits of the
+///    operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the product of the lower
+///    32 bits of both operands. The upper 96 bits are copied from the upper 96
+///    bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ss(__m128 __a, __m128 __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
+///    results of the multiplication.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the products of both
+///    operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4sf)__a * (__v4sf)__b);
+}
+
+/// \brief Divides the value in the low-order 32 bits of the first operand by
+///    the corresponding value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
+///    bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
+///    of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the quotients of the
+///    lower 32 bits of both operands. The upper 96 bits are copied from the
+///    upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ss(__m128 __a, __m128 __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+/// \brief Divides two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the divisor.
+/// \returns A 128-bit vector of [4 x float] containing the quotients of both
+///    operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4sf)__a / (__v4sf)__b);
+}
+
+/// \brief Calculates the square root of the value stored in the low-order bits
+///    of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the square root of the
+///    value in the low-order bits of the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+/// \brief Calculates the square roots of the values stored in a 128-bit vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the square roots of the
+///    values in the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_sqrtps((__v4sf)__a);
+}
+
+/// \brief Calculates the approximate reciprocal of the value stored in the
+///    low-order bits of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocal of the value in the low-order bits of the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+/// \brief Calculates the approximate reciprocals of the values stored in a
+///    128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocals of the values in the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ps(__m128 __a)
+{
+  return __builtin_ia32_rcpps((__v4sf)__a);
+}
+
+/// \brief Calculates the approximate reciprocal of the square root of the value
+///    stored in the low-order bits of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocal of the square root of the value in the low-order bits of the
+///    operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+/// \brief Calculates the approximate reciprocals of the square roots of the
+///    values stored in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocals of the square roots of the values in the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_rsqrtps((__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands and returns the lesser value in the low-order bits of the
+///    vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    minimum value between both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 128-bit vectors of [4 x float] and returns the lesser
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \returns A 128-bit vector of [4 x float] containing the minimum values
+///    between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands and returns the greater value in the low-order bits of a 128-bit
+///    vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    maximum value between both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \returns A 128-bit vector of [4 x float] containing the maximum values
+///    between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector containing one of the source operands.
+/// \param __b
+///    A 128-bit vector containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
+///    values between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_and_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4su)__a & (__v4su)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
+///    the one's complement of the values contained in the first source
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the first source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the second source operand.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
+///    one's complement of the first operand and the values in the second
+///    operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_andnot_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)(~(__v4su)__a & (__v4su)__b);
+}
+
+/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
+///    values between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_or_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4su)__a | (__v4su)__b);
+}
+
+/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
+///    of the values between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_xor_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4su)__a ^ (__v4su)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for equality and returns the result of the comparison in the
+///    low-order bits of a vector [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is less than the
+///    corresponding value in the second operand and returns the result of the
+///    comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is less than or
+///    equal to the corresponding value in the second operand and returns the
+///    result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is greater than
+///    the corresponding value in the second operand and returns the result of
+///    the comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
+                                         4, 1, 2, 3);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is greater than
+///    or equal to the corresponding value in the second operand and returns
+///    the result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
+                                         4, 1, 2, 3);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for inequality and returns the result of the comparison in the
+///    low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] for inequality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not less than
+///    the corresponding value in the second operand and returns the result of
+///    the comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not less than
+///    or equal to the corresponding value in the second operand and returns
+///    the result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not greater
+///    than the corresponding value in the second operand and returns the
+///    result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
+                                         4, 1, 2, 3);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not greater
+///    than or equal to the corresponding value in the second operand and
+///    returns the result of the comparison in the low-order bits of a vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
+                                         4, 1, 2, 3);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is ordered with
+///    respect to the corresponding value in the second operand and returns the
+///    result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are ordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is unordered
+///    with respect to the corresponding value in the second operand and
+///    returns the result of the comparison in the low-order bits of a vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are unordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for equality and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is less than the second
+///    operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is less than or equal to the
+///    second operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is greater than the second
+///    operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is greater than or equal to
+///    the second operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is not equal to the second
+///    operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine equality and returns
+///    the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    less than the second operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    less than or equal to the second operand and returns the result of the
+///    comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    greater than the second operand and returns the result of the
+///    comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    greater than or equal to the second operand and returns the result of
+///    the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine inequality and returns
+///    the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtss_si32(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si((__v4sf)__a);
+}
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvt_ss2si(__m128 __a)
+{
+  return _mm_cvtss_si32(__a);
+}
+
+#ifdef __x86_64__
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 64-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtss_si64(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si64((__v4sf)__a);
+}
+
+#endif
+
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
+}
+
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvt_ps2pi(__m128 __a)
+{
+  return _mm_cvtps_pi32(__a);
+}
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer, truncating the result when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttss_si32(__m128 __a)
+{
+  return __builtin_ia32_cvttss2si((__v4sf)__a);
+}
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer, truncating the result when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtt_ss2si(__m128 __a)
+{
+  return _mm_cvttss_si32(__a);
+}
+
+#ifdef __x86_64__
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 64-bit integer, truncating the result when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttss_si64(__m128 __a)
+{
+  return __builtin_ia32_cvttss2si64((__v4sf)__a);
+}
+#endif
+
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
+///    when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvttps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
+}
+
+/// \brief Converts two low-order float values in a 128-bit vector of [4 x
+///    float] into a 64-bit vector of [2 x i32], truncating the result when it
+///    is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtt_ps2pi(__m128 __a)
+{
+  return _mm_cvttps_pi32(__a);
+}
+
+/// \brief Converts a 32-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination vector are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 32-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied from
+///    the upper 96 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_ss(__m128 __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+/// \brief Converts a 32-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 32-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied from
+///    the upper 96 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvt_si2ss(__m128 __a, int __b)
+{
+  return _mm_cvtsi32_ss(__a, __b);
+}
+
+#ifdef __x86_64__
+
+/// \brief Converts a 64-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied from
+///    the upper 96 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_ss(__m128 __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+#endif
+
+/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
+///    floating point values and writes them to the lower 64-bits of the
+///    destination. The remaining higher order elements of the destination are
+///    copied from the corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
+///    and written to the corresponding low-order elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted value of the second operand. The upper 64 bits are copied from
+///    the upper 64 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi32_ps(__m128 __a, __m64 __b)
+{
+  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
+}
+
+/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
+///    floating point values and writes them to the lower 64-bits of the
+///    destination. The remaining higher order elements of the destination are
+///    copied from the corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
+///    and written to the corresponding low-order elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted value from the second operand. The upper 64 bits are copied
+///    from the upper 64 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvt_pi2ps(__m128 __a, __m64 __b)
+{
+  return _mm_cvtpi32_ps(__a, __b);
+}
+
+/// \brief Extracts a float value contained in the lower 32 bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the extraction.
+/// \returns A 32-bit float containing the extracted value.
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm_cvtss_f32(__m128 __a)
+{
+  return __a[0];
+}
+
+/// \brief Loads two packed float values from the address \a __p into the
+///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
+///     are copied from the low-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
+///    of the destination.
+/// \param __p
+///    A pointer to two packed float values. Bits [63:0] are written to bits
+///    [127:64] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadh_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadh_pi_struct {
+    __mm_loadh_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
+}
+
+/// \brief Loads two packed float values from the address \a __p into the
+///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
+///    are copied from the high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
+///    [127:64] of the destination.
+/// \param __p
+///    A pointer to two packed float values. Bits [63:0] are written to bits
+///    [63:0] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadl_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadl_pi_struct {
+    __mm_loadl_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    32 bits of the vector are initialized with the single-precision
+///    floating-point value loaded from a specified memory location. The upper
+///    96 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location containing a single-precision
+///    floating-point value.
+/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
+///    lower 32 bits contain the value loaded from the memory location. The
+///    upper 96 bits are set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ss(const float *__p)
+{
+  struct __mm_load_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
+  return (__m128){ __u, 0, 0, 0 };
+}
+
+/// \brief Loads a 32-bit float value and duplicates it to all four vector
+///    elements of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a float value to be loaded and duplicated.
+/// \returns A 128-bit vector of [4 x float] containing the loaded and
+///    duplicated values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load1_ps(const float *__p)
+{
+  struct __mm_load1_ps_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
+  return (__m128){ __u, __u, __u, __u };
+}
+
+#define        _mm_load_ps1(p) _mm_load1_ps(p)
+
+/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \returns A 128-bit vector of [4 x float] containing the loaded valus.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ps(const float *__p)
+{
+  return *(__m128*)__p;
+}
+
+/// \brief Loads a 128-bit floating-point vector of [4 x float] from an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadu_ps(const float *__p)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+/// \brief Loads four packed float values, in reverse order, from an aligned
+///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
+///    in reverse order.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadr_ps(const float *__p)
+{
+  __m128 __a = _mm_load_ps(__p);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
+}
+
+/// \brief Create a 128-bit vector of [4 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit vector of [4 x float] containing undefined values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_undefined_ps(void)
+{
+  return (__m128)__builtin_ia32_undef128();
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    32 bits of the vector are initialized with the specified single-precision
+///    floating-point value. The upper 96 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize the lower 32
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
+///    lower 32 bits contain the value provided in the source operand. The
+///    upper 96 bits are set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ss(float __w)
+{
+  return (__m128){ __w, 0, 0, 0 };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
+///    of the four single-precision floating-point vector elements set to the
+///    specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set1_ps(float __w)
+{
+  return (__m128){ __w, __w, __w, __w };
+}
+
+/* Microsoft specific. */
+/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
+///    of the four single-precision floating-point vector elements set to the
+///    specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps1(float __w)
+{
+    return _mm_set1_ps(__w);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]
+///    initialized with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __z
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \param __y
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __x
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __w
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __w, __x, __y, __z };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float],
+///    initialized in reverse order with the specified 32-bit single-precision
+///    float-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __z
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \param __y
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __x
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __w
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __z, __y, __x, __w };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [4 x float] with
+///    all elements set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0, 0, 0, 0 };
+}
+
+/// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
+}
+
+/// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
+///     memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
+}
+
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
+///     memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ss(float *__p, __m128 __a)
+{
+  struct __mm_store_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
+}
+
+/// \brief Stores a 128-bit vector of [4 x float] to an unaligned memory
+///    location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_ps(float *__p, __m128 __a)
+{
+  struct __storeu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_ps*)__p)->__v = __a;
+}
+
+/// \brief Stores a 128-bit vector of [4 x float] into an aligned memory
+///    location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 16-byte aligned.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps(float *__p, __m128 __a)
+{
+  *(__m128*)__p = __a;
+}
+
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+///    four contiguous elements in an aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+///    of the four contiguous elements pointed by \a __p.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
+  _mm_store_ps(__p, __a);
+}
+
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+///    four contiguous elements in an aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+///    of the four contiguous elements pointed by \a __p.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps1(float *__p, __m128 __a)
+{
+  return _mm_store1_ps(__p, __a);
+}
+
+/// \brief Stores float values from a 128-bit vector of [4 x float] to an
+///    aligned memory location in reverse order.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
+  _mm_store_ps(__p, __a);
+}
+
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+#ifndef _MSC_VER
+/* FIXME: We have to #define this because "sel" must be a constant integer, and
+   Sema doesn't do any form of constant propagation yet. */
+
+/// \brief Loads one cache line of data from the specified address to a location
+///    closer to the processor.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _mm_prefetch(const void * a, const int sel);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
+///
+/// \param a
+///    A pointer to a memory location containing a cache line of data.
+/// \param sel
+///    A predefined integer constant specifying the type of prefetch
+///    operation: \n
+///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
+///    PREFETCHNTA instruction will be generated. \n
+///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
+///    be generated. \n
+///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
+///    be generated. \n
+///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
+///    be generated.                                   
+#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
+#endif
+
+/// \brief Stores a 64-bit integer in the specified aligned memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location used to store the register value.
+/// \param __a
+///    A 64-bit integer containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_pi(__m64 *__p, __m64 __a)
+{
+  __builtin_ia32_movntq(__p, __a);
+}
+
+/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
+///    128-bit aligned memory location. To minimize caching, the data is flagged
+///    as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit aligned memory location that will receive the
+///    integer values.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ps(float *__p, __m128 __a)
+{
+  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
+}
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// \brief Forces strong memory ordering (serialization) between store
+///    instructions preceding this instruction and store instructions following
+///    this instruction, ensuring the system completes all previous stores
+///    before executing subsequent stores.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
+///
+void _mm_sfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
+///    returns it, as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _mm_extract_pi(__m64 a, int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param a
+///    A 64-bit vector of [4 x i16].
+/// \param n
+///    An immediate integer operand that determines which bits are extracted: \n
+///    0: Bits [15:0] are copied to the destination. \n
+///    1: Bits [31:16] are copied to the destination. \n
+///    2: Bits [47:32] are copied to the destination. \n
+///    3: Bits [63:48] are copied to the destination.
+/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
+#define _mm_extract_pi16(a, n) __extension__ ({ \
+  (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n); })
+
+/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
+///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
+///    specified by the immediate operand \a n.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _mm_insert_pi(__m64 a, int d, int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param a
+///    A 64-bit vector of [4 x i16].
+/// \param d
+///    An integer. The lower 16-bit value from this operand is written to the
+///    destination at the offset specified by operand \a n.
+/// \param n
+///    An immediate integer operant that determines which the bits to be used
+///    in the destination. \n
+///    0: Bits [15:0] are copied to the destination. \n
+///    1: Bits [31:16] are copied to the destination. \n
+///    2: Bits [47:32] are copied to the destination. \n
+///    3: Bits [63:48] are copied to the destination.  \n
+///    The remaining bits in the destination are copied from the corresponding
+///    bits in operand \a a.
+/// \returns A 64-bit integer vector containing the copied packed data from the
+///    operands.
+#define _mm_insert_pi16(a, d, n) __extension__ ({ \
+  (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n); })
+
+/// \brief Compares each of the corresponding packed 16-bit integer values of
+///    the 64-bit integer vectors, and writes the greater value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_max_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 64-bit integer vectors, and writes the greater value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_max_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief Compares each of the corresponding packed 16-bit integer values of
+///    the 64-bit integer vectors, and writes the lesser value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_min_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 64-bit integer vectors, and writes the lesser value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_min_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief Takes the most significant bit from each 8-bit element in a 64-bit
+///    integer vector to create a 16-bit mask value. Zero-extends the value to
+///    32-bit integer and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bit from each 8-bit element in the operand,
+///    written to bits [15:0].
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_pi8(__m64 __a)
+{
+  return __builtin_ia32_pmovmskb((__v8qi)__a);
+}
+
+/// \brief Multiplies packed 16-bit unsigned integer values and writes the
+///    high-order 16 bits of each 32-bit product to the corresponding bits in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the products of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhi_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+///    destination, as specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
+///
+/// \param a
+///    A 64-bit integer vector containing the values to be shuffled.
+/// \param n
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from \a a. The destinations within the 64-bit destination are
+///    assigned values as follows: \n
+///    Bits [1:0] are used to assign values to bits [15:0] in the
+///    destination. \n
+///    Bits [3:2] are used to assign values to bits [31:16] in the
+///    destination. \n
+///    Bits [5:4] are used to assign values to bits [47:32] in the
+///    destination. \n
+///    Bits [7:6] are used to assign values to bits [63:48] in the
+///    destination. \n
+///    Bit value assignments: \n
+///    00: assigned from bits [15:0] of \a a. \n
+///    01: assigned from bits [31:16] of \a a. \n
+///    10: assigned from bits [47:32] of \a a. \n
+///    11: assigned from bits [63:48] of \a a.
+/// \returns A 64-bit integer vector containing the shuffled values.
+#define _mm_shuffle_pi16(a, n) __extension__ ({ \
+  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
+
+/// \brief Conditionally copies the values from each 8-bit element in the first
+///    64-bit integer vector operand to the specified memory location, as
+///    specified by the most significant bit in the corresponding element in the
+///    second 64-bit integer vector operand. To minimize caching, the data is
+///    flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
+///
+/// \param __d
+///    A 64-bit integer vector containing the values with elements to be copied.
+/// \param __n
+///    A 64-bit integer vector operand. The most significant bit from each 8-bit
+///    element determines whether the corresponding element in operand \a __d
+///    is copied. If the most significant bit of a given element is 1, the
+///    corresponding element in operand \a __d is copied.
+/// \param __p
+///    A pointer to a 64-bit memory location that will receive the conditionally
+///    copied integer values. The address of the memory location does not have
+///    to be aligned.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
+{
+  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+}
+
+/// \brief Computes the rounded averages of the packed unsigned 8-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_avg_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief Computes the rounded averages of the packed unsigned 16-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_avg_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
+///    64-bit vector operands and computes the absolute value for each of the
+///    difference. Then sum of the 8 absolute differences is written to the
+///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
+///    sets of absolute differences between both operands. The upper bits are
+///    cleared.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sad_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+}
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
+///    integer value.
+///
+///    There are several groups of macros associated with this
+///    intrinsic, including:
+///    <ul>
+///    <li>
+///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
+///      _MM_GET_EXCEPTION_STATE().
+///    </li>
+///    <li>
+///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
+///    </li>            
+///    <li>
+///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+///      _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
+///    </li>
+///    <li> 
+///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
+///    </li>
+///    <li> 
+///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+///      _MM_GET_DENORMALS_ZERO_MODE().
+///    </li>
+///    </ul>
+///
+///    For example, the expression below checks if an overflow exception has
+///    occurred:
+///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
+///
+///    The following example gets the current rounding mode:
+///      _MM_GET_ROUNDING_MODE()
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
+///
+/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
+///    register.
+unsigned int _mm_getcsr(void);
+
+/// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
+///   
+///    There are several groups of macros associated with this intrinsic,
+///    including:
+///    <ul>
+///    <li> 
+///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
+///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
+///    </li>
+///    <li>
+///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
+///      of these macros.
+///    </li>
+///    <li>
+///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
+///    </li>
+///    <li>
+///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
+///      one of these macros.
+///    </li>
+///    <li>
+///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
+///    </li>
+///    </ul>
+///
+///    For example, the following expression causes subsequent floating-point
+///    operations to round up:
+///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
+///
+///    The following example sets the DAZ and FTZ flags:
+///      void setFlags() {
+///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
+///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)
+///      }
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
+///
+/// \param __i
+///    A 32-bit unsigned integer value to be written to the MXCSR register.
+void _mm_setcsr(unsigned int);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
+///    specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param mask
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from \ a and \a b. \n
+///    Bits [3:0] specify the values copied from operand \a a. \n
+///    Bits [7:4] specify the values copied from operand \a b. \n
+///    The destinations within the 128-bit destination are assigned values as
+///    follows: \n
+///    Bits [1:0] are used to assign values to bits [31:0] in the
+///    destination. \n
+///    Bits [3:2] are used to assign values to bits [63:32] in the
+///    destination. \n
+///    Bits [5:4] are used to assign values to bits [95:64] in the
+///    destination. \n
+///    Bits [7:6] are used to assign values to bits [127:96] in the
+///    destination. \n
+///    Bit value assignments: \n
+///    00: Bits [31:0] copied from the specified operand. \n
+///    01: Bits [63:32] copied from the specified operand. \n
+///    10: Bits [95:64] copied from the specified operand. \n
+///    11: Bits [127:96] copied from the specified operand.
+/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
+#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
+                                  0 + (((mask) >> 0) & 0x3), \
+                                  0 + (((mask) >> 2) & 0x3), \
+                                  4 + (((mask) >> 4) & 0x3), \
+                                  4 + (((mask) >> 6) & 0x3)); })
+
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [95:64] are written to bits [31:0] of the destination. \n
+///    Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float].
+///    Bits [95:64] are written to bits [63:32] of the destination. \n
+///    Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpackhi_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
+}
+
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [31:0] are written to bits [31:0] of the destination.  \n
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [31:0] are written to bits [63:32] of the destination. \n
+///    Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpacklo_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    32 bits are set to the lower 32 bits of the second parameter. The upper
+///    96 bits are set to the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
+///    written to the upper 96 bits of the result.
+/// \param __b
+///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
+///    written to the lower 32 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_move_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    64 bits are set to the upper 64 bits of the second parameter. The upper
+///    64 bits are set to the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
+///    written to the upper 64 bits of the result.
+/// \param __b
+///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
+///    written to the lower 64 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehl_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    64 bits are set to the lower 64 bits of the first parameter. The upper
+///    64 bits are set to the lower 64 bits of the second parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
+///    written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
+///    written to the upper 64 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movelh_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
+}
+
+/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
+///    from the corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi16(__b, __a);
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
+///    128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
+///    destination are copied from the corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpu16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
+///    into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
+///    from the corresponding lower 4 elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi8_ps(__m64 __a)
+{
+  __m64 __b;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi8(__b, __a);
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
+///    vector of [8 x u8] into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
+///    destination are copied from the corresponding lower 4 elements in this
+///    operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpu8_ps(__m64 __a)
+{
+  __m64 __b;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+/// \brief Converts the two 32-bit signed integer values from each 64-bit vector
+///    operand of [2 x i32] into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
+///    copied from the elements in this operand.
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
+///    copied from the elements in this operand.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    copied and converted values from the first operand. The upper 64 bits
+///    contain the copied and converted values from the second operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
+{
+  __m128 __c;
+
+  __c = _mm_setzero_ps();
+  __c = _mm_cvtpi32_ps(__c, __b);
+  __c = _mm_movelh_ps(__c, __c);
+
+  return _mm_cvtpi32_ps(__c, __a);
+}
+
+/// \brief Converts each single-precision floating-point element of a 128-bit
+///    floating-point vector of [4 x float] into a 16-bit signed integer, and
+///    packs the results into a 64-bit integer vector of [4 x i16]. If the
+///    floating-point element is NaN or infinity, or if the floating-point
+///    element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
+///    to 0x8000. Otherwise if the floating-point element is greater than
+///    0x7FFF, it is converted to 0x7FFF.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 64-bit integer vector of [4 x i16] containing the converted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi16(__m128 __a)
+{
+  __m64 __b, __c;
+
+  __b = _mm_cvtps_pi32(__a);
+  __a = _mm_movehl_ps(__a, __a);
+  __c = _mm_cvtps_pi32(__a);
+
+  return _mm_packs_pi32(__b, __c);
+}
+
+/// \brief Converts each single-precision floating-point element of a 128-bit
+///    floating-point vector of [4 x float] into an 8-bit signed integer, and
+///    packs the results into the lower 32 bits of a 64-bit integer vector of
+///    [8 x i8]. The upper 32 bits of the vector are set to 0. If the
+///    floating-point element is NaN or infinity, or if the floating-point
+///    element is greater than 0x7FFFFFFF or less than -0x80, it is converted
+///    to 0x80. Otherwise if the floating-point element is greater than 0x7F,
+///    it is converted to 0x7F.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    128-bit floating-point vector of [4 x float].
+/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
+///    converted values and the uppper 32 bits are set to zero.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi8(__m128 __a)
+{
+  __m64 __b, __c;
+
+  __b = _mm_cvtps_pi16(__a);
+  __c = _mm_setzero_si64();
+
+  return _mm_packs_pi16(__b, __c);
+}
+
+/// \brief Extracts the sign bits from each single-precision floating-point
+///    element of a 128-bit floating-point vector of [4 x float] and returns the
+///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
+///    single-precision floating-point element of the parameter. Bits [31:4] are
+///    set to zero.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_ps(__m128 __a)
+{
+  return __builtin_ia32_movmskps((__v4sf)__a);
+}
+
+
+#define _MM_ALIGN16 __attribute__((aligned(16)))
+
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define _MM_EXCEPT_INVALID    (0x0001)
+#define _MM_EXCEPT_DENORM     (0x0002)
+#define _MM_EXCEPT_DIV_ZERO   (0x0004)
+#define _MM_EXCEPT_OVERFLOW   (0x0008)
+#define _MM_EXCEPT_UNDERFLOW  (0x0010)
+#define _MM_EXCEPT_INEXACT    (0x0020)
+#define _MM_EXCEPT_MASK       (0x003f)
+
+#define _MM_MASK_INVALID      (0x0080)
+#define _MM_MASK_DENORM       (0x0100)
+#define _MM_MASK_DIV_ZERO     (0x0200)
+#define _MM_MASK_OVERFLOW     (0x0400)
+#define _MM_MASK_UNDERFLOW    (0x0800)
+#define _MM_MASK_INEXACT      (0x1000)
+#define _MM_MASK_MASK         (0x1f80)
+
+#define _MM_ROUND_NEAREST     (0x0000)
+#define _MM_ROUND_DOWN        (0x2000)
+#define _MM_ROUND_UP          (0x4000)
+#define _MM_ROUND_TOWARD_ZERO (0x6000)
+#define _MM_ROUND_MASK        (0x6000)
+
+#define _MM_FLUSH_ZERO_MASK   (0x8000)
+#define _MM_FLUSH_ZERO_ON     (0x8000)
+#define _MM_FLUSH_ZERO_OFF    (0x0000)
+
+#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
+#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
+#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
+#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
+
+#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
+#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
+#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
+#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
+
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __m128 tmp3, tmp2, tmp1, tmp0; \
+  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
+  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
+  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
+  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
+  (row0) = _mm_movelh_ps(tmp0, tmp2); \
+  (row1) = _mm_movehl_ps(tmp2, tmp0); \
+  (row2) = _mm_movelh_ps(tmp1, tmp3); \
+  (row3) = _mm_movehl_ps(tmp3, tmp1); \
+} while (0)
+
+/* Aliases for compatibility. */
+#define _m_pextrw _mm_extract_pi16
+#define _m_pinsrw _mm_insert_pi16
+#define _m_pmaxsw _mm_max_pi16
+#define _m_pmaxub _mm_max_pu8
+#define _m_pminsw _mm_min_pi16
+#define _m_pminub _mm_min_pu8
+#define _m_pmovmskb _mm_movemask_pi8
+#define _m_pmulhuw _mm_mulhi_pu16
+#define _m_pshufw _mm_shuffle_pi16
+#define _m_maskmovq _mm_maskmove_si64
+#define _m_pavgb _mm_avg_pu8
+#define _m_pavgw _mm_avg_pu16
+#define _m_psadbw _mm_sad_pu8
+#define _m_ _mm_
+#define _m_ _mm_
+
+#undef __DEFAULT_FN_ATTRS
+
+/* Ugly hack for backwards-compatibility (compatible with gcc) */
+#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
+#include <emmintrin.h>
+#endif
+
+#endif /* __XMMINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xopintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xopintrin.h
new file mode 100644
index 000000000..bdf0cec32
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xopintrin.h
@@ -0,0 +1,782 @@
+/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __XOPINTRIN_H
+#define __XOPINTRIN_H
+
+#include <fma4intrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_roti_epi8(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi16(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi32(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi64(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_com_epu8(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
+                                  (__v16qi)(__m128i)(B), (N)); })
+
+#define _mm_com_epu16(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
+                                  (__v8hi)(__m128i)(B), (N)); })
+
+#define _mm_com_epu32(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
+                                  (__v4si)(__m128i)(B), (N)); })
+
+#define _mm_com_epu64(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
+                                  (__v2di)(__m128i)(B), (N)); })
+
+#define _mm_com_epi8(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
+                                 (__v16qi)(__m128i)(B), (N)); })
+
+#define _mm_com_epi16(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
+                                 (__v8hi)(__m128i)(B), (N)); })
+
+#define _mm_com_epi32(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
+                                 (__v4si)(__m128i)(B), (N)); })
+
+#define _mm_com_epi64(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
+                                 (__v2di)(__m128i)(B), (N)); })
+
+#define _MM_PCOMCTRL_LT    0
+#define _MM_PCOMCTRL_LE    1
+#define _MM_PCOMCTRL_GT    2
+#define _MM_PCOMCTRL_GE    3
+#define _MM_PCOMCTRL_EQ    4
+#define _MM_PCOMCTRL_NEQ   5
+#define _MM_PCOMCTRL_FALSE 6
+#define _MM_PCOMCTRL_TRUE  7
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
+  (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
+                                     (__v2df)(__m128d)(Y), \
+                                     (__v2di)(__m128i)(C), (I)); })
+
+#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
+  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
+                                        (__v4df)(__m256d)(Y), \
+                                        (__v4di)(__m256i)(C), (I)); })
+
+#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
+  (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
+                                    (__v4si)(__m128i)(C), (I)); })
+
+#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
+  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
+                                       (__v8sf)(__m256)(Y), \
+                                       (__v8si)(__m256i)(C), (I)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ss(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_sd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ps(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_pd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_frcz_ps(__m256 __A)
+{
+  return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_frcz_pd(__m256d __A)
+{
+  return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __XOPINTRIN_H */
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsavecintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsavecintrin.h
new file mode 100644
index 000000000..598470a68
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsavecintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVECINTRIN_H
+#define __XSAVECINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsavec")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsaveintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsaveintrin.h
new file mode 100644
index 000000000..a2e6b2e74
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsaveintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEINTRIN_H
+#define __XSAVEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsave")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsave(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xrstor(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsave64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xrstor64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsaveoptintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsaveoptintrin.h
new file mode 100644
index 000000000..d3faae78b
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsaveoptintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEOPTINTRIN_H
+#define __XSAVEOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaveopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsaveopt(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsaveopt64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsavesintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsavesintrin.h
new file mode 100644
index 000000000..c5e540a86
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xsavesintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVESINTRIN_H
+#define __XSAVESINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaves")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xtestintrin.h b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xtestintrin.h
new file mode 100644
index 000000000..9d3378fd1
--- /dev/null
+++ b/externals/clang/4.0.0/linux32/lib/clang/4.0.0/include/xtestintrin.h
@@ -0,0 +1,41 @@
+/*===---- xtestintrin.h - XTEST intrinsic ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XTESTINTRIN_H
+#define __XTESTINTRIN_H
+
+/* xtest returns non-zero if the instruction is executed within an RTM or active
+ * HLE region. */
+/* FIXME: This can be an either or for RTM/HLE. Deal with this when HLE is
+ * supported. */
+static __inline__ int
+    __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+    _xtest(void) {
+  return __builtin_ia32_xtest();
+}
+
+#endif
diff --git a/externals/clang/4.0.0/osx/bin/clang b/externals/clang/4.0.0/osx/bin/clang
new file mode 100755
index 000000000..055dabb3a
Binary files /dev/null and b/externals/clang/4.0.0/osx/bin/clang differ
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_builtin_vars.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_builtin_vars.h
new file mode 100644
index 000000000..6f5eb9c78
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_builtin_vars.h
@@ -0,0 +1,126 @@
+/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CUDA_BUILTIN_VARS_H
+#define __CUDA_BUILTIN_VARS_H
+
+// Forward declares from vector_types.h.
+struct uint3;
+struct dim3;
+
+// The file implements built-in CUDA variables using __declspec(property).
+// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
+// All read accesses of built-in variable fields get converted into calls to a
+// getter function which in turn calls the appropriate builtin to fetch the
+// value.
+//
+// Example:
+//    int x = threadIdx.x;
+// IR output:
+//  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
+// PTX output:
+//  mov.u32     %r2, %tid.x;
+
+#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC)                                \
+  __declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD;      \
+  static inline __attribute__((always_inline))                                 \
+      __attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) {     \
+    return INTRINSIC;                                                          \
+  }
+
+#if __cplusplus >= 201103L
+#define __DELETE =delete
+#else
+#define __DELETE
+#endif
+
+// Make sure nobody can create instances of the special varible types.  nvcc
+// also disallows taking address of special variables, so we disable address-of
+// operator as well.
+#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName)                            \
+  __attribute__((device)) TypeName() __DELETE;                                 \
+  __attribute__((device)) TypeName(const TypeName &) __DELETE;                 \
+  __attribute__((device)) void operator=(const TypeName &) const __DELETE;     \
+  __attribute__((device)) TypeName *operator&() const __DELETE
+
+struct __cuda_builtin_threadIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
+  // threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
+};
+
+struct __cuda_builtin_blockIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
+  // blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
+};
+
+struct __cuda_builtin_blockDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
+  // blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
+};
+
+struct __cuda_builtin_gridDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
+  // gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
+};
+
+#define __CUDA_BUILTIN_VAR                                                     \
+  extern const __attribute__((device)) __attribute__((weak))
+__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
+__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
+
+// warpSize should translate to read of %WARP_SZ but there's currently no
+// builtin to do so. According to PTX v4.2 docs 'to date, all target
+// architectures have a WARP_SZ value of 32'.
+__attribute__((device)) const int warpSize = 32;
+
+#undef __CUDA_DEVICE_BUILTIN
+#undef __CUDA_BUILTIN_VAR
+#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
+
+#endif /* __CUDA_BUILTIN_VARS_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_cmath.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_cmath.h
new file mode 100644
index 000000000..9bef82611
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_cmath.h
@@ -0,0 +1,487 @@
+/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_CMATH_H__
+#define __CLANG_CUDA_CMATH_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+#include <limits>
+
+// CUDA lets us use various std math functions on the device side.  This file
+// works in concert with __clang_cuda_math_forward_declares.h to make this work.
+//
+// Specifically, the forward-declares header declares __device__ overloads for
+// these functions in the global namespace, then pulls them into namespace std
+// with 'using' statements.  Then this file implements those functions, after
+// their implementations have been pulled in.
+//
+// It's important that we declare the functions in the global namespace and pull
+// them into namespace std with using statements, as opposed to simply declaring
+// these functions in namespace std, because our device functions need to
+// overload the standard library functions, which may be declared in the global
+// namespace or in std, depending on the degree of conformance of the stdlib
+// implementation.  Declaring in the global namespace and pulling into namespace
+// std covers all of the known knowns.
+
+#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
+
+__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
+__DEVICE__ long abs(long __n) { return ::labs(__n); }
+__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
+__DEVICE__ double abs(double __x) { return ::fabs(__x); }
+__DEVICE__ float acos(float __x) { return ::acosf(__x); }
+__DEVICE__ float asin(float __x) { return ::asinf(__x); }
+__DEVICE__ float atan(float __x) { return ::atanf(__x); }
+__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
+__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
+__DEVICE__ float cos(float __x) { return ::cosf(__x); }
+__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
+__DEVICE__ float exp(float __x) { return ::expf(__x); }
+__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
+__DEVICE__ float floor(float __x) { return ::floorf(__x); }
+__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
+__DEVICE__ int fpclassify(float __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ int fpclassify(double __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ float frexp(float __arg, int *__exp) {
+  return ::frexpf(__arg, __exp);
+}
+
+// For inscrutable reasons, the CUDA headers define these functions for us on
+// Windows.
+#ifndef _MSC_VER
+__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
+// For inscrutable reasons, __finite(), the double-precision version of
+// __finitef, does not exist when compiling for MacOS.  __isfinited is available
+// everywhere and is just as good.
+__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
+__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+#endif
+
+__DEVICE__ bool isgreater(float __x, float __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreater(double __x, double __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(float __x, float __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(double __x, double __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isless(float __x, float __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool isless(double __x, double __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool islessequal(float __x, float __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessequal(double __x, double __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessgreater(float __x, float __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool islessgreater(double __x, double __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isunordered(float __x, float __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ bool isunordered(double __x, double __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ float ldexp(float __arg, int __exp) {
+  return ::ldexpf(__arg, __exp);
+}
+__DEVICE__ float log(float __x) { return ::logf(__x); }
+__DEVICE__ float log10(float __x) { return ::log10f(__x); }
+__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
+__DEVICE__ float nexttoward(float __from, double __to) {
+  return __builtin_nexttowardf(__from, __to);
+}
+__DEVICE__ double nexttoward(double __from, double __to) {
+  return __builtin_nexttoward(__from, __to);
+}
+__DEVICE__ float nexttowardf(float __from, double __to) {
+  return __builtin_nexttowardf(__from, __to);
+}
+__DEVICE__ float pow(float __base, float __exp) {
+  return ::powf(__base, __exp);
+}
+__DEVICE__ float pow(float __base, int __iexp) {
+  return ::powif(__base, __iexp);
+}
+__DEVICE__ double pow(double __base, int __iexp) {
+  return ::powi(__base, __iexp);
+}
+__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
+__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
+__DEVICE__ float sin(float __x) { return ::sinf(__x); }
+__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
+__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
+__DEVICE__ float tan(float __x) { return ::tanf(__x); }
+__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
+
+// Now we've defined everything we promised we'd define in
+// __clang_cuda_math_forward_declares.h.  We need to do two additional things to
+// fix up our math functions.
+//
+// 1) Define __device__ overloads for e.g. sin(int).  The CUDA headers define
+//    only sin(float) and sin(double), which means that e.g. sin(0) is
+//    ambiguous.
+//
+// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
+//    std.  These are defined in the CUDA headers in the global namespace,
+//    independent of everything else we've done here.
+
+// We can't use std::enable_if, because we want to be pre-C++11 compatible.  But
+// we go ahead and unconditionally define functions that are only available when
+// compiling for C++11 to match the behavior of the CUDA headers.
+template<bool __B, class __T = void>
+struct __clang_cuda_enable_if {};
+
+template <class __T> struct __clang_cuda_enable_if<true, __T> {
+  typedef __T type;
+};
+
+// Defines an overload of __fn that accepts one integral argument, calls
+// __fn((double)x), and returns __retty.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn)                      \
+  template <typename __T>                                                      \
+  __DEVICE__                                                                   \
+      typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,    \
+                                      __retty>::type                           \
+      __fn(__T __x) {                                                          \
+    return ::__fn((double)__x);                                                \
+  }
+
+// Defines an overload of __fn that accepts one two arithmetic arguments, calls
+// __fn((double)x, (double)y), and returns a double.
+//
+// Note this is different from OVERLOAD_1, which generates an overload that
+// accepts only *integral* arguments.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn)                      \
+  template <typename __T1, typename __T2>                                      \
+  __DEVICE__ typename __clang_cuda_enable_if<                                  \
+      std::numeric_limits<__T1>::is_specialized &&                             \
+          std::numeric_limits<__T2>::is_specialized,                           \
+      __retty>::type                                                           \
+  __fn(__T1 __x, __T2 __y) {                                                   \
+    return __fn((double)__x, (double)__y);                                     \
+  }
+
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
+
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
+
+// Overloads for functions that don't match the patterns expected by
+// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
+template <typename __T1, typename __T2, typename __T3>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized &&
+        std::numeric_limits<__T3>::is_specialized,
+    double>::type
+fma(__T1 __x, __T2 __y, __T3 __z) {
+  return std::fma((double)__x, (double)__y, (double)__z);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+frexp(__T __x, int *__exp) {
+  return std::frexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+ldexp(__T __x, int __exp) {
+  return std::ldexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+nexttoward(__T __from, double __to) {
+  return std::nexttoward((double)__from, __to);
+}
+
+template <typename __T1, typename __T2>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized,
+    double>::type
+remquo(__T1 __x, __T2 __y, int *__quo) {
+  return std::remquo((double)__x, (double)__y, __quo);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbln(__T __x, long __exp) {
+  return std::scalbln((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbn(__T __x, int __exp) {
+  return std::scalbn((double)__x, __exp);
+}
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+// Pull the new overloads we defined above into namespace std.
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::llround;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::nearbyint;
+using ::nextafter;
+using ::nexttoward;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+// Well this is fun: We need to pull these symbols in for libc++, but we can't
+// pull them in with libstdc++, because its ::isinf and ::isnan are different
+// than its std::isinf and std::isnan.
+#ifndef __GLIBCXX__
+using ::isinf;
+using ::isnan;
+#endif
+
+// Finally, pull the "foobarf" functions that CUDA defines in its headers into
+// namespace std.
+using ::acosf;
+using ::acoshf;
+using ::asinf;
+using ::asinhf;
+using ::atan2f;
+using ::atanf;
+using ::atanhf;
+using ::cbrtf;
+using ::ceilf;
+using ::copysignf;
+using ::cosf;
+using ::coshf;
+using ::erfcf;
+using ::erff;
+using ::exp2f;
+using ::expf;
+using ::expm1f;
+using ::fabsf;
+using ::fdimf;
+using ::floorf;
+using ::fmaf;
+using ::fmaxf;
+using ::fminf;
+using ::fmodf;
+using ::frexpf;
+using ::hypotf;
+using ::ilogbf;
+using ::ldexpf;
+using ::lgammaf;
+using ::llrintf;
+using ::llroundf;
+using ::log10f;
+using ::log1pf;
+using ::log2f;
+using ::logbf;
+using ::logf;
+using ::lrintf;
+using ::lroundf;
+using ::modff;
+using ::nearbyintf;
+using ::nextafterf;
+using ::nexttowardf;
+using ::nexttowardf;
+using ::powf;
+using ::remainderf;
+using ::remquof;
+using ::rintf;
+using ::roundf;
+using ::scalblnf;
+using ::scalbnf;
+using ::sinf;
+using ::sinhf;
+using ::sqrtf;
+using ::tanf;
+using ::tanhf;
+using ::tgammaf;
+using ::truncf;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#undef __DEVICE__
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_complex_builtins.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_complex_builtins.h
new file mode 100644
index 000000000..beef7deff
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_complex_builtins.h
@@ -0,0 +1,203 @@
+/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
+#define __CLANG_CUDA_COMPLEX_BUILTINS
+
+// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3.  These are
+// libgcc functions that clang assumes are available when compiling c99 complex
+// operations.  (These implementations come from libc++, and have been modified
+// to work with CUDA.)
+
+extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
+                                                      double __c, double __d) {
+  double __ac = __a * __c;
+  double __bd = __b * __d;
+  double __ad = __a * __d;
+  double __bc = __b * __c;
+  double _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
+      // a device overload (and isn't constexpr before C++11, naturally).
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  float __ac = __a * __c;
+  float __bd = __b * __d;
+  float __ad = __a * __d;
+  float __bc = __b * __c;
+  float _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
+                                                      double __c, double __d) {
+  int __ilogbw = 0;
+  // Can't use std::max, because that's defined in <algorithm>, and we don't
+  // want to pull that in for every compile.  The CUDA headers define
+  // ::max(float, float) and ::max(double, double), which is sufficient for us.
+  double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  double __denom = __c * __c + __d * __d;
+  double _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
+      __real__(z) = 0.0 * (__a * __c + __b * __d);
+      __imag__(z) = 0.0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  int __ilogbw = 0;
+  float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  float __denom = __c * __c + __d * __d;
+  float _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      __real__(z) = 0 * (__a * __c + __b * __d);
+      __imag__(z) = 0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+#endif // __CLANG_CUDA_COMPLEX_BUILTINS
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_intrinsics.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_intrinsics.h
new file mode 100644
index 000000000..b43ce21d0
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_intrinsics.h
@@ -0,0 +1,322 @@
+/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_INTRINSICS_H__
+#define __CLANG_CUDA_INTRINSICS_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// sm_30 intrinsics: __shfl_{up,down,xor}.
+
+#define __SM_30_INTRINSICS_H__
+#define __SM_30_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#pragma push_macro("__MAKE_SHUFFLES")
+#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask)    \
+  inline __device__ int __FnName(int __val, int __offset,                      \
+                                 int __width = warpSize) {                     \
+    return __IntIntrinsic(__val, __offset,                                     \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ float __FnName(float __val, int __offset,                  \
+                                   int __width = warpSize) {                   \
+    return __FloatIntrinsic(__val, __offset,                                   \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  }                                                                            \
+  inline __device__ unsigned int __FnName(unsigned int __val, int __offset,    \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(static_cast<int>(__val), __offset, __width));               \
+  }                                                                            \
+  inline __device__ long long __FnName(long long __val, int __offset,          \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__val, &__tmp, sizeof(__val));                                     \
+    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
+    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
+    long long __ret;                                                           \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
+    return __ret;                                                              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned long long __val, int __offset, int __width = warpSize) {        \
+    return static_cast<unsigned long long>(::__FnName(                         \
+        static_cast<unsigned long long>(__val), __offset, __width));           \
+  }                                                                            \
+  inline __device__ double __FnName(double __val, int __offset,                \
+                                    int __width = warpSize) {                  \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp = ::__FnName(__tmp, __offset, __width);                              \
+    double __ret;                                                              \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
+    return __ret;                                                              \
+  }
+
+__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
+__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
+__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
+
+#pragma pop_macro("__MAKE_SHUFFLES")
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
+
+// Prevent the vanilla sm_32 intrinsics header from being included.
+#define __SM_32_INTRINSICS_H__
+#define __SM_32_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
+inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
+inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
+inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
+inline __device__ long long __ldg(const long long *ptr) {
+  return __nvvm_ldg_ll(ptr);
+}
+inline __device__ unsigned char __ldg(const unsigned char *ptr) {
+  return __nvvm_ldg_uc(ptr);
+}
+inline __device__ unsigned short __ldg(const unsigned short *ptr) {
+  return __nvvm_ldg_us(ptr);
+}
+inline __device__ unsigned int __ldg(const unsigned int *ptr) {
+  return __nvvm_ldg_ui(ptr);
+}
+inline __device__ unsigned long __ldg(const unsigned long *ptr) {
+  return __nvvm_ldg_ul(ptr);
+}
+inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
+  return __nvvm_ldg_ull(ptr);
+}
+inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
+inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
+
+inline __device__ char2 __ldg(const char2 *ptr) {
+  typedef char c2 __attribute__((ext_vector_type(2)));
+  // We can assume that ptr is aligned at least to char2's alignment, but the
+  // load will assume that ptr is aligned to char2's alignment.  This is only
+  // safe if alignof(c2) <= alignof(char2).
+  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
+  char2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ char4 __ldg(const char4 *ptr) {
+  typedef char c4 __attribute__((ext_vector_type(4)));
+  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
+  char4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ short2 __ldg(const short2 *ptr) {
+  typedef short s2 __attribute__((ext_vector_type(2)));
+  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
+  short2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ short4 __ldg(const short4 *ptr) {
+  typedef short s4 __attribute__((ext_vector_type(4)));
+  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
+  short4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ int2 __ldg(const int2 *ptr) {
+  typedef int i2 __attribute__((ext_vector_type(2)));
+  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
+  int2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ int4 __ldg(const int4 *ptr) {
+  typedef int i4 __attribute__((ext_vector_type(4)));
+  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
+  int4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ longlong2 __ldg(const longlong2 *ptr) {
+  typedef long long ll2 __attribute__((ext_vector_type(2)));
+  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+  longlong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ uchar2 __ldg(const uchar2 *ptr) {
+  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
+  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
+  uchar2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uchar4 __ldg(const uchar4 *ptr) {
+  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
+  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
+  uchar4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ushort2 __ldg(const ushort2 *ptr) {
+  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
+  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
+  ushort2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ ushort4 __ldg(const ushort4 *ptr) {
+  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
+  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
+  ushort4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ uint2 __ldg(const uint2 *ptr) {
+  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
+  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
+  uint2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uint4 __ldg(const uint4 *ptr) {
+  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
+  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
+  uint4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
+  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
+  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+  ulonglong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ float2 __ldg(const float2 *ptr) {
+  typedef float f2 __attribute__((ext_vector_type(2)));
+  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
+  float2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ float4 __ldg(const float4 *ptr) {
+  typedef float f4 __attribute__((ext_vector_type(4)));
+  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
+  float4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ double2 __ldg(const double2 *ptr) {
+  typedef double d2 __attribute__((ext_vector_type(2)));
+  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+  double2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+// TODO: Implement these as intrinsics, so the backend can work its magic on
+// these.  Alternatively, we could implement these as plain C and try to get
+// llvm to recognize the relevant patterns.
+inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned ret;
+  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(ret)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return ret;
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+#endif // defined(__CLANG_CUDA_INTRINSICS_H__)
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_math_forward_declares.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_math_forward_declares.h
new file mode 100644
index 000000000..49c805151
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_math_forward_declares.h
@@ -0,0 +1,286 @@
+/*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// This file forward-declares of some math functions we (or the CUDA headers)
+// will define later.  We need to do this, and do it before cmath is included,
+// because the standard library may have constexpr math functions.  In the
+// absence of a prior __device__ decl, those constexpr functions may become
+// implicitly host+device.  host+device functions can't be overloaded, so that
+// would preclude the use of our own __device__ overloads for these functions.
+
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__                                                             \
+  static __inline__ __attribute__((always_inline)) __attribute__((device))
+
+__DEVICE__ double abs(double);
+__DEVICE__ float abs(float);
+__DEVICE__ int abs(int);
+__DEVICE__ long abs(long);
+__DEVICE__ long long abs(long long);
+__DEVICE__ double acos(double);
+__DEVICE__ float acos(float);
+__DEVICE__ double acosh(double);
+__DEVICE__ float acosh(float);
+__DEVICE__ double asin(double);
+__DEVICE__ float asin(float);
+__DEVICE__ double asinh(double);
+__DEVICE__ float asinh(float);
+__DEVICE__ double atan2(double, double);
+__DEVICE__ float atan2(float, float);
+__DEVICE__ double atan(double);
+__DEVICE__ float atan(float);
+__DEVICE__ double atanh(double);
+__DEVICE__ float atanh(float);
+__DEVICE__ double cbrt(double);
+__DEVICE__ float cbrt(float);
+__DEVICE__ double ceil(double);
+__DEVICE__ float ceil(float);
+__DEVICE__ double copysign(double, double);
+__DEVICE__ float copysign(float, float);
+__DEVICE__ double cos(double);
+__DEVICE__ float cos(float);
+__DEVICE__ double cosh(double);
+__DEVICE__ float cosh(float);
+__DEVICE__ double erfc(double);
+__DEVICE__ float erfc(float);
+__DEVICE__ double erf(double);
+__DEVICE__ float erf(float);
+__DEVICE__ double exp2(double);
+__DEVICE__ float exp2(float);
+__DEVICE__ double exp(double);
+__DEVICE__ float exp(float);
+__DEVICE__ double expm1(double);
+__DEVICE__ float expm1(float);
+__DEVICE__ double fabs(double);
+__DEVICE__ float fabs(float);
+__DEVICE__ double fdim(double, double);
+__DEVICE__ float fdim(float, float);
+__DEVICE__ double floor(double);
+__DEVICE__ float floor(float);
+__DEVICE__ double fma(double, double, double);
+__DEVICE__ float fma(float, float, float);
+__DEVICE__ double fmax(double, double);
+__DEVICE__ float fmax(float, float);
+__DEVICE__ double fmin(double, double);
+__DEVICE__ float fmin(float, float);
+__DEVICE__ double fmod(double, double);
+__DEVICE__ float fmod(float, float);
+__DEVICE__ int fpclassify(double);
+__DEVICE__ int fpclassify(float);
+__DEVICE__ double frexp(double, int *);
+__DEVICE__ float frexp(float, int *);
+__DEVICE__ double hypot(double, double);
+__DEVICE__ float hypot(float, float);
+__DEVICE__ int ilogb(double);
+__DEVICE__ int ilogb(float);
+__DEVICE__ bool isfinite(double);
+__DEVICE__ bool isfinite(float);
+__DEVICE__ bool isgreater(double, double);
+__DEVICE__ bool isgreaterequal(double, double);
+__DEVICE__ bool isgreaterequal(float, float);
+__DEVICE__ bool isgreater(float, float);
+__DEVICE__ bool isinf(double);
+__DEVICE__ bool isinf(float);
+__DEVICE__ bool isless(double, double);
+__DEVICE__ bool islessequal(double, double);
+__DEVICE__ bool islessequal(float, float);
+__DEVICE__ bool isless(float, float);
+__DEVICE__ bool islessgreater(double, double);
+__DEVICE__ bool islessgreater(float, float);
+__DEVICE__ bool isnan(double);
+__DEVICE__ bool isnan(float);
+__DEVICE__ bool isnormal(double);
+__DEVICE__ bool isnormal(float);
+__DEVICE__ bool isunordered(double, double);
+__DEVICE__ bool isunordered(float, float);
+__DEVICE__ long labs(long);
+__DEVICE__ double ldexp(double, int);
+__DEVICE__ float ldexp(float, int);
+__DEVICE__ double lgamma(double);
+__DEVICE__ float lgamma(float);
+__DEVICE__ long long llabs(long long);
+__DEVICE__ long long llrint(double);
+__DEVICE__ long long llrint(float);
+__DEVICE__ double log10(double);
+__DEVICE__ float log10(float);
+__DEVICE__ double log1p(double);
+__DEVICE__ float log1p(float);
+__DEVICE__ double log2(double);
+__DEVICE__ float log2(float);
+__DEVICE__ double logb(double);
+__DEVICE__ float logb(float);
+__DEVICE__ double log(double);
+__DEVICE__ float log(float);
+__DEVICE__ long lrint(double);
+__DEVICE__ long lrint(float);
+__DEVICE__ long lround(double);
+__DEVICE__ long lround(float);
+__DEVICE__ long long llround(float); // No llround(double).
+__DEVICE__ double modf(double, double *);
+__DEVICE__ float modf(float, float *);
+__DEVICE__ double nan(const char *);
+__DEVICE__ float nanf(const char *);
+__DEVICE__ double nearbyint(double);
+__DEVICE__ float nearbyint(float);
+__DEVICE__ double nextafter(double, double);
+__DEVICE__ float nextafter(float, float);
+__DEVICE__ double nexttoward(double, double);
+__DEVICE__ float nexttoward(float, double);
+__DEVICE__ float nexttowardf(float, double);
+__DEVICE__ double pow(double, double);
+__DEVICE__ double pow(double, int);
+__DEVICE__ float pow(float, float);
+__DEVICE__ float pow(float, int);
+__DEVICE__ double remainder(double, double);
+__DEVICE__ float remainder(float, float);
+__DEVICE__ double remquo(double, double, int *);
+__DEVICE__ float remquo(float, float, int *);
+__DEVICE__ double rint(double);
+__DEVICE__ float rint(float);
+__DEVICE__ double round(double);
+__DEVICE__ float round(float);
+__DEVICE__ double scalbln(double, long);
+__DEVICE__ float scalbln(float, long);
+__DEVICE__ double scalbn(double, int);
+__DEVICE__ float scalbn(float, int);
+__DEVICE__ bool signbit(double);
+__DEVICE__ bool signbit(float);
+__DEVICE__ double sin(double);
+__DEVICE__ float sin(float);
+__DEVICE__ double sinh(double);
+__DEVICE__ float sinh(float);
+__DEVICE__ double sqrt(double);
+__DEVICE__ float sqrt(float);
+__DEVICE__ double tan(double);
+__DEVICE__ float tan(float);
+__DEVICE__ double tanh(double);
+__DEVICE__ float tanh(float);
+__DEVICE__ double tgamma(double);
+__DEVICE__ float tgamma(float);
+__DEVICE__ double trunc(double);
+__DEVICE__ float trunc(float);
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+using ::abs;
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isinf;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnan;
+using ::isnormal;
+using ::isunordered;
+using ::labs;
+using ::ldexp;
+using ::lgamma;
+using ::llabs;
+using ::llrint;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::llround;
+using ::modf;
+using ::nan;
+using ::nanf;
+using ::nearbyint;
+using ::nextafter;
+using ::nexttoward;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#pragma pop_macro("__DEVICE__")
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_runtime_wrapper.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_runtime_wrapper.h
new file mode 100644
index 000000000..931d44b69
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__clang_cuda_runtime_wrapper.h
@@ -0,0 +1,347 @@
+/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * WARNING: This header is intended to be directly -include'd by
+ * the compiler and is not supposed to be included by users.
+ *
+ * CUDA headers are implemented in a way that currently makes it
+ * impossible for user code to #include directly when compiling with
+ * Clang. They present different view of CUDA-supplied functions
+ * depending on where in NVCC's compilation pipeline the headers are
+ * included. Neither of these modes provides function definitions with
+ * correct attributes, so we use preprocessor to force the headers
+ * into a form that Clang can use.
+ *
+ * Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
+ * this file during every CUDA compilation.
+ */
+
+#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
+#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
+
+#if defined(__CUDA__) && defined(__clang__)
+
+// Include some forward declares that must come before cmath.
+#include <__clang_cuda_math_forward_declares.h>
+
+// Include some standard headers to avoid CUDA headers including them
+// while some required macros (like __THROW) are in a weird state.
+#include <cmath>
+#include <cstdlib>
+#include <stdlib.h>
+
+// Preserve common macros that will be changed below by us or by CUDA
+// headers.
+#pragma push_macro("__THROW")
+#pragma push_macro("__CUDA_ARCH__")
+
+// WARNING: Preprocessor hacks below are based on specific details of
+// CUDA-7.x headers and are not expected to work with any other
+// version of CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000
+#error "Unsupported CUDA version!"
+#endif
+
+// Make largest subset of device functions available during host
+// compilation -- SM_35 for the time being.
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 350
+#endif
+
+#include "__clang_cuda_builtin_vars.h"
+
+// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
+// has taken care of builtin variables declared in the file.
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+// {math,device}_functions.h only have declarations of the
+// functions. We don't need them as we're going to pull in their
+// definitions from .hpp files.
+#define __DEVICE_FUNCTIONS_H__
+#define __MATH_FUNCTIONS_H__
+#define __COMMON_FUNCTIONS_H__
+
+#undef __CUDACC__
+#define __CUDABE__
+// Disables definitions of device-side runtime support stubs in
+// cuda_device_runtime_api.h
+#include "driver_types.h"
+#include "host_config.h"
+#include "host_defines.h"
+
+#undef __CUDABE__
+#define __CUDACC__
+#include "cuda_runtime.h"
+
+#undef __CUDACC__
+#define __CUDABE__
+
+// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
+// not have at the moment. Emulate them with a builtin memcpy/memset.
+#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
+#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
+
+#include "crt/device_runtime.h"
+#include "crt/host_runtime.h"
+// device_runtime.h defines __cxa_* macros that will conflict with
+// cxxabi.h.
+// FIXME: redefine these as __device__ functions.
+#undef __cxa_vec_ctor
+#undef __cxa_vec_cctor
+#undef __cxa_vec_dtor
+#undef __cxa_vec_new
+#undef __cxa_vec_new2
+#undef __cxa_vec_new3
+#undef __cxa_vec_delete2
+#undef __cxa_vec_delete
+#undef __cxa_vec_delete3
+#undef __cxa_pure_virtual
+
+// math_functions.hpp expects this host function be defined on MacOS, but it
+// ends up not being there because of the games we play here.  Just define it
+// ourselves; it's simple enough.
+#ifdef __APPLE__
+inline __host__ double __signbitd(double x) {
+  return std::signbit(x);
+}
+#endif
+
+// We need decls for functions in CUDA's libdevice with __device__
+// attribute only. Alas they come either as __host__ __device__ or
+// with no attributes at all. To work around that, define __CUDA_RTC__
+// which produces HD variant and undef __host__ which gives us desided
+// decls with __device__ attribute.
+#pragma push_macro("__host__")
+#define __host__
+#define __CUDACC_RTC__
+#include "device_functions_decls.h"
+#undef __CUDACC_RTC__
+
+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
+// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
+// Previous versions used to check whether they are defined or not.
+// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
+// here to detect the switch.
+
+#if defined(CU_DEVICE_INVALID)
+#if !defined(__USE_FAST_MATH__)
+#define __USE_FAST_MATH__ 0
+#endif
+
+#if !defined(__CUDA_PREC_DIV)
+#define __CUDA_PREC_DIV 0
+#endif
+#endif
+
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+#include "device_functions.hpp"
+
+// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
+// get the slow-but-accurate or fast-but-inaccurate versions of functions like
+// sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
+//
+// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
+// slow divides), so we need to scope our define carefully here.
+#pragma push_macro("__USE_FAST_MATH__")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __USE_FAST_MATH__ 1
+#endif
+#include "math_functions.hpp"
+#pragma pop_macro("__USE_FAST_MATH__")
+
+#include "math_functions_dbl_ptx3.hpp"
+#pragma pop_macro("__forceinline__")
+
+// Pull in host-only functions that are only available when neither
+// __CUDACC__ nor __CUDABE__ are defined.
+#undef __MATH_FUNCTIONS_HPP__
+#undef __CUDABE__
+#include "math_functions.hpp"
+// Alas, additional overloads for these functions are hard to get to.
+// Considering that we only need these overloads for a few functions,
+// we can provide them here.
+static inline float rsqrt(float __a) { return rsqrtf(__a); }
+static inline float rcbrt(float __a) { return rcbrtf(__a); }
+static inline float sinpi(float __a) { return sinpif(__a); }
+static inline float cospi(float __a) { return cospif(__a); }
+static inline void sincospi(float __a, float *__b, float *__c) {
+  return sincospif(__a, __b, __c);
+}
+static inline float erfcinv(float __a) { return erfcinvf(__a); }
+static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
+static inline float normcdf(float __a) { return normcdff(__a); }
+static inline float erfcx(float __a) { return erfcxf(__a); }
+
+// For some reason single-argument variant is not always declared by
+// CUDA headers. Alas, device_functions.hpp included below needs it.
+static inline __device__ void __brkpt(int __c) { __brkpt(); }
+
+// Now include *.hpp with definitions of various GPU functions.  Alas,
+// a lot of thins get declared/defined with __host__ attribute which
+// we don't want and we have to define it out. We also have to include
+// {device,math}_functions.hpp again in order to extract the other
+// branch of #if/else inside.
+
+#define __host__
+#undef __CUDABE__
+#define __CUDACC__
+#undef __DEVICE_FUNCTIONS_HPP__
+#include "device_atomic_functions.hpp"
+#include "device_functions.hpp"
+#include "sm_20_atomic_functions.hpp"
+#include "sm_20_intrinsics.hpp"
+#include "sm_32_atomic_functions.hpp"
+
+// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h.  These define the
+// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
+// define them using builtins so that the optimizer can reason about and across
+// these instructions.  In particular, using intrinsics for ldg gets us the
+// [addr+imm] addressing mode, which, although it doesn't actually exist in the
+// hardware, seems to generate faster machine code because ptxas can more easily
+// reason about our code.
+
+#if CUDA_VERSION >= 8000
+#include "sm_60_atomic_functions.hpp"
+#include "sm_61_intrinsics.hpp"
+#endif
+
+#undef __MATH_FUNCTIONS_HPP__
+
+// math_functions.hpp defines ::signbit as a __host__ __device__ function.  This
+// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
+// math_function.hpp's ::signbit.  It's guarded by #undef signbit, but that's
+// conditional on __GNUC__.  :)
+#pragma push_macro("signbit")
+#pragma push_macro("__GNUC__")
+#undef __GNUC__
+#define signbit __ignored_cuda_signbit
+#include "math_functions.hpp"
+#pragma pop_macro("__GNUC__")
+#pragma pop_macro("signbit")
+
+#pragma pop_macro("__host__")
+
+#include "texture_indirect_functions.h"
+
+// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
+#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("__THROW")
+
+// Set up compiler macros expected to be seen during compilation.
+#undef __CUDABE__
+#define __CUDACC__
+
+extern "C" {
+// Device-side CUDA system calls.
+// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
+// We need these declarations and wrappers for device-side
+// malloc/free/printf calls to work without relying on
+// -fcuda-disable-target-call-checks option.
+__device__ int vprintf(const char *, const char *);
+__device__ void free(void *) __attribute((nothrow));
+__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
+__device__ void __assertfail(const char *__message, const char *__file,
+                             unsigned __line, const char *__function,
+                             size_t __charSize) __attribute__((noreturn));
+
+// In order for standard assert() macro on linux to work we need to
+// provide device-side __assert_fail()
+__device__ static inline void __assert_fail(const char *__message,
+                                            const char *__file, unsigned __line,
+                                            const char *__function) {
+  __assertfail(__message, __file, __line, __function, sizeof(char));
+}
+
+// Clang will convert printf into vprintf, but we still need
+// device-side declaration for it.
+__device__ int printf(const char *, ...);
+} // extern "C"
+
+// We also need device-side std::malloc and std::free.
+namespace std {
+__device__ static inline void free(void *__ptr) { ::free(__ptr); }
+__device__ static inline void *malloc(size_t __size) {
+  return ::malloc(__size);
+}
+} // namespace std
+
+// Out-of-line implementations from __clang_cuda_builtin_vars.h.  These need to
+// come after we've pulled in the definition of uint3 and dim3.
+
+__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
+  uint3 ret;
+  ret.x = x;
+  ret.y = y;
+  ret.z = z;
+  return ret;
+}
+
+__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
+  uint3 ret;
+  ret.x = x;
+  ret.y = y;
+  ret.z = z;
+  return ret;
+}
+
+__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+#include <__clang_cuda_cmath.h>
+#include <__clang_cuda_intrinsics.h>
+#include <__clang_cuda_complex_builtins.h>
+
+// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
+// mode, giving them their "proper" types of dim3 and uint3.  This is
+// incompatible with the types we give in __clang_cuda_builtin_vars.h.  As as
+// hack, force-include the header (nvcc doesn't include it by default) but
+// redefine dim3 and uint3 to our builtin types.  (Thankfully dim3 and uint3 are
+// only used here for the redeclarations of blockDim and threadIdx.)
+#pragma push_macro("dim3")
+#pragma push_macro("uint3")
+#define dim3 __cuda_builtin_blockDim_t
+#define uint3 __cuda_builtin_threadIdx_t
+#include "curand_mtgp32_kernel.h"
+#pragma pop_macro("dim3")
+#pragma pop_macro("uint3")
+#pragma pop_macro("__USE_FAST_MATH__")
+
+#endif // __CUDA__
+#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__stddef_max_align_t.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__stddef_max_align_t.h
new file mode 100644
index 000000000..1e10ca986
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__stddef_max_align_t.h
@@ -0,0 +1,43 @@
+/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
+ *
+ * Copyright (c) 2014 Chandler Carruth
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_MAX_ALIGN_T_DEFINED
+#define __CLANG_MAX_ALIGN_T_DEFINED
+
+#if defined(_MSC_VER)
+typedef double max_align_t;
+#elif defined(__APPLE__)
+typedef long double max_align_t;
+#else
+// Define 'max_align_t' to match the GCC definition.
+typedef struct {
+  long long __clang_max_align_nonce1
+      __attribute__((__aligned__(__alignof__(long long))));
+  long double __clang_max_align_nonce2
+      __attribute__((__aligned__(__alignof__(long double))));
+} max_align_t;
+#endif
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__wmmintrin_aes.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__wmmintrin_aes.h
new file mode 100644
index 000000000..3a2ee1b2e
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__wmmintrin_aes.h
@@ -0,0 +1,151 @@
+/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_AES_H
+#define _WMMINTRIN_AES_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
+
+/// \brief Performs a single round of AES encryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenc_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs the final round of AES encryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs a single round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdec_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs the final round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdeclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Applies the AES InvMixColumns() transformation to an expanded key
+///    contained in the source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the expanded key.
+/// \returns A 128-bit integer vector containing the transformed value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesimc_si128(__m128i __V)
+{
+  return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
+}
+
+/// \brief Generates a round key for AES encyption, operating on 128-bit data
+///    specified in the first source operand and using an 8-bit round constant
+///    specified by the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
+///
+/// \param C
+///    A 128-bit integer vector that is used to generate the AES encryption key.
+/// \param R
+///    An 8-bit round constant used to generate the AES encryption key.
+/// \returns A 128-bit round key for AES encryption.
+#define _mm_aeskeygenassist_si128(C, R) \
+  (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif  /* _WMMINTRIN_AES_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__wmmintrin_pclmul.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__wmmintrin_pclmul.h
new file mode 100644
index 000000000..e9c6a9f6d
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/__wmmintrin_pclmul.h
@@ -0,0 +1,57 @@
+/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_PCLMUL_H
+#define _WMMINTRIN_PCLMUL_H
+
+/// \brief Multiplies two 64-bit integer values, which are selected from source
+///    operands using the immediate-value operand. The multiplication is a
+///    carry-less multiplication, and the 128-bit integer product is stored in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
+///
+/// \param __X
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __Y
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __I
+///    An immediate value specifying which 64-bit values to select from the
+///    operands. Bit 0 is used to select a value from operand \a __X, and bit
+///    4 is used to select a value from operand \a __Y: \n
+///    Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
+///    Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
+///    Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
+///    Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
+/// \returns The 128-bit integer vector containing the result of the carry-less
+///    multiplication of the selected 64-bit values.
+#define _mm_clmulepi64_si128(__X, __Y, __I) \
+  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
+                                        (__v2di)(__m128i)(__Y), (char)(__I)))
+
+#endif /* _WMMINTRIN_PCLMUL_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/adxintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/adxintrin.h
new file mode 100644
index 000000000..ee3472841
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/adxintrin.h
@@ -0,0 +1,86 @@
+/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __ADXINTRIN_H
+#define __ADXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Intrinsics that are available only if __ADX__ defined */
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+               unsigned int *__p)
+{
+  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+#endif
+
+/* Intrinsics that are also available if __ADX__ undefined */
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_addcarry_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+              unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarry_u64(__cf, __x, __y, __p);
+}
+#endif
+
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADXINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/altivec.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/altivec.h
new file mode 100644
index 000000000..a01d9d837
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/altivec.h
@@ -0,0 +1,16733 @@
+/*===---- altivec.h - Standard header for type generic math ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __ALTIVEC_H
+#define __ALTIVEC_H
+
+#ifndef __ALTIVEC__
+#error "AltiVec support not enabled"
+#endif
+
+/* Constants for mapping CR6 bits to predicate result. */
+
+#define __CR6_EQ 0
+#define __CR6_EQ_REV 1
+#define __CR6_LT 2
+#define __CR6_LT_REV 3
+
+/* Constants for vec_test_data_class */
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
+#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \
+                                  __VEC_CLASS_FP_SUBNORMAL_N)
+#define __VEC_CLASS_FP_ZERO_N (1<<2)
+#define __VEC_CLASS_FP_ZERO_P (1<<3)
+#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P           | \
+                             __VEC_CLASS_FP_ZERO_N)
+#define __VEC_CLASS_FP_INFINITY_N (1<<4)
+#define __VEC_CLASS_FP_INFINITY_P (1<<5)
+#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P   | \
+                                 __VEC_CLASS_FP_INFINITY_N)
+#define __VEC_CLASS_FP_NAN (1<<6)
+#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN        | \
+                                   __VEC_CLASS_FP_SUBNORMAL  | \
+                                   __VEC_CLASS_FP_ZERO       | \
+                                   __VEC_CLASS_FP_INFINITY)
+
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+#ifdef __POWER9_VECTOR__
+#include <stddef.h>
+#endif
+
+static __inline__ vector signed char __ATTRS_o_ai vec_perm(
+    vector signed char __a, vector signed char __b, vector unsigned char __c);
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c);
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c);
+
+static __inline__ vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                                     vector signed short __b,
+                                                     vector unsigned char __c);
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c);
+
+static __inline__ vector bool short __ATTRS_o_ai vec_perm(
+    vector bool short __a, vector bool short __b, vector unsigned char __c);
+
+static __inline__ vector pixel __ATTRS_o_ai vec_perm(vector pixel __a,
+                                                     vector pixel __b,
+                                                     vector unsigned char __c);
+
+static __inline__ vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                                   vector signed int __b,
+                                                   vector unsigned char __c);
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_perm(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned char __c);
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c);
+
+static __inline__ vector float __ATTRS_o_ai vec_perm(vector float __a,
+                                                     vector float __b,
+                                                     vector unsigned char __c);
+
+#ifdef __VSX__
+static __inline__ vector long long __ATTRS_o_ai
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c);
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c);
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c);
+
+static __inline__ vector double __ATTRS_o_ai vec_perm(vector double __a,
+                                                      vector double __b,
+                                                      vector unsigned char __c);
+#endif
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b);
+
+/* vec_abs */
+
+#define __builtin_altivec_abs_v16qi vec_abs
+#define __builtin_altivec_abs_v8hi vec_abs
+#define __builtin_altivec_abs_v4si vec_abs
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_abs(vector signed char __a) {
+  return __builtin_altivec_vmaxsb(__a, -__a);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_abs(vector signed short __a) {
+  return __builtin_altivec_vmaxsh(__a, -__a);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_abs(vector signed int __a) {
+  return __builtin_altivec_vmaxsw(__a, -__a);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_abs(vector signed long long __a) {
+  return __builtin_altivec_vmaxsd(__a, -__a);
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_abs(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvabssp(__a);
+#else
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)(0x7FFFFFFF);
+  return (vector float)__res;
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_abs(vector double __a) {
+  return __builtin_vsx_xvabsdp(__a);
+}
+#endif
+
+/* vec_abss */
+#define __builtin_altivec_abss_v16qi vec_abss
+#define __builtin_altivec_abss_v8hi vec_abss
+#define __builtin_altivec_abss_v4si vec_abss
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_abss(vector signed char __a) {
+  return __builtin_altivec_vmaxsb(
+      __a, __builtin_altivec_vsubsbs((vector signed char)(0), __a));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_abss(vector signed short __a) {
+  return __builtin_altivec_vmaxsh(
+      __a, __builtin_altivec_vsubshs((vector signed short)(0), __a));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_abss(vector signed int __a) {
+  return __builtin_altivec_vmaxsw(
+      __a, __builtin_altivec_vsubsws((vector signed int)(0), __a));
+}
+
+/* vec_absd */
+#if defined(__POWER9_VECTOR__)
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_absd(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vabsdub(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_absd(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vabsduh(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_absd(vector unsigned int __a,  vector unsigned int __b) {
+  return __builtin_altivec_vabsduw(__a, __b);
+}
+
+#endif /* End __POWER9_VECTOR__ */
+
+/* vec_add */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector signed char __b) {
+  return __a + __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_add(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a + __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_add(vector signed char __a, vector bool char __b) {
+  return __a + (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector unsigned char __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_add(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a + __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_add(vector unsigned char __a, vector bool char __b) {
+  return __a + (vector unsigned char)__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_add(vector short __a,
+                                                    vector short __b) {
+  return __a + __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_add(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a + __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_add(vector short __a,
+                                                    vector bool short __b) {
+  return __a + (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector unsigned short __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_add(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a + __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_add(vector unsigned short __a, vector bool short __b) {
+  return __a + (vector unsigned short)__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_add(vector int __a,
+                                                  vector int __b) {
+  return __a + __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_add(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a + __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_add(vector int __a,
+                                                  vector bool int __b) {
+  return __a + (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector unsigned int __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_add(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a + __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_add(vector unsigned int __a, vector bool int __b) {
+  return __a + (vector unsigned int)__b;
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_add(vector signed long long __a, vector signed long long __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_add(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a + __b;
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_add(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_add(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a + __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+static __inline__ vector float __ATTRS_o_ai vec_add(vector float __a,
+                                                    vector float __b) {
+  return __a + __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_add(vector double __a,
+                                                     vector double __b) {
+  return __a + __b;
+}
+#endif // __VSX__
+
+/* vec_adde */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_adde(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_adde(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+#endif
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_adde(vector signed int __a, vector signed int __b,
+         vector signed int __c) {
+  vector signed int __mask = {1, 1, 1, 1};
+  vector signed int __carry = __c & __mask;
+  return vec_add(vec_add(__a, __b), __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adde(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  vector unsigned int __mask = {1, 1, 1, 1};
+  vector unsigned int __carry = __c & __mask;
+  return vec_add(vec_add(__a, __b), __carry);
+}
+
+/* vec_addec */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_addec(vector signed __int128 __a, vector signed __int128 __b,
+          vector signed __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_addec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+          vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_addec(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+
+  signed int __result[4];
+  for (int i = 0; i < 4; i++) {
+    unsigned int __tempa = (unsigned int) __a[i];
+    unsigned int __tempb = (unsigned int) __b[i];
+    unsigned int __tempc = (unsigned int) __c[i];
+    __tempc = __tempc & 0x00000001;
+    unsigned long long __longa = (unsigned long long) __tempa;
+    unsigned long long __longb = (unsigned long long) __tempb;
+    unsigned long long __longc = (unsigned long long) __tempc;
+    unsigned long long __sum = __longa + __longb + __longc;
+    unsigned long long __res = (__sum >> 32) & 0x01;
+    unsigned long long __tempres = (unsigned int) __res;
+    __result[i] = (signed int) __tempres;
+  }
+
+  vector signed int ret = { __result[0], __result[1], __result[2], __result[3] };
+  return ret;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_addec(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+
+  unsigned int __result[4];
+  for (int i = 0; i < 4; i++) {
+    unsigned int __tempc = __c[i] & 1;
+    unsigned long long __longa = (unsigned long long) __a[i];
+    unsigned long long __longb = (unsigned long long) __b[i];
+    unsigned long long __longc = (unsigned long long) __tempc;
+    unsigned long long __sum = __longa + __longb + __longc;
+    unsigned long long __res = (__sum >> 32) & 0x01;
+    unsigned long long __tempres = (unsigned int) __res;
+    __result[i] = (signed int) __tempres;
+  }
+
+  vector unsigned int ret = { __result[0], __result[1], __result[2], __result[3] };
+  return ret;
+}
+
+#endif
+
+/* vec_vaddubm */
+
+#define __builtin_altivec_vaddubm vec_vaddubm
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector signed char __b) {
+  return __a + __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a + __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddubm(vector signed char __a, vector bool char __b) {
+  return __a + (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector unsigned char __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a + __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubm(vector unsigned char __a, vector bool char __b) {
+  return __a + (vector unsigned char)__b;
+}
+
+/* vec_vadduhm */
+
+#define __builtin_altivec_vadduhm vec_vadduhm
+
+static __inline__ vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                                        vector short __b) {
+  return __a + __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vadduhm(vector bool short __a,
+                                                        vector short __b) {
+  return (vector short)__a + __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vadduhm(vector short __a,
+                                                        vector bool short __b) {
+  return __a + (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector unsigned short __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a + __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhm(vector unsigned short __a, vector bool short __b) {
+  return __a + (vector unsigned short)__b;
+}
+
+/* vec_vadduwm */
+
+#define __builtin_altivec_vadduwm vec_vadduwm
+
+static __inline__ vector int __ATTRS_o_ai vec_vadduwm(vector int __a,
+                                                      vector int __b) {
+  return __a + __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vadduwm(vector bool int __a,
+                                                      vector int __b) {
+  return (vector int)__a + __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vadduwm(vector int __a,
+                                                      vector bool int __b) {
+  return __a + (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector unsigned int __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a + __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduwm(vector unsigned int __a, vector bool int __b) {
+  return __a + (vector unsigned int)__b;
+}
+
+/* vec_vaddfp */
+
+#define __builtin_altivec_vaddfp vec_vaddfp
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vaddfp(vector float __a, vector float __b) {
+  return __a + __b;
+}
+
+/* vec_addc */
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_addc(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_altivec_vaddcuw((vector unsigned int)__a,
+                                                      (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_addc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_addc(vector signed __int128 __a, vector signed __int128 __b) {
+  return (vector signed __int128)__builtin_altivec_vaddcuq(
+      (vector unsigned __int128)__a, (vector unsigned __int128)__b);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_addc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_vaddcuw */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vaddcuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vaddcuw(__a, __b);
+}
+
+/* vec_adds */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_adds(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_adds(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_adds(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_adds(vector short __a,
+                                                     vector short __b) {
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_adds(vector bool short __a,
+                                                     vector short __b) {
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_adds(vector short __a,
+                                                     vector bool short __b) {
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_adds(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_adds(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_adds(vector int __a,
+                                                   vector int __b) {
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_adds(vector bool int __a,
+                                                   vector int __b) {
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_adds(vector int __a,
+                                                   vector bool int __b) {
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adds(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_adds(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vaddsbs */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vaddsbs(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vaddsbs((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vaddsbs(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vaddsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vaddubs */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vaddubs(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vaddubs((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vaddubs(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vaddubs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vaddshs */
+
+static __inline__ vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                                        vector short __b) {
+  return __builtin_altivec_vaddshs(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vaddshs(vector bool short __a,
+                                                        vector short __b) {
+  return __builtin_altivec_vaddshs((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vaddshs(vector short __a,
+                                                        vector bool short __b) {
+  return __builtin_altivec_vaddshs(__a, (vector short)__b);
+}
+
+/* vec_vadduhs */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vadduhs((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vadduhs(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vadduhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vaddsws */
+
+static __inline__ vector int __ATTRS_o_ai vec_vaddsws(vector int __a,
+                                                      vector int __b) {
+  return __builtin_altivec_vaddsws(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vaddsws(vector bool int __a,
+                                                      vector int __b) {
+  return __builtin_altivec_vaddsws((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vaddsws(vector int __a,
+                                                      vector bool int __b) {
+  return __builtin_altivec_vaddsws(__a, (vector int)__b);
+}
+
+/* vec_vadduws */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vadduws(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vadduws((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vadduws(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vadduws(__a, (vector unsigned int)__b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vadduqm */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vadduqm(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a + __b;
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vadduqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a + __b;
+}
+
+/* vec_vaddeuqm */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vaddeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddeuqm(__a, __b, __c);
+}
+
+/* vec_vaddcuq */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vaddcuq(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vaddcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vaddcuq(__a, __b);
+}
+
+/* vec_vaddecuq */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vaddecuq(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vaddecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vaddecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_and */
+
+#define __builtin_altivec_vand vec_and
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector signed char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_and(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a & __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_and(vector signed char __a, vector bool char __b) {
+  return __a & (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector unsigned char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_and(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a & __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_and(vector unsigned char __a, vector bool char __b) {
+  return __a & (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_and(vector bool char __a,
+                                                        vector bool char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_and(vector short __a,
+                                                    vector short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_and(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_and(vector short __a,
+                                                    vector bool short __b) {
+  return __a & (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector unsigned short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_and(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a & __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_and(vector unsigned short __a, vector bool short __b) {
+  return __a & (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_and(vector bool short __a, vector bool short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_and(vector int __a,
+                                                  vector int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_and(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_and(vector int __a,
+                                                  vector bool int __b) {
+  return __a & (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector unsigned int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_and(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a & __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_and(vector unsigned int __a, vector bool int __b) {
+  return __a & (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_and(vector bool int __a,
+                                                       vector bool int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_and(vector float __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_and(vector bool int __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_and(vector float __a,
+                                                    vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_and(vector bool long long __a,
+                                                     vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_and(vector double __a, vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_and(vector double __a,
+                                                     vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_and(vector signed long long __a, vector signed long long __b) {
+  return __a & __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_and(vector signed long long __a, vector bool long long __b) {
+  return __a & (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_and(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_and(vector unsigned long long __a, vector bool long long __b) {
+  return __a & (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_and(vector bool long long __a, vector bool long long __b) {
+  return __a & __b;
+}
+#endif
+
+/* vec_vand */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector signed char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a & __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vand(vector signed char __a, vector bool char __b) {
+  return __a & (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector unsigned char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vand(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a & __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vand(vector unsigned char __a, vector bool char __b) {
+  return __a & (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vand(vector bool char __a,
+                                                         vector bool char __b) {
+  return __a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vand(vector short __a,
+                                                     vector short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vand(vector bool short __a,
+                                                     vector short __b) {
+  return (vector short)__a & __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vand(vector short __a,
+                                                     vector bool short __b) {
+  return __a & (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector unsigned short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a & __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vand(vector unsigned short __a, vector bool short __b) {
+  return __a & (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vand(vector bool short __a, vector bool short __b) {
+  return __a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vand(vector int __a,
+                                                   vector int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                   vector int __b) {
+  return (vector int)__a & __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vand(vector int __a,
+                                                   vector bool int __b) {
+  return __a & (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector unsigned int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vand(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a & __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vand(vector unsigned int __a, vector bool int __b) {
+  return __a & (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                        vector bool int __b) {
+  return __a & __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vand(vector float __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vand(vector bool int __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vand(vector float __a,
+                                                     vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vand(vector signed long long __a, vector signed long long __b) {
+  return __a & __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vand(vector signed long long __a, vector bool long long __b) {
+  return __a & (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vand(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vand(vector unsigned long long __a, vector bool long long __b) {
+  return __a & (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vand(vector bool long long __a, vector bool long long __b) {
+  return __a & __b;
+}
+#endif
+
+/* vec_andc */
+
+#define __builtin_altivec_vandc vec_andc
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a & ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_andc(vector signed char __a, vector bool char __b) {
+  return __a & ~(vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_andc(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a & ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_andc(vector unsigned char __a, vector bool char __b) {
+  return __a & ~(vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_andc(vector bool char __a,
+                                                         vector bool char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_andc(vector short __a,
+                                                     vector short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_andc(vector bool short __a,
+                                                     vector short __b) {
+  return (vector short)__a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_andc(vector short __a,
+                                                     vector bool short __b) {
+  return __a & ~(vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a & ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_andc(vector unsigned short __a, vector bool short __b) {
+  return __a & ~(vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_andc(vector bool short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_andc(vector int __a,
+                                                   vector int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                   vector int __b) {
+  return (vector int)__a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_andc(vector int __a,
+                                                   vector bool int __b) {
+  return __a & ~(vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_andc(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a & ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_andc(vector unsigned int __a, vector bool int __b) {
+  return __a & ~(vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                        vector bool int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_andc(vector float __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_andc(vector bool int __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_andc(vector float __a,
+                                                     vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_andc(vector bool long long __a,
+                                                      vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_andc(vector double __a, vector bool long long __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_andc(vector double __a,
+                                                      vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a & ~(vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_andc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_andc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~(vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_andc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~(vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_andc(vector bool long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+#endif
+
+/* vec_vandc */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a & ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vandc(vector signed char __a, vector bool char __b) {
+  return __a & ~(vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a & ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vandc(vector unsigned char __a, vector bool char __b) {
+  return __a & ~(vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vandc(vector bool char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vandc(vector short __a,
+                                                      vector short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vandc(vector bool short __a,
+                                                      vector short __b) {
+  return (vector short)__a & ~__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vandc(vector short __a,
+                                                      vector bool short __b) {
+  return __a & ~(vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a & ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vandc(vector unsigned short __a, vector bool short __b) {
+  return __a & ~(vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vandc(vector bool short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vandc(vector int __a,
+                                                    vector int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                    vector int __b) {
+  return (vector int)__a & ~__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vandc(vector int __a,
+                                                    vector bool int __b) {
+  return __a & ~(vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vandc(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a & ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vandc(vector unsigned int __a, vector bool int __b) {
+  return __a & ~(vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                         vector bool int __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vandc(vector float __a,
+                                                      vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vandc(vector bool int __a,
+                                                      vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vandc(vector float __a,
+                                                      vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a & ~(vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vandc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a & ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vandc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~(vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a & ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vandc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~(vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vandc(vector bool long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+#endif
+
+/* vec_avg */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_avg(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_avg(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_avg(vector short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_avg(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_avg(vector int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_avg(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_vavgsb */
+
+static __inline__ vector signed char __attribute__((__always_inline__))
+vec_vavgsb(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vavgsb(__a, __b);
+}
+
+/* vec_vavgub */
+
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+vec_vavgub(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vavgub(__a, __b);
+}
+
+/* vec_vavgsh */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_vavgsh(vector short __a, vector short __b) {
+  return __builtin_altivec_vavgsh(__a, __b);
+}
+
+/* vec_vavguh */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_vavguh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vavguh(__a, __b);
+}
+
+/* vec_vavgsw */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vavgsw(vector int __a, vector int __b) {
+  return __builtin_altivec_vavgsw(__a, __b);
+}
+
+/* vec_vavguw */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vavguw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vavguw(__a, __b);
+}
+
+/* vec_ceil */
+
+static __inline__ vector float __ATTRS_o_ai vec_ceil(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspip(__a);
+#else
+  return __builtin_altivec_vrfip(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_ceil(vector double __a) {
+  return __builtin_vsx_xvrdpip(__a);
+}
+#endif
+
+/* vec_vrfip */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrfip(vector float __a) {
+  return __builtin_altivec_vrfip(__a);
+}
+
+/* vec_cmpb */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_cmpb(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_vcmpbfp */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vcmpbfp(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp(__a, __b);
+}
+
+/* vec_cmpeq */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)__builtin_altivec_vcmpequb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_cmpeq(vector short __a,
+                                                           vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)__builtin_altivec_vcmpequh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector int __a,
+                                                         vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector bool int __a,
+                                                         vector bool int __b) {
+  return (vector bool int)__builtin_altivec_vcmpequw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(__a, __b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(
+      (vector long long)__a, (vector long long)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpequd(
+      (vector long long)__a, (vector long long)__b);
+}
+
+#endif
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpeq(vector float __a,
+                                                         vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpeqsp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpeqfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpeqdp(__a, __b);
+}
+#endif
+
+#ifdef __POWER9_VECTOR__
+/* vec_cmpne */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector bool char __a, vector bool char __b) {
+  return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+                                                     (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+                                                     (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpneb((vector char)__a,
+                                                     (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector bool short __a, vector bool short __b) {
+  return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+                                                      (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector signed short __a, vector signed short __b) {
+  return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+                                                      (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpneh((vector short)__a,
+                                                      (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector bool int __a, vector bool int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector signed int __a, vector signed int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector double __a, vector double __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+/* vec_cmpnez */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpnez(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpnezb((vector char)__a,
+                                                      (vector char)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector signed short __a, vector signed short __b) {
+  return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpnez(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpnezh((vector short)__a,
+                                                       (vector short)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector signed int __a, vector signed int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpnez(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpnezw((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vctzlsbb(__a);
+#else
+  return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cntlz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vctzlsbb(__a);
+#else
+  return __builtin_altivec_vclzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclzlsbb(__a);
+#else
+  return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
+static __inline__ signed int __ATTRS_o_ai
+vec_cnttz_lsbb(vector unsigned char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vclzlsbb(__a);
+#else
+  return __builtin_altivec_vctzlsbb(__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned int __a) {
+  return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_parity_lsbb(vector signed int __a) {
+  return __builtin_altivec_vprtybw(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned __int128 __a) {
+  return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_parity_lsbb(vector signed __int128 __a) {
+  return __builtin_altivec_vprtybq(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector unsigned long long __a) {
+  return __builtin_altivec_vprtybd(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_parity_lsbb(vector signed long long __a) {
+  return __builtin_altivec_vprtybd(__a);
+}
+
+#endif
+
+/* vec_cmpgt */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpgt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_cmpgt(vector short __a,
+                                                           vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpgt(vector int __a,
+                                                         vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpgtsd(__a, __b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)__builtin_altivec_vcmpgtud(__a, __b);
+}
+#endif
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpgt(vector float __a,
+                                                         vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgtsp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgtdp(__a, __b);
+}
+#endif
+
+/* vec_cmpge */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpge(vector signed char __a, vector signed char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpge(vector signed short __a, vector signed short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpge(vector signed int __a, vector signed int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmpge(vector float __a,
+                                                         vector float __b) {
+#ifdef __VSX__
+  return (vector bool int)__builtin_vsx_xvcmpgesp(__a, __b);
+#else
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)__builtin_vsx_xvcmpgedp(__a, __b);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(vec_cmpgt(__b, __a));
+}
+#endif
+
+/* vec_vcmpgefp */
+
+static __inline__ vector bool int __attribute__((__always_inline__))
+vec_vcmpgefp(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpgefp(__a, __b);
+}
+
+/* vec_vcmpgtsb */
+
+static __inline__ vector bool char __attribute__((__always_inline__))
+vec_vcmpgtsb(vector signed char __a, vector signed char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtsb(__a, __b);
+}
+
+/* vec_vcmpgtub */
+
+static __inline__ vector bool char __attribute__((__always_inline__))
+vec_vcmpgtub(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vcmpgtub(__a, __b);
+}
+
+/* vec_vcmpgtsh */
+
+static __inline__ vector bool short __attribute__((__always_inline__))
+vec_vcmpgtsh(vector short __a, vector short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtsh(__a, __b);
+}
+
+/* vec_vcmpgtuh */
+
+static __inline__ vector bool short __attribute__((__always_inline__))
+vec_vcmpgtuh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vcmpgtuh(__a, __b);
+}
+
+/* vec_vcmpgtsw */
+
+static __inline__ vector bool int __attribute__((__always_inline__))
+vec_vcmpgtsw(vector int __a, vector int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtsw(__a, __b);
+}
+
+/* vec_vcmpgtuw */
+
+static __inline__ vector bool int __attribute__((__always_inline__))
+vec_vcmpgtuw(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtuw(__a, __b);
+}
+
+/* vec_vcmpgtfp */
+
+static __inline__ vector bool int __attribute__((__always_inline__))
+vec_vcmpgtfp(vector float __a, vector float __b) {
+  return (vector bool int)__builtin_altivec_vcmpgtfp(__a, __b);
+}
+
+/* vec_cmple */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmple(vector signed char __a, vector signed char __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmple(vector signed short __a, vector signed short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmple(vector signed int __a, vector signed int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmple(vector float __a,
+                                                         vector float __b) {
+  return vec_cmpge(__b, __a);
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmple(vector double __a, vector double __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpge(__b, __a);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpge(__b, __a);
+}
+#endif
+
+/* vec_cmplt */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmplt(vector signed char __a, vector signed char __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmplt(vector unsigned char __a, vector unsigned char __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_cmplt(vector short __a,
+                                                           vector short __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmplt(vector unsigned short __a, vector unsigned short __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmplt(vector int __a,
+                                                         vector int __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmplt(vector unsigned int __a, vector unsigned int __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_cmplt(vector float __a,
+                                                         vector float __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmplt(vector double __a, vector double __b) {
+  return vec_cmpgt(__b, __a);
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_cmpgt(__b, __a);
+}
+
+/* vec_popcnt */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_popcnt(vector signed char __a) {
+  return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_popcnt(vector unsigned char __a) {
+  return __builtin_altivec_vpopcntb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_popcnt(vector signed short __a) {
+  return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_popcnt(vector unsigned short __a) {
+  return __builtin_altivec_vpopcnth(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_popcnt(vector signed int __a) {
+  return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_popcnt(vector unsigned int __a) {
+  return __builtin_altivec_vpopcntw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_popcnt(vector signed long long __a) {
+  return __builtin_altivec_vpopcntd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_popcnt(vector unsigned long long __a) {
+  return __builtin_altivec_vpopcntd(__a);
+}
+
+/* vec_cntlz */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_cntlz(vector signed char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_cntlz(vector unsigned char __a) {
+  return __builtin_altivec_vclzb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_cntlz(vector signed short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_cntlz(vector unsigned short __a) {
+  return __builtin_altivec_vclzh(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_cntlz(vector signed int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_cntlz(vector unsigned int __a) {
+  return __builtin_altivec_vclzw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cntlz(vector signed long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_altivec_vclzd(__a);
+}
+#endif
+
+#ifdef __POWER9_VECTOR__
+
+/* vec_cnttz */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_cnttz(vector signed char __a) {
+  return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_cnttz(vector unsigned char __a) {
+  return __builtin_altivec_vctzb(__a);
+}
+static __inline__ vector signed short __ATTRS_o_ai
+vec_cnttz(vector signed short __a) {
+  return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_cnttz(vector unsigned short __a) {
+  return __builtin_altivec_vctzh(__a);
+}
+static __inline__ vector signed int __ATTRS_o_ai
+vec_cnttz(vector signed int __a) {
+  return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_cnttz(vector unsigned int __a) {
+  return __builtin_altivec_vctzw(__a);
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cnttz(vector signed long long __a) {
+  return __builtin_altivec_vctzd(__a);
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_cnttz(vector unsigned long long __a) {
+  return __builtin_altivec_vctzd(__a);
+}
+
+/* vec_first_match_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed char __a, vector signed char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned char __a, vector unsigned char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed short __a, vector signed short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector signed int __a, vector signed int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_index(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpeq(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpeq(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+/* vec_first_match_or_eos_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed char __a, vector signed char __b) {
+  /* Compare the result of the comparison of two vectors with either and OR the
+     result. Either the elements are equal or one will equal the comparison
+     result if either is zero.
+  */
+  vector bool char __tmp1 = vec_cmpeq(__a, __b);
+  vector bool char __tmp2 = __tmp1 |
+                            vec_cmpeq((vector signed char)__tmp1, __a) |
+                            vec_cmpeq((vector signed char)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned char __a,
+                             vector unsigned char __b) {
+  vector bool char __tmp1 = vec_cmpeq(__a, __b);
+  vector bool char __tmp2 = __tmp1 |
+                            vec_cmpeq((vector unsigned char)__tmp1, __a) |
+                            vec_cmpeq((vector unsigned char)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed short __a, vector signed short __b) {
+  vector bool short __tmp1 = vec_cmpeq(__a, __b);
+  vector bool short __tmp2 = __tmp1 |
+                             vec_cmpeq((vector signed short)__tmp1, __a) |
+                             vec_cmpeq((vector signed short)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned short __a,
+                             vector unsigned short __b) {
+  vector bool short __tmp1 = vec_cmpeq(__a, __b);
+  vector bool short __tmp2 = __tmp1 |
+                             vec_cmpeq((vector unsigned short)__tmp1, __a) |
+                             vec_cmpeq((vector unsigned short)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector signed int __a, vector signed int __b) {
+  vector bool int __tmp1 = vec_cmpeq(__a, __b);
+  vector bool int __tmp2 = __tmp1 | vec_cmpeq((vector signed int)__tmp1, __a) |
+                           vec_cmpeq((vector signed int)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+      vec_cnttz((vector unsigned long long)__tmp2);
+#else
+      vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_match_or_eos_index(vector unsigned int __a, vector unsigned int __b) {
+  vector bool int __tmp1 = vec_cmpeq(__a, __b);
+  vector bool int __tmp2 = __tmp1 |
+                           vec_cmpeq((vector unsigned int)__tmp1, __a) |
+                           vec_cmpeq((vector unsigned int)__tmp1, __b);
+
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)__tmp2);
+#else
+    vec_cntlz((vector unsigned long long)__tmp2);
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed char __a, vector signed char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned char __a, vector unsigned char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed short __a, vector signed short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector signed int __a, vector signed int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_index(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpne(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpne(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+/* vec_first_mismatch_or_eos_index */
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed char __a,
+                                vector signed char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned char __a,
+                                vector unsigned char __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 3;
+  }
+  return __res[0] >> 3;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed short __a,
+                                vector signed short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned short __a,
+                                vector unsigned short __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 4;
+  }
+  return __res[0] >> 4;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector signed int __a, vector signed int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ unsigned __ATTRS_o_ai
+vec_first_mismatch_or_eos_index(vector unsigned int __a,
+                                vector unsigned int __b) {
+  vector unsigned long long __res =
+#ifdef __LITTLE_ENDIAN__
+    vec_cnttz((vector unsigned long long)vec_cmpnez(__a, __b));
+#else
+    vec_cntlz((vector unsigned long long)vec_cmpnez(__a, __b));
+#endif
+  if (__res[0] == 64) {
+    return (__res[1] + 64) >> 5;
+  }
+  return __res[0] >> 5;
+}
+
+static __inline__ vector double  __ATTRS_o_ai
+vec_insert_exp(vector double __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp((vector unsigned long long)__a,__b);
+}
+
+static __inline__ vector double  __ATTRS_o_ai
+vec_insert_exp(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_vsx_xviexpdp(__a,__b);
+}
+
+static __inline__ vector float  __ATTRS_o_ai
+vec_insert_exp(vector float __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp((vector unsigned int)__a,__b);
+}
+
+static __inline__ vector float  __ATTRS_o_ai
+vec_insert_exp(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_vsx_xviexpsp(__a,__b);
+}
+
+#if defined(__powerpc64__)
+static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a,
+                                                             size_t __b) {
+  return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_len(unsigned char *__a, size_t __b) {
+  return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a,
+                                                              size_t __b) {
+  return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_len(unsigned short *__a, size_t __b) {
+  return (vector unsigned short)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a,
+                                                            size_t __b) {
+  return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a,
+                                                              size_t __b) {
+  return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) {
+  return (vector float)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_len(signed __int128 *__a, size_t __b) {
+  return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_len(unsigned __int128 *__a, size_t __b) {
+  return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_len(signed long long *__a, size_t __b) {
+  return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_len(unsigned long long *__a, size_t __b) {
+  return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a,
+                                                        size_t __b) {
+  return (vector double)__builtin_vsx_lxvl(__a, (__b << 56));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xl_len_r(unsigned char *__a,
+                                                          size_t __b) {
+  vector unsigned char __res =
+      (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __mask =
+      (vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL);
+  __res = (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__res, (vector int)__res, __mask);
+#endif
+  return __res;
+}
+
+// vec_xst_len
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned char __a,
+                                                unsigned char *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed char __a,
+                                                signed char *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed short __a,
+                                                signed short *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned short __a,
+                                                unsigned short *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed int __a,
+                                                signed int *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned int __a,
+                                                unsigned int *__b, size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector float __a, float *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed __int128 __a,
+                                                signed __int128 *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned __int128 __a,
+                                                unsigned __int128 *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector signed long long __a,
+                                                signed long long *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector unsigned long long __a,
+                                                unsigned long long *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len(vector double __a, double *__b,
+                                                size_t __c) {
+  return __builtin_vsx_stxvl((vector int)__a, __b, (__c << 56));
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a,
+                                                  unsigned char *__b,
+                                                  size_t __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __mask =
+      (vector unsigned char)__builtin_altivec_lvsl(16 - __c, (int *)NULL);
+  vector unsigned char __res =
+      __builtin_altivec_vperm_4si((vector int)__a, (vector int)__a, __mask);
+  return __builtin_vsx_stxvll((vector int)__res, __b, (__c << 56));
+#else
+  return __builtin_vsx_stxvll((vector int)__a, __b, (__c << 56));
+#endif
+}
+#endif
+#endif
+
+/* vec_cpsgn */
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai vec_cpsgn(vector float __a,
+                                                      vector float __b) {
+  return __builtin_vsx_xvcpsgnsp(__a, __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
+                                                       vector double __b) {
+  return __builtin_vsx_xvcpsgndp(__a, __b);
+}
+#endif
+
+/* vec_ctf */
+
+static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a,
+                                                    int __b) {
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai
+vec_ctf(vector unsigned long long __a, int __b) {
+  vector double __ret = __builtin_convertvector(__a, vector double);
+  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_ctf(vector signed long long __a, int __b) {
+  vector double __ret = __builtin_convertvector(__a, vector double);
+  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __ret;
+}
+#endif
+
+/* vec_vcfsx */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vcfsx(vector int __a, int __b) {
+  return __builtin_altivec_vcfsx(__a, __b);
+}
+
+/* vec_vcfux */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vcfux(vector unsigned int __a, int __b) {
+  return __builtin_altivec_vcfux((vector int)__a, __b);
+}
+
+/* vec_cts */
+
+static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_cts(vector double __a, int __b) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector signed long long);
+}
+#endif
+
+/* vec_vctsxs */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vctsxs(vector float __a, int __b) {
+  return __builtin_altivec_vctsxs(__a, __b);
+}
+
+/* vec_ctu */
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a,
+                                                           int __b) {
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+#ifdef __VSX__
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_ctu(vector double __a, int __b) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+#endif
+
+/* vec_vctuxs */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vctuxs(vector float __a, int __b) {
+  return __builtin_altivec_vctuxs(__a, __b);
+}
+
+/* vec_signed */
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sld(vector signed int, vector signed int, unsigned const int __c);
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signed(vector float __a) {
+  return __builtin_convertvector(__a, vector signed int);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_signed(vector double __a) {
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_signed2(vector double __a, vector double __b) {
+  return (vector signed int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvdpsxws(__a);
+#endif
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_signedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvdpsxws(__a);
+#else
+  vector signed int __ret = __builtin_vsx_xvcvdpsxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_unsigned */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sld(vector unsigned int, vector unsigned int, unsigned const int __c);
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsigned(vector float __a) {
+  return __builtin_convertvector(__a, vector unsigned int);
+}
+
+#ifdef __VSX__
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_unsigned(vector double __a) {
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_unsigned2(vector double __a, vector double __b) {
+  return (vector unsigned int) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignede(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvdpuxws(__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unsignedo(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvdpuxws(__a);
+#else
+  vector unsigned int __ret = __builtin_vsx_xvcvdpuxws(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_float */
+
+static __inline__ vector float __ATTRS_o_ai
+vec_sld(vector float, vector float, unsigned const int __c);
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector signed int __a) {
+  return __builtin_convertvector(__a, vector float);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float(vector unsigned int __a) {
+  return __builtin_convertvector(__a, vector float);
+}
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector signed long long __a, vector signed long long __b) {
+  return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_float2(vector double __a, vector double __b) {
+  return (vector float) { __a[0], __a[1], __b[0], __b[1] };
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvsxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvuxdsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floate(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#else
+  return __builtin_vsx_xvcvdpsp(__a);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector signed long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvsxdsp(__a);
+#else
+  vector float __ret = __builtin_vsx_xvcvsxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector unsigned long long __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvuxdsp(__a);
+#else
+  vector float __ret = __builtin_vsx_xvcvuxdsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_floato(vector double __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvdpsp(__a);
+#else
+  vector float __ret = __builtin_vsx_xvcvdpsp(__a);
+  return vec_sld(__ret, __ret, 12);
+#endif
+}
+#endif
+
+/* vec_double */
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai
+vec_double(vector signed long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_double(vector unsigned long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#else
+  return __builtin_vsx_xvcvsxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#else
+  return __builtin_vsx_xvcvuxwdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublee(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#else
+  return __builtin_vsx_xvcvspdp(__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector signed int __a) {
+  vector double __ret = {__a[0], __a[1]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector unsigned int __a) {
+  vector double __ret = {__a[0], __a[1]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleh(vector float __a) {
+  vector double __ret = {__a[0], __a[1]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector signed int __a) {
+  vector double __ret = {__a[2], __a[3]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector unsigned int __a) {
+  vector double __ret = {__a[2], __a[3]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doublel(vector float __a) {
+  vector double __ret = {__a[2], __a[3]};
+  return __ret;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector signed int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvsxwdp(__a);
+#else
+  return __builtin_vsx_xvcvsxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector unsigned int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvuxwdp(__a);
+#else
+  return __builtin_vsx_xvcvuxwdp(vec_sld(__a, __a, 4));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_doubleo(vector float __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_vsx_xvcvspdp(__a);
+#else
+  return __builtin_vsx_xvcvspdp(vec_sld(__a, __a, 4));
+#endif
+}
+#endif
+
+/* vec_div */
+
+/* Integer vector divides (vectors are scalarized, elements divided
+   and the vectors reassembled).
+*/
+static __inline__ vector signed char __ATTRS_o_ai
+vec_div(vector signed char __a, vector signed char __b) {
+  return __a / __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_div(vector unsigned char __a, vector unsigned char __b) {
+  return __a / __b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_div(vector signed short __a, vector signed short __b) {
+  return __a / __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_div(vector unsigned short __a, vector unsigned short __b) {
+  return __a / __b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_div(vector signed int __a, vector signed int __b) {
+  return __a / __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_div(vector unsigned int __a, vector unsigned int __b) {
+  return __a / __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_div(vector signed long long __a, vector signed long long __b) {
+  return __a / __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_div(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a / __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_div(vector float __a,
+                                                    vector float __b) {
+  return __a / __b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_div(vector double __a,
+                                                     vector double __b) {
+  return __a / __b;
+}
+#endif
+
+/* vec_dss */
+
+static __inline__ void __attribute__((__always_inline__)) vec_dss(int __a) {
+  __builtin_altivec_dss(__a);
+}
+
+/* vec_dssall */
+
+static __inline__ void __attribute__((__always_inline__)) vec_dssall(void) {
+  __builtin_altivec_dssall();
+}
+
+/* vec_dst */
+#define vec_dst(__PTR, __CW, __STR) \
+  __extension__(                    \
+      { __builtin_altivec_dst((const void *)(__PTR), (__CW), (__STR)); })
+
+/* vec_dstst */
+#define vec_dstst(__PTR, __CW, __STR) \
+  __extension__(                      \
+      { __builtin_altivec_dstst((const void *)(__PTR), (__CW), (__STR)); })
+
+/* vec_dststt */
+#define vec_dststt(__PTR, __CW, __STR) \
+  __extension__(                       \
+      { __builtin_altivec_dststt((const void *)(__PTR), (__CW), (__STR)); })
+
+/* vec_dstt */
+#define vec_dstt(__PTR, __CW, __STR) \
+  __extension__(                     \
+      { __builtin_altivec_dstt((const void *)(__PTR), (__CW), (__STR)); })
+
+/* vec_eqv */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_eqv(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                  (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_eqv(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                    (vector unsigned int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_eqv(vector bool char __a,
+                                                        vector bool char __b) {
+  return (vector bool char)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                (vector unsigned int)__b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_eqv(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                   (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_eqv(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                     (vector unsigned int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_eqv(vector bool short __a, vector bool short __b) {
+  return (vector bool short)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_eqv(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                 (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_eqv(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_vsx_xxleqv(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_eqv(vector bool int __a,
+                                                       vector bool int __b) {
+  return (vector bool int)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                               (vector unsigned int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_eqv(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_vsx_xxleqv(
+      (vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_eqv(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_vsx_xxleqv(
+      (vector unsigned int)__a, (vector unsigned int)__b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_eqv(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                                     (vector unsigned int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_eqv(vector float __a,
+                                                    vector float __b) {
+  return (vector float)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                            (vector unsigned int)__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_eqv(vector double __a,
+                                                     vector double __b) {
+  return (vector double)__builtin_vsx_xxleqv((vector unsigned int)__a,
+                                             (vector unsigned int)__b);
+}
+#endif
+
+/* vec_expte */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_expte(vector float __a) {
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_vexptefp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vexptefp(vector float __a) {
+  return __builtin_altivec_vexptefp(__a);
+}
+
+/* vec_floor */
+
+static __inline__ vector float __ATTRS_o_ai vec_floor(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspim(__a);
+#else
+  return __builtin_altivec_vrfim(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_floor(vector double __a) {
+  return __builtin_vsx_xvrdpim(__a);
+}
+#endif
+
+/* vec_vrfim */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrfim(vector float __a) {
+  return __builtin_altivec_vrfim(__a);
+}
+
+/* vec_ld */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ld(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ld(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ld(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_ld(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_ld(int __a,
+                                                   const vector short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_ld(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ld(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_ld(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_ld(int __a,
+                                                   const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_ld(int __a,
+                                                 const vector int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_ld(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ld(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_ld(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ld(int __a,
+                                                   const vector float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ld(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lvx */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvx(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvx(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvx(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvx(int __a,
+                                                    const vector short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvx(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvx(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvx(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvx(int __a,
+                                                    const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvx(int __a,
+                                                  const vector int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvx(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvx(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvx(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvx(int __a,
+                                                    const vector float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvx(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvx(__a, __b);
+}
+
+/* vec_lde */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lde(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lde(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lde(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lde(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lde(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lde(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lde(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_lvebx */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvebx(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvebx(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvebx(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvebx(__a, __b);
+}
+
+/* vec_lvehx */
+
+static __inline__ vector short __ATTRS_o_ai vec_lvehx(int __a,
+                                                      const short *__b) {
+  return (vector short)__builtin_altivec_lvehx(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvehx(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvehx(__a, __b);
+}
+
+/* vec_lvewx */
+
+static __inline__ vector int __ATTRS_o_ai vec_lvewx(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvewx(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvewx(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvewx(int __a,
+                                                      const float *__b) {
+  return (vector float)__builtin_altivec_lvewx(__a, __b);
+}
+
+/* vec_ldl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_ldl(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_ldl(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_ldl(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_ldl(int __a,
+                                                    const vector short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_ldl(int __a, const short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_ldl(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_ldl(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_ldl(int __a,
+                                                    const vector pixel *__b) {
+  return (vector pixel short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_ldl(int __a,
+                                                  const vector int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_ldl(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_ldl(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_ldl(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ldl(int __a,
+                                                    const vector float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_ldl(int __a, const float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_lvxl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvxl(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const vector short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const short *__b) {
+  return (vector short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const vector pixel *__b) {
+  return (vector pixel)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvxl(int __a,
+                                                   const vector int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvxl(int __a, const int *__b) {
+  return (vector int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvxl(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvxl(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const vector float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvxl(int __a,
+                                                     const float *__b) {
+  return (vector float)__builtin_altivec_lvxl(__a, __b);
+}
+
+/* vec_loge */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_loge(vector float __a) {
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_vlogefp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vlogefp(vector float __a) {
+  return __builtin_altivec_vlogefp(__a);
+}
+
+/* vec_lvsl */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const signed char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const signed char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                             const short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                             const int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const unsigned int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsl(int __a, const unsigned int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsl(int __a, const float *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsl(int __a,
+                                                             const float *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsl(__a, __b);
+}
+#endif
+
+/* vec_lvsr */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const signed char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const signed char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned char *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                             const short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned short *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned short *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                             const int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const unsigned int *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvsr(int __a, const unsigned int *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector unsigned char __ATTRS_o_ai
+    __attribute__((__deprecated__("use assignment for unaligned little endian \
+loads/stores"))) vec_lvsr(int __a, const float *__b) {
+  vector unsigned char mask =
+      (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+  vector unsigned char reverse = {15, 14, 13, 12, 11, 10, 9, 8,
+                                  7,  6,  5,  4,  3,  2,  1, 0};
+  return vec_perm(mask, mask, reverse);
+}
+#else
+static __inline__ vector unsigned char __ATTRS_o_ai vec_lvsr(int __a,
+                                                             const float *__b) {
+  return (vector unsigned char)__builtin_altivec_lvsr(__a, __b);
+}
+#endif
+
+/* vec_madd */
+static __inline__ vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector signed short, vector signed short);
+static __inline__ vector signed short __ATTRS_o_ai
+vec_mladd(vector signed short, vector unsigned short, vector unsigned short);
+static __inline__ vector signed short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector signed short, vector signed short);
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short, vector unsigned short, vector unsigned short);
+
+static __inline__ vector signed short __ATTRS_o_ai vec_madd(
+    vector signed short __a, vector signed short __b, vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_madd(vector signed short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector signed short __b,
+         vector signed short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_madd(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return vec_mladd(__a, __b, __c);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_madd(vector float __a,
+                                                     vector float __b,
+                                                     vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaddasp(__a, __b, __c);
+#else
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_madd(vector double __a,
+                                                      vector double __b,
+                                                      vector double __c) {
+  return __builtin_vsx_xvmaddadp(__a, __b, __c);
+}
+#endif
+
+/* vec_vmaddfp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vmaddfp(vector float __a, vector float __b, vector float __c) {
+  return __builtin_altivec_vmaddfp(__a, __b, __c);
+}
+
+/* vec_madds */
+
+static __inline__ vector signed short __attribute__((__always_inline__))
+vec_madds(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_vmhaddshs */
+static __inline__ vector signed short __attribute__((__always_inline__))
+vec_vmhaddshs(vector signed short __a, vector signed short __b,
+              vector signed short __c) {
+  return __builtin_altivec_vmhaddshs(__a, __b, __c);
+}
+
+/* vec_msub */
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai vec_msub(vector float __a,
+                                                     vector float __b,
+                                                     vector float __c) {
+  return __builtin_vsx_xvmsubasp(__a, __b, __c);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_msub(vector double __a,
+                                                      vector double __b,
+                                                      vector double __c) {
+  return __builtin_vsx_xvmsubadp(__a, __b, __c);
+}
+#endif
+
+/* vec_max */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_max(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_max(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_max(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_max(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_max(vector short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_max(vector bool short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_max(vector short __a,
+                                                    vector bool short __b) {
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_max(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_max(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_max(vector int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_max(vector bool int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_max(vector int __a,
+                                                  vector bool int __b) {
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_max(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_max(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_max(vector signed long long __a, vector signed long long __b) {
+  return __builtin_altivec_vmaxsd(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_max(vector bool long long __a, vector signed long long __b) {
+  return __builtin_altivec_vmaxsd((vector signed long long)__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_max(vector signed long long __a, vector bool long long __b) {
+  return __builtin_altivec_vmaxsd(__a, (vector signed long long)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vmaxud(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_max(vector bool long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vmaxud((vector unsigned long long)__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_max(vector unsigned long long __a, vector bool long long __b) {
+  return __builtin_altivec_vmaxud(__a, (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_max(vector float __a,
+                                                    vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_max(vector double __a,
+                                                     vector double __b) {
+  return __builtin_vsx_xvmaxdp(__a, __b);
+}
+#endif
+
+/* vec_vmaxsb */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vmaxsb(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vmaxsb((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmaxsb(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vmaxsb(__a, (vector signed char)__b);
+}
+
+/* vec_vmaxub */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vmaxub(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vmaxub((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmaxub(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vmaxub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vmaxsh */
+
+static __inline__ vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                                       vector short __b) {
+  return __builtin_altivec_vmaxsh(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vmaxsh(vector bool short __a,
+                                                       vector short __b) {
+  return __builtin_altivec_vmaxsh((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vmaxsh(vector short __a,
+                                                       vector bool short __b) {
+  return __builtin_altivec_vmaxsh(__a, (vector short)__b);
+}
+
+/* vec_vmaxuh */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vmaxuh((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmaxuh(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vmaxuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vmaxsw */
+
+static __inline__ vector int __ATTRS_o_ai vec_vmaxsw(vector int __a,
+                                                     vector int __b) {
+  return __builtin_altivec_vmaxsw(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vmaxsw(vector bool int __a,
+                                                     vector int __b) {
+  return __builtin_altivec_vmaxsw((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vmaxsw(vector int __a,
+                                                     vector bool int __b) {
+  return __builtin_altivec_vmaxsw(__a, (vector int)__b);
+}
+
+/* vec_vmaxuw */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vmaxuw((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmaxuw(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vmaxuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vmaxfp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vmaxfp(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvmaxsp(__a, __b);
+#else
+  return __builtin_altivec_vmaxfp(__a, __b);
+#endif
+}
+
+/* vec_mergeh */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_mergeh(vector signed char __a, vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_mergeh(vector unsigned char __a, vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_mergeh(vector bool char __a, vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_mergeh(vector short __a,
+                                                       vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_mergeh(vector bool short __a, vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_mergeh(vector pixel __a,
+                                                       vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_mergeh(vector int __a,
+                                                     vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergeh(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_mergeh(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_mergeh(vector float __a,
+                                                       vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeh(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_mergeh(vector double __a,
+                                                        vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeh(vector double __a, vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeh(vector bool long long __a, vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                         0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+#endif
+
+/* vec_vmrghb */
+
+#define __builtin_altivec_vmrghb vec_vmrghb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmrghb(vector signed char __a, vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmrghb(vector unsigned char __a, vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vmrghb(vector bool char __a, vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x10, 0x01, 0x11, 0x02, 0x12,
+                                         0x03, 0x13, 0x04, 0x14, 0x05, 0x15,
+                                         0x06, 0x16, 0x07, 0x17));
+}
+
+/* vec_vmrghh */
+
+#define __builtin_altivec_vmrghh vec_vmrghh
+
+static __inline__ vector short __ATTRS_o_ai vec_vmrghh(vector short __a,
+                                                       vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmrghh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vmrghh(vector bool short __a, vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vmrghh(vector pixel __a,
+                                                       vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x10, 0x11, 0x02, 0x03,
+                                         0x12, 0x13, 0x04, 0x05, 0x14, 0x15,
+                                         0x06, 0x07, 0x16, 0x17));
+}
+
+/* vec_vmrghw */
+
+#define __builtin_altivec_vmrghw vec_vmrghw
+
+static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a,
+                                                     vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmrghw(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vmrghw(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vmrghw(vector float __a,
+                                                       vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
+                                         0x14, 0x15, 0x16, 0x17));
+}
+
+/* vec_mergel */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_mergel(vector signed char __a, vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_mergel(vector unsigned char __a, vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_mergel(vector bool char __a, vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_mergel(vector short __a,
+                                                       vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mergel(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_mergel(vector bool short __a, vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_mergel(vector pixel __a,
+                                                       vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_mergel(vector int __a,
+                                                     vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergel(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_mergel(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_mergel(vector float __a,
+                                                       vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergel(vector signed long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector signed long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector signed long long __b) {
+  return vec_perm((vector signed long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector unsigned long long __a, vector bool long long __b) {
+  return vec_perm(__a, (vector unsigned long long)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector unsigned long long __b) {
+  return vec_perm((vector unsigned long long)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector double __ATTRS_o_ai vec_mergel(vector double __a,
+                                                        vector double __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector double __ATTRS_o_ai
+vec_mergel(vector double __a, vector bool long long __b) {
+  return vec_perm(__a, (vector double)__b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+static __inline__ vector double __ATTRS_o_ai
+vec_mergel(vector bool long long __a, vector double __b) {
+  return vec_perm((vector double)__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                         0x0E, 0x0F, 0x18, 0X19, 0x1A, 0x1B,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+#endif
+
+/* vec_vmrglb */
+
+#define __builtin_altivec_vmrglb vec_vmrglb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vmrglb(vector signed char __a, vector signed char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vmrglb(vector unsigned char __a, vector unsigned char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vmrglb(vector bool char __a, vector bool char __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x18, 0x09, 0x19, 0x0A, 0x1A,
+                                         0x0B, 0x1B, 0x0C, 0x1C, 0x0D, 0x1D,
+                                         0x0E, 0x1E, 0x0F, 0x1F));
+}
+
+/* vec_vmrglh */
+
+#define __builtin_altivec_vmrglh vec_vmrglh
+
+static __inline__ vector short __ATTRS_o_ai vec_vmrglh(vector short __a,
+                                                       vector short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmrglh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vmrglh(vector bool short __a, vector bool short __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vmrglh(vector pixel __a,
+                                                       vector pixel __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x1C, 0x1D,
+                                         0x0E, 0x0F, 0x1E, 0x1F));
+}
+
+/* vec_vmrglw */
+
+#define __builtin_altivec_vmrglw vec_vmrglw
+
+static __inline__ vector int __ATTRS_o_ai vec_vmrglw(vector int __a,
+                                                     vector int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vmrglw(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vmrglw(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vmrglw(vector float __a,
+                                                       vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19,
+                                         0x1A, 0x1B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+#ifdef __POWER8_VECTOR__
+/* vec_mergee */
+
+static __inline__ vector bool int __ATTRS_o_ai vec_mergee(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_mergee(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergee(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergee(vector bool long long __a, vector bool long long __b) {
+  return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergee(vector signed long long __a, vector signed long long __b) {
+  return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergee(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_mergeh(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergee(vector float __a, vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x10, 0x11,
+                                         0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B,
+                                         0x18, 0x19, 0x1A, 0x1B));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergee(vector double __a, vector double __b) {
+  return vec_mergeh(__a, __b);
+}
+
+/* vec_mergeo */
+
+static __inline__ vector bool int __ATTRS_o_ai vec_mergeo(vector bool int __a,
+                                                          vector bool int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_mergeo(vector signed int __a, vector signed int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mergeo(vector unsigned int __a, vector unsigned int __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_mergeo(vector bool long long __a, vector bool long long __b) {
+  return vec_mergel(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mergeo(vector signed long long __a, vector signed long long __b) {
+  return vec_mergel(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mergeo(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_mergel(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_mergeo(vector float __a, vector float __b) {
+  return vec_perm(__a, __b,
+                  (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x14, 0x15,
+                                         0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F,
+                                         0x1C, 0x1D, 0x1E, 0x1F));
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_mergeo(vector double __a, vector double __b) {
+  return vec_mergel(__a, __b);
+}
+
+#endif
+
+/* vec_mfvscr */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_mfvscr(void) {
+  return __builtin_altivec_mfvscr();
+}
+
+/* vec_min */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_min(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_min(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_min(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_min(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_min(vector short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_min(vector bool short __a,
+                                                    vector short __b) {
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_min(vector short __a,
+                                                    vector bool short __b) {
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_min(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_min(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_min(vector int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_min(vector bool int __a,
+                                                  vector int __b) {
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_min(vector int __a,
+                                                  vector bool int __b) {
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_min(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_min(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_min(vector signed long long __a, vector signed long long __b) {
+  return __builtin_altivec_vminsd(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_min(vector bool long long __a, vector signed long long __b) {
+  return __builtin_altivec_vminsd((vector signed long long)__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_min(vector signed long long __a, vector bool long long __b) {
+  return __builtin_altivec_vminsd(__a, (vector signed long long)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vminud(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_min(vector bool long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vminud((vector unsigned long long)__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_min(vector unsigned long long __a, vector bool long long __b) {
+  return __builtin_altivec_vminud(__a, (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_min(vector float __a,
+                                                    vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_min(vector double __a,
+                                                     vector double __b) {
+  return __builtin_vsx_xvmindp(__a, __b);
+}
+#endif
+
+/* vec_vminsb */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vminsb(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vminsb(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vminsb((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vminsb(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vminsb(__a, (vector signed char)__b);
+}
+
+/* vec_vminub */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vminub(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vminub(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vminub((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vminub(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vminub(__a, (vector unsigned char)__b);
+}
+
+/* vec_vminsh */
+
+static __inline__ vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                                       vector short __b) {
+  return __builtin_altivec_vminsh(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vminsh(vector bool short __a,
+                                                       vector short __b) {
+  return __builtin_altivec_vminsh((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vminsh(vector short __a,
+                                                       vector bool short __b) {
+  return __builtin_altivec_vminsh(__a, (vector short)__b);
+}
+
+/* vec_vminuh */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vminuh((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vminuh(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vminuh(__a, (vector unsigned short)__b);
+}
+
+/* vec_vminsw */
+
+static __inline__ vector int __ATTRS_o_ai vec_vminsw(vector int __a,
+                                                     vector int __b) {
+  return __builtin_altivec_vminsw(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vminsw(vector bool int __a,
+                                                     vector int __b) {
+  return __builtin_altivec_vminsw((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vminsw(vector int __a,
+                                                     vector bool int __b) {
+  return __builtin_altivec_vminsw(__a, (vector int)__b);
+}
+
+/* vec_vminuw */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vminuw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vminuw((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vminuw(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vminuw(__a, (vector unsigned int)__b);
+}
+
+/* vec_vminfp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vminfp(vector float __a, vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvminsp(__a, __b);
+#else
+  return __builtin_altivec_vminfp(__a, __b);
+#endif
+}
+
+/* vec_mladd */
+
+#define __builtin_altivec_vmladduhm vec_mladd
+
+static __inline__ vector short __ATTRS_o_ai vec_mladd(vector short __a,
+                                                      vector short __b,
+                                                      vector short __c) {
+  return __a * __b + __c;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_mladd(
+    vector short __a, vector unsigned short __b, vector unsigned short __c) {
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_mladd(vector unsigned short __a,
+                                                      vector short __b,
+                                                      vector short __c) {
+  return (vector short)__a * __b + __c;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mladd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+/* vec_vmladduhm */
+
+static __inline__ vector short __ATTRS_o_ai vec_vmladduhm(vector short __a,
+                                                          vector short __b,
+                                                          vector short __c) {
+  return __a * __b + __c;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vmladduhm(
+    vector short __a, vector unsigned short __b, vector unsigned short __c) {
+  return __a * (vector short)__b + (vector short)__c;
+}
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector short __b, vector short __c) {
+  return (vector short)__a * __b + __c;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vmladduhm(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+/* vec_mradds */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_mradds(vector short __a, vector short __b, vector short __c) {
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_vmhraddshs */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_vmhraddshs(vector short __a, vector short __b, vector short __c) {
+  return __builtin_altivec_vmhraddshs(__a, __b, __c);
+}
+
+/* vec_msum */
+
+static __inline__ vector int __ATTRS_o_ai vec_msum(vector signed char __a,
+                                                   vector unsigned char __b,
+                                                   vector int __c) {
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned int __c) {
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_msum(vector short __a,
+                                                   vector short __b,
+                                                   vector int __c) {
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_msum(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_vmsummbm */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmsummbm(vector signed char __a, vector unsigned char __b, vector int __c) {
+  return __builtin_altivec_vmsummbm(__a, __b, __c);
+}
+
+/* vec_vmsumubm */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmsumubm(vector unsigned char __a, vector unsigned char __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumubm(__a, __b, __c);
+}
+
+/* vec_vmsumshm */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmsumshm(vector short __a, vector short __b, vector int __c) {
+  return __builtin_altivec_vmsumshm(__a, __b, __c);
+}
+
+/* vec_vmsumuhm */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhm(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhm(__a, __b, __c);
+}
+
+/* vec_msums */
+
+static __inline__ vector int __ATTRS_o_ai vec_msums(vector short __a,
+                                                    vector short __b,
+                                                    vector int __c) {
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_msums(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_vmsumshs */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmsumshs(vector short __a, vector short __b, vector int __c) {
+  return __builtin_altivec_vmsumshs(__a, __b, __c);
+}
+
+/* vec_vmsumuhs */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmsumuhs(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned int __c) {
+  return __builtin_altivec_vmsumuhs(__a, __b, __c);
+}
+
+/* vec_mtvscr */
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector signed char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector unsigned char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector bool char __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector unsigned short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector bool short __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector pixel __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector unsigned int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector bool int __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+static __inline__ void __ATTRS_o_ai vec_mtvscr(vector float __a) {
+  __builtin_altivec_mtvscr((vector int)__a);
+}
+
+/* vec_mul */
+
+/* Integer vector multiplication will involve multiplication of the odd/even
+   elements separately, then truncating the results and moving to the
+   result vector.
+*/
+static __inline__ vector signed char __ATTRS_o_ai
+vec_mul(vector signed char __a, vector signed char __b) {
+  return __a * __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_mul(vector unsigned char __a, vector unsigned char __b) {
+  return __a * __b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_mul(vector signed short __a, vector signed short __b) {
+  return __a * __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mul(vector unsigned short __a, vector unsigned short __b) {
+  return __a * __b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_mul(vector signed int __a, vector signed int __b) {
+  return __a * __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mul(vector unsigned int __a, vector unsigned int __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mul(vector signed long long __a, vector signed long long __b) {
+  return __a * __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mul(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a * __b;
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_mul(vector float __a,
+                                                    vector float __b) {
+  return __a * __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_mul(vector double __a,
+                                                     vector double __b) {
+  return __a * __b;
+}
+#endif
+
+/* The vmulos* and vmules* instructions have a big endian bias, so
+   we must reverse the meaning of "even" and "odd" for little endian.  */
+
+/* vec_mule */
+
+static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a,
+                                                     vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mule(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_mule(vector short __a,
+                                                   vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mule(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mule(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosw(__a, __b);
+#else
+  return __builtin_altivec_vmulesw(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mule(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouw(__a, __b);
+#else
+  return __builtin_altivec_vmuleuw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulesb */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_vmulesb(vector signed char __a, vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosb(__a, __b);
+#else
+  return __builtin_altivec_vmulesb(__a, __b);
+#endif
+}
+
+/* vec_vmuleub */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_vmuleub(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloub(__a, __b);
+#else
+  return __builtin_altivec_vmuleub(__a, __b);
+#endif
+}
+
+/* vec_vmulesh */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmulesh(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosh(__a, __b);
+#else
+  return __builtin_altivec_vmulesh(__a, __b);
+#endif
+}
+
+/* vec_vmuleuh */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmuleuh(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulouh(__a, __b);
+#else
+  return __builtin_altivec_vmuleuh(__a, __b);
+#endif
+}
+
+/* vec_mulo */
+
+static __inline__ vector short __ATTRS_o_ai vec_mulo(vector signed char __a,
+                                                     vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_mulo(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_mulo(vector short __a,
+                                                   vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_mulo(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_mulo(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesw(__a, __b);
+#else
+  return __builtin_altivec_vmulosw(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_mulo(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuw(__a, __b);
+#else
+  return __builtin_altivec_vmulouw(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vmulosb */
+
+static __inline__ vector short __attribute__((__always_inline__))
+vec_vmulosb(vector signed char __a, vector signed char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesb(__a, __b);
+#else
+  return __builtin_altivec_vmulosb(__a, __b);
+#endif
+}
+
+/* vec_vmuloub */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_vmuloub(vector unsigned char __a, vector unsigned char __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleub(__a, __b);
+#else
+  return __builtin_altivec_vmuloub(__a, __b);
+#endif
+}
+
+/* vec_vmulosh */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vmulosh(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesh(__a, __b);
+#else
+  return __builtin_altivec_vmulosh(__a, __b);
+#endif
+}
+
+/* vec_vmulouh */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vmulouh(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleuh(__a, __b);
+#else
+  return __builtin_altivec_vmulouh(__a, __b);
+#endif
+}
+
+/*  vec_nand */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nand(vector signed char __a, vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nand(vector signed char __a, vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nand(vector bool char __a, vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nand(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nand(vector unsigned char __a, vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nand(vector bool char __a, vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_nand(vector bool char __a,
+                                                         vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nand(vector signed short __a, vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nand(vector signed short __a, vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_nand(vector bool short __a, vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_nand(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_nand(vector unsigned short __a, vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_nand(vector bool short __a, vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_nand(vector signed int __a, vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai vec_nand(vector signed int __a,
+                                                          vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_nand(vector bool int __a, vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nand(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nand(vector unsigned int __a, vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nand(vector bool int __a, vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_nand(vector bool int __a,
+                                                        vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_nand(vector float __a, vector float __b) {
+  return (vector float)(~((vector unsigned int)__a &
+                          (vector unsigned int)__b));
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_nand(vector signed long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_nand(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_nand(vector bool long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_nand(vector double __a, vector double __b) {
+  return (vector double)(~((vector unsigned long long)__a &
+                           (vector unsigned long long)__b));
+}
+
+#endif
+
+/* vec_nmadd */
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai vec_nmadd(vector float __a,
+                                                      vector float __b,
+                                                      vector float __c) {
+  return __builtin_vsx_xvnmaddasp(__a, __b, __c);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_nmadd(vector double __a,
+                                                       vector double __b,
+                                                       vector double __c) {
+  return __builtin_vsx_xvnmaddadp(__a, __b, __c);
+}
+#endif
+
+/* vec_nmsub */
+
+static __inline__ vector float __ATTRS_o_ai vec_nmsub(vector float __a,
+                                                      vector float __b,
+                                                      vector float __c) {
+#ifdef __VSX__
+  return __builtin_vsx_xvnmsubasp(__a, __b, __c);
+#else
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_nmsub(vector double __a,
+                                                       vector double __b,
+                                                       vector double __c) {
+  return __builtin_vsx_xvnmsubadp(__a, __b, __c);
+}
+#endif
+
+/* vec_vnmsubfp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vnmsubfp(vector float __a, vector float __b, vector float __c) {
+  return __builtin_altivec_vnmsubfp(__a, __b, __c);
+}
+
+/* vec_nor */
+
+#define __builtin_altivec_vnor vec_nor
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_nor(vector signed char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_nor(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_nor(vector bool char __a,
+                                                        vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_nor(vector short __a,
+                                                    vector short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_nor(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_nor(vector bool short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_nor(vector int __a,
+                                                  vector int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_nor(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_nor(vector bool int __a,
+                                                       vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_nor(vector float __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_nor(vector double __a,
+                                                     vector double __b) {
+  vector unsigned long long __res =
+      ~((vector unsigned long long)__a | (vector unsigned long long)__b);
+  return (vector double)__res;
+}
+#endif
+
+/* vec_vnor */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vnor(vector signed char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vnor(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vnor(vector bool char __a,
+                                                         vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vnor(vector short __a,
+                                                     vector short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vnor(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vnor(vector bool short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vnor(vector int __a,
+                                                   vector int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vnor(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vnor(vector bool int __a,
+                                                        vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vnor(vector float __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      ~((vector unsigned int)__a | (vector unsigned int)__b);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_nor(vector signed long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_nor(vector bool long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+#endif
+
+/* vec_or */
+
+#define __builtin_altivec_vor vec_or
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_or(vector signed char __a, vector signed char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_or(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a | __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a,
+                                                         vector bool char __b) {
+  return __a | (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector unsigned char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_or(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a | __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_or(vector unsigned char __a, vector bool char __b) {
+  return __a | (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_or(vector bool char __a,
+                                                       vector bool char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_or(vector short __a,
+                                                   vector short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_or(vector bool short __a,
+                                                   vector short __b) {
+  return (vector short)__a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_or(vector short __a,
+                                                   vector bool short __b) {
+  return __a | (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector unsigned short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_or(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a | __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_or(vector unsigned short __a, vector bool short __b) {
+  return __a | (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_or(vector bool short __a,
+                                                        vector bool short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_or(vector int __a,
+                                                 vector int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_or(vector bool int __a,
+                                                 vector int __b) {
+  return (vector int)__a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_or(vector int __a,
+                                                 vector bool int __b) {
+  return __a | (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector unsigned int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_or(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a | __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_or(vector unsigned int __a, vector bool int __b) {
+  return __a | (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_or(vector bool int __a,
+                                                      vector bool int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_or(vector float __a,
+                                                   vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_or(vector bool int __a,
+                                                   vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_or(vector float __a,
+                                                   vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_or(vector bool long long __a,
+                                                    vector double __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a,
+                                                    vector bool long long __b) {
+  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a,
+                                                    vector double __b) {
+  vector unsigned long long __res =
+      (vector unsigned long long)__a | (vector unsigned long long)__b;
+  return (vector double)__res;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_or(vector signed long long __a, vector signed long long __b) {
+  return __a | __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a | __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_or(vector signed long long __a, vector bool long long __b) {
+  return __a | (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_or(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a | __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_or(vector unsigned long long __a, vector bool long long __b) {
+  return __a | (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_or(vector bool long long __a, vector bool long long __b) {
+  return __a | __b;
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_orc(vector signed char __a, vector signed char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_orc(vector signed char __a, vector bool char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_orc(vector bool char __a, vector signed char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_orc(vector unsigned char __a, vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_orc(vector unsigned char __a, vector bool char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_orc(vector bool char __a, vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_orc(vector bool char __a,
+                                                        vector bool char __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_orc(vector signed short __a, vector signed short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_orc(vector signed short __a, vector bool short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector signed short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_orc(vector unsigned short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_orc(vector unsigned short __a, vector bool short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_orc(vector bool short __a, vector bool short __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_orc(vector signed int __a, vector signed int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai vec_orc(vector signed int __a,
+                                                         vector bool int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_orc(vector bool int __a, vector signed int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_orc(vector unsigned int __a, vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_orc(vector unsigned int __a, vector bool int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_orc(vector bool int __a, vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_orc(vector bool int __a,
+                                                       vector bool int __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector bool int __a, vector float __b) {
+ return (vector float)(__a | ~(vector unsigned int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_orc(vector float __a, vector bool int __b) {
+  return (vector float)((vector unsigned int)__a | ~__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_orc(vector signed long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_orc(vector signed long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_orc(vector unsigned long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a | ~__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_orc(vector bool long long __a, vector double __b) {
+  return (vector double)(__a | ~(vector unsigned long long)__b);
+}
+#endif
+
+/* vec_vor */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector signed char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a | __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vor(vector signed char __a, vector bool char __b) {
+  return __a | (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector unsigned char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vor(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a | __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vor(vector unsigned char __a, vector bool char __b) {
+  return __a | (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vor(vector bool char __a,
+                                                        vector bool char __b) {
+  return __a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vor(vector short __a,
+                                                    vector short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vor(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a | __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vor(vector short __a,
+                                                    vector bool short __b) {
+  return __a | (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector unsigned short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a | __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vor(vector unsigned short __a, vector bool short __b) {
+  return __a | (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vor(vector bool short __a, vector bool short __b) {
+  return __a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vor(vector int __a,
+                                                  vector int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a | __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vor(vector int __a,
+                                                  vector bool int __b) {
+  return __a | (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector unsigned int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vor(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a | __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vor(vector unsigned int __a, vector bool int __b) {
+  return __a | (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                       vector bool int __b) {
+  return __a | __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vor(vector float __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vor(vector bool int __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vor(vector float __a,
+                                                    vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a | (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vor(vector signed long long __a, vector signed long long __b) {
+  return __a | __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a | __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vor(vector signed long long __a, vector bool long long __b) {
+  return __a | (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a | __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vor(vector unsigned long long __a, vector bool long long __b) {
+  return __a | (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vor(vector bool long long __a, vector bool long long __b) {
+  return __a | __b;
+}
+#endif
+
+/* vec_pack */
+
+/* The various vector pack instructions have a big-endian bias, so for
+   little endian we must handle reversed element numbering.  */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_pack(vector signed short __a, vector signed short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_pack(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_pack(vector bool short __a, vector bool short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_pack(vector int __a,
+                                                     vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_pack(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_pack(vector bool int __a,
+                                                          vector bool int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector signed int __ATTRS_o_ai
+vec_pack(vector signed long long __a, vector signed long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector signed int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_pack(vector bool long long __a, vector bool long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector bool int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_pack(vector double __a, vector double __b) {
+  return (vector float) (__a[0], __a[1], __b[0], __b[1]);
+}
+#endif
+
+#ifdef __POWER9_VECTOR__
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_pack_to_short_fp32(vector float __a, vector float __b) {
+  vector float __resa = __builtin_vsx_xvcvsphp(__a);
+  vector float __resb = __builtin_vsx_xvcvsphp(__b);
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_mergee(__resa, __resb);
+#else
+  return (vector unsigned short)vec_mergeo(__resa, __resb);
+#endif
+}
+
+#endif
+/* vec_vpkuhum */
+
+#define __builtin_altivec_vpkuhum vec_vpkuhum
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vpkuhum(vector signed short __a, vector signed short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector signed char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vpkuhum(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector unsigned char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vpkuhum(vector bool short __a, vector bool short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
+                             0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E));
+#else
+  return (vector bool char)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F,
+                             0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F));
+#endif
+}
+
+/* vec_vpkuwum */
+
+#define __builtin_altivec_vpkuwum vec_vpkuwum
+
+static __inline__ vector short __ATTRS_o_ai vec_vpkuwum(vector int __a,
+                                                        vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vpkuwum(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector unsigned short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vpkuwum(vector bool int __a, vector bool int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0C, 0x0D,
+                             0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1C, 0x1D));
+#else
+  return (vector bool short)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x02, 0x03, 0x06, 0x07, 0x0A, 0x0B, 0x0E, 0x0F,
+                             0x12, 0x13, 0x16, 0x17, 0x1A, 0x1B, 0x1E, 0x1F));
+#endif
+}
+
+/* vec_vpkudum */
+
+#ifdef __POWER8_VECTOR__
+#define __builtin_altivec_vpkudum vec_vpkudum
+
+static __inline__ vector int __ATTRS_o_ai vec_vpkudum(vector long long __a,
+                                                      vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vpkudum(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector unsigned int)vec_perm(
+      __a, __b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vpkudum(vector bool long long __a, vector bool long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)vec_perm(
+      (vector long long)__a, (vector long long)__b,
+      (vector unsigned char)(0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+                             0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B));
+#else
+  return (vector bool int)vec_perm(
+      (vector long long)__a, (vector long long)__b,
+      (vector unsigned char)(0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+                             0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F));
+#endif
+}
+#endif
+
+/* vec_packpx */
+
+static __inline__ vector pixel __attribute__((__always_inline__))
+vec_packpx(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_vpkpx */
+
+static __inline__ vector pixel __attribute__((__always_inline__))
+vec_vpkpx(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector pixel)__builtin_altivec_vpkpx(__b, __a);
+#else
+  return (vector pixel)__builtin_altivec_vpkpx(__a, __b);
+#endif
+}
+
+/* vec_packs */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a,
+                                                            vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_packs(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_packs(vector int __a,
+                                                             vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_packs(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector int __ATTRS_o_ai vec_packs(vector long long __a,
+                                                    vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdss(__b, __a);
+#else
+  return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkshss */
+
+static __inline__ vector signed char __attribute__((__always_inline__))
+vec_vpkshss(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshss(__b, __a);
+#else
+  return __builtin_altivec_vpkshss(__a, __b);
+#endif
+}
+
+/* vec_vpksdss */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector int __ATTRS_o_ai vec_vpksdss(vector long long __a,
+                                                      vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdss(__b, __a);
+#else
+  return __builtin_altivec_vpksdss(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkuhus */
+
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+vec_vpkuhus(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkudus */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vpkudus(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkswss */
+
+static __inline__ vector signed short __attribute__((__always_inline__))
+vec_vpkswss(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswss(__b, __a);
+#else
+  return __builtin_altivec_vpkswss(__a, __b);
+#endif
+}
+
+/* vec_vpkuwus */
+
+static __inline__ vector unsigned short __attribute__((__always_inline__))
+vec_vpkuwus(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_packsu */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_packsu(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_packsu(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_packsu(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_packsu(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_packsu(vector long long __a, vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdus(__b, __a);
+#else
+  return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkudus(__b, __a);
+#else
+  return __builtin_altivec_vpkudus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_vpkshus */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector short __a, vector short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkshus(__b, __a);
+#else
+  return __builtin_altivec_vpkshus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vpkshus(vector unsigned short __a, vector unsigned short __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuhus(__b, __a);
+#else
+  return __builtin_altivec_vpkuhus(__a, __b);
+#endif
+}
+
+/* vec_vpkswus */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkswus(__b, __a);
+#else
+  return __builtin_altivec_vpkswus(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vpkswus(vector unsigned int __a, vector unsigned int __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpkuwus(__b, __a);
+#else
+  return __builtin_altivec_vpkuwus(__a, __b);
+#endif
+}
+
+/* vec_vpksdus */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vpksdus(vector long long __a, vector long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vpksdus(__b, __a);
+#else
+  return __builtin_altivec_vpksdus(__a, __b);
+#endif
+}
+#endif
+
+/* vec_perm */
+
+// The vperm instruction is defined architecturally with a big-endian bias.
+// For little endian, we swap the input operands and invert the permute
+// control vector.  Only the rightmost 5 bits matter, so we could use
+// a vector of all 31s instead of all 255s to perform the inversion.
+// However, when the PCV is not a constant, using 255 has an advantage
+// in that the vec_xor can be recognized as a vec_nor (and for P8 and
+// later, possibly a vec_nand).
+
+static __inline__ vector signed char __ATTRS_o_ai vec_perm(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed char)__builtin_altivec_vperm_4si((vector int)__b,
+                                                         (vector int)__a, __d);
+#else
+  return (vector signed char)__builtin_altivec_vperm_4si((vector int)__a,
+                                                         (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned char)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_perm(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool char)__builtin_altivec_vperm_4si((vector int)__b,
+                                                       (vector int)__a, __d);
+#else
+  return (vector bool char)__builtin_altivec_vperm_4si((vector int)__a,
+                                                       (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_perm(vector signed short __a,
+                                                     vector signed short __b,
+                                                     vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__b,
+                                                          (vector int)__a, __d);
+#else
+  return (vector signed short)__builtin_altivec_vperm_4si((vector int)__a,
+                                                          (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned short)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned short)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_perm(
+    vector bool short __a, vector bool short __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool short)__builtin_altivec_vperm_4si((vector int)__b,
+                                                        (vector int)__a, __d);
+#else
+  return (vector bool short)__builtin_altivec_vperm_4si((vector int)__a,
+                                                        (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_perm(vector pixel __a,
+                                                     vector pixel __b,
+                                                     vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector pixel)__builtin_altivec_vperm_4si((vector int)__b,
+                                                   (vector int)__a, __d);
+#else
+  return (vector pixel)__builtin_altivec_vperm_4si((vector int)__a,
+                                                   (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_perm(vector signed int __a,
+                                                   vector signed int __b,
+                                                   vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed int)__builtin_altivec_vperm_4si(__b, __a, __d);
+#else
+  return (vector signed int)__builtin_altivec_vperm_4si(__a, __b, __c);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_perm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned int)__builtin_altivec_vperm_4si((vector int)__b,
+                                                          (vector int)__a, __d);
+#else
+  return (vector unsigned int)__builtin_altivec_vperm_4si((vector int)__a,
+                                                          (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_perm(vector bool int __a, vector bool int __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool int)__builtin_altivec_vperm_4si((vector int)__b,
+                                                      (vector int)__a, __d);
+#else
+  return (vector bool int)__builtin_altivec_vperm_4si((vector int)__a,
+                                                      (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_perm(vector float __a,
+                                                     vector float __b,
+                                                     vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector float)__builtin_altivec_vperm_4si((vector int)__b,
+                                                   (vector int)__a, __d);
+#else
+  return (vector float)__builtin_altivec_vperm_4si((vector int)__a,
+                                                   (vector int)__b, __c);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector long long __ATTRS_o_ai
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector signed long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector unsigned long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector unsigned long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__b, (vector int)__a, __d);
+#else
+  return (vector bool long long)__builtin_altivec_vperm_4si(
+      (vector int)__a, (vector int)__b, __c);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_perm(vector double __a, vector double __b, vector unsigned char __c) {
+#ifdef __LITTLE_ENDIAN__
+  vector unsigned char __d = {255, 255, 255, 255, 255, 255, 255, 255,
+                              255, 255, 255, 255, 255, 255, 255, 255};
+  __d = vec_xor(__c, __d);
+  return (vector double)__builtin_altivec_vperm_4si((vector int)__b,
+                                                    (vector int)__a, __d);
+#else
+  return (vector double)__builtin_altivec_vperm_4si((vector int)__a,
+                                                    (vector int)__b, __c);
+#endif
+}
+#endif
+
+/* vec_vperm */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_vperm(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vperm(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vperm(
+    vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vperm(vector short __a, vector short __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vperm(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_vperm(
+    vector bool short __a, vector bool short __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai
+vec_vperm(vector pixel __a, vector pixel __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vperm(vector int __a,
+                                                    vector int __b,
+                                                    vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vperm(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vperm(vector bool int __a, vector bool int __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_vperm(vector float __a, vector float __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+#ifdef __VSX__
+static __inline__ vector long long __ATTRS_o_ai vec_vperm(
+    vector long long __a, vector long long __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vperm(vector unsigned long long __a, vector unsigned long long __b,
+          vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_vperm(vector double __a, vector double __b, vector unsigned char __c) {
+  return vec_perm(__a, __b, __c);
+}
+#endif
+
+/* vec_re */
+
+static __inline__ vector float __ATTRS_o_ai vec_re(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvresp(__a);
+#else
+  return __builtin_altivec_vrefp(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_re(vector double __a) {
+  return __builtin_vsx_xvredp(__a);
+}
+#endif
+
+/* vec_vrefp */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrefp(vector float __a) {
+  return __builtin_altivec_vrefp(__a);
+}
+
+/* vec_rl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_rl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_rl(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_rl(vector short __a,
+                                                   vector unsigned short __b) {
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_rl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_rl(vector int __a,
+                                                 vector unsigned int __b) {
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_rl(vector signed long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vrld(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_altivec_vrld(__a, __b);
+}
+#endif
+
+/* vec_rlmi */
+#ifdef __POWER9_VECTOR__
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlmi(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  return __builtin_altivec_vrlwmi(__a, __c, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlmi(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned long long __c) {
+  return __builtin_altivec_vrldmi(__a, __c, __b);
+}
+
+/* vec_rlnm */
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_rlnm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  vector unsigned int OneByte = { 0x8, 0x8, 0x8, 0x8 };
+  return __builtin_altivec_vrlwnm(__a, ((__c << OneByte) | __b));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_rlnm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned long long __c) {
+  vector unsigned long long OneByte = { 0x8, 0x8 };
+  return __builtin_altivec_vrldnm(__a, ((__c << OneByte) | __b));
+}
+#endif
+
+/* vec_vrlb */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vrlb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vrlb(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vrlb((vector char)__a, __b);
+}
+
+/* vec_vrlh */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vrlh(vector short __a, vector unsigned short __b) {
+  return __builtin_altivec_vrlh(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vrlh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vrlh((vector short)__a, __b);
+}
+
+/* vec_vrlw */
+
+static __inline__ vector int __ATTRS_o_ai vec_vrlw(vector int __a,
+                                                   vector unsigned int __b) {
+  return __builtin_altivec_vrlw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vrlw(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vrlw((vector int)__a, __b);
+}
+
+/* vec_round */
+
+static __inline__ vector float __ATTRS_o_ai vec_round(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspi(__a);
+#else
+  return __builtin_altivec_vrfin(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_round(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+
+/* vec_rint */
+
+static __inline__ vector float __ATTRS_o_ai vec_rint(vector float __a) {
+  return __builtin_vsx_xvrspic(__a);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_rint(vector double __a) {
+  return __builtin_vsx_xvrdpic(__a);
+}
+
+/* vec_nearbyint */
+
+static __inline__ vector float __ATTRS_o_ai vec_nearbyint(vector float __a) {
+  return __builtin_vsx_xvrspi(__a);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_nearbyint(vector double __a) {
+  return __builtin_vsx_xvrdpi(__a);
+}
+#endif
+
+/* vec_vrfin */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrfin(vector float __a) {
+  return __builtin_altivec_vrfin(__a);
+}
+
+/* vec_sqrt */
+
+#ifdef __VSX__
+static __inline__ vector float __ATTRS_o_ai vec_sqrt(vector float __a) {
+  return __builtin_vsx_xvsqrtsp(__a);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_sqrt(vector double __a) {
+  return __builtin_vsx_xvsqrtdp(__a);
+}
+#endif
+
+/* vec_rsqrte */
+
+static __inline__ vector float __ATTRS_o_ai vec_rsqrte(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrsqrtesp(__a);
+#else
+  return __builtin_altivec_vrsqrtefp(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_rsqrte(vector double __a) {
+  return __builtin_vsx_xvrsqrtedp(__a);
+}
+#endif
+
+/* vec_vrsqrtefp */
+
+static __inline__ __vector float __attribute__((__always_inline__))
+vec_vrsqrtefp(vector float __a) {
+  return __builtin_altivec_vrsqrtefp(__a);
+}
+
+/* vec_sel */
+
+#define __builtin_altivec_vsel_4si vec_sel
+
+static __inline__ vector signed char __ATTRS_o_ai vec_sel(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector unsigned char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai vec_sel(
+    vector unsigned char __a, vector unsigned char __b, vector bool char __c) {
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_sel(vector bool char __a,
+                                                        vector bool char __b,
+                                                        vector bool char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sel(vector short __a,
+                                                    vector short __b,
+                                                    vector unsigned short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sel(vector short __a,
+                                                    vector short __b,
+                                                    vector bool short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector unsigned short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector bool short __c) {
+  return (__a & ~(vector unsigned short)__c) |
+         (__b & (vector unsigned short)__c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_sel(
+    vector bool short __a, vector bool short __b, vector unsigned short __c) {
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sel(vector int __a,
+                                                  vector int __b,
+                                                  vector unsigned int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sel(vector int __a,
+                                                  vector int __b,
+                                                  vector bool int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sel(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_sel(vector bool int __a,
+                                                       vector bool int __b,
+                                                       vector bool int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sel(vector float __a,
+                                                    vector float __b,
+                                                    vector unsigned int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sel(vector float __a,
+                                                    vector float __b,
+                                                    vector bool int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai
+vec_sel(vector double __a, vector double __b, vector bool long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                           ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
+  vector long long __res = ((vector long long)__a & ~(vector long long)__c) |
+                           ((vector long long)__b & (vector long long)__c);
+  return (vector double)__res;
+}
+#endif
+
+/* vec_vsel */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_vsel(
+    vector signed char __a, vector signed char __b, vector unsigned char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsel(vector signed char __a, vector signed char __b, vector bool char __c) {
+  return (__a & ~(vector signed char)__c) | (__b & (vector signed char)__c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsel(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai vec_vsel(
+    vector unsigned char __a, vector unsigned char __b, vector bool char __c) {
+  return (__a & ~(vector unsigned char)__c) | (__b & (vector unsigned char)__c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return (__a & ~(vector bool char)__c) | (__b & (vector bool char)__c);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vsel(vector bool char __a,
+                                                         vector bool char __b,
+                                                         vector bool char __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vsel(vector short __a, vector short __b, vector unsigned short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsel(vector short __a,
+                                                     vector short __b,
+                                                     vector bool short __c) {
+  return (__a & ~(vector short)__c) | (__b & (vector short)__c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsel(vector unsigned short __a, vector unsigned short __b,
+         vector bool short __c) {
+  return (__a & ~(vector unsigned short)__c) |
+         (__b & (vector unsigned short)__c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai vec_vsel(
+    vector bool short __a, vector bool short __b, vector unsigned short __c) {
+  return (__a & ~(vector bool short)__c) | (__b & (vector bool short)__c);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsel(vector bool short __a, vector bool short __b, vector bool short __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsel(vector int __a,
+                                                   vector int __b,
+                                                   vector unsigned int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsel(vector int __a,
+                                                   vector int __b,
+                                                   vector bool int __c) {
+  return (__a & ~(vector int)__c) | (__b & (vector int)__c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_vsel(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_vsel(
+    vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
+  return (__a & ~(vector unsigned int)__c) | (__b & (vector unsigned int)__c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
+  return (__a & ~(vector bool int)__c) | (__b & (vector bool int)__c);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vsel(vector bool int __a,
+                                                        vector bool int __b,
+                                                        vector bool int __c) {
+  return (__a & ~__c) | (__b & __c);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
+                                                     vector float __b,
+                                                     vector unsigned int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
+                                                     vector float __b,
+                                                     vector bool int __c) {
+  vector int __res = ((vector int)__a & ~(vector int)__c) |
+                     ((vector int)__b & (vector int)__c);
+  return (vector float)__res;
+}
+
+/* vec_sl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sl(vector signed char __a, vector unsigned char __b) {
+  return __a << (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sl(vector unsigned char __a, vector unsigned char __b) {
+  return __a << __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
+                                                   vector unsigned short __b) {
+  return __a << (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sl(vector unsigned short __a, vector unsigned short __b) {
+  return __a << __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
+                                                 vector unsigned int __b) {
+  return __a << (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sl(vector unsigned int __a, vector unsigned int __b) {
+  return __a << __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sl(vector signed long long __a, vector unsigned long long __b) {
+  return __a << (vector long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a << __b;
+}
+#endif
+
+/* vec_vslb */
+
+#define __builtin_altivec_vslb vec_vslb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vslb(vector signed char __a, vector unsigned char __b) {
+  return vec_sl(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vslb(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslh */
+
+#define __builtin_altivec_vslh vec_vslh
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vslh(vector short __a, vector unsigned short __b) {
+  return vec_sl(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vslh(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_vslw */
+
+#define __builtin_altivec_vslw vec_vslw
+
+static __inline__ vector int __ATTRS_o_ai vec_vslw(vector int __a,
+                                                   vector unsigned int __b) {
+  return vec_sl(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vslw(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sl(__a, __b);
+}
+
+/* vec_sld */
+
+#define __builtin_altivec_vsldoi_4si vec_sld
+
+static __inline__ vector signed char __ATTRS_o_ai vec_sld(
+    vector signed char __a, vector signed char __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sld(vector unsigned char __a, vector unsigned char __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sld(vector bool char __a, vector bool char __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_sld(
+    vector signed short __a, vector signed short __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sld(vector unsigned short __a, vector unsigned short __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sld(vector bool short __a, vector bool short __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sld(vector pixel __a,
+                                                    vector pixel __b,
+                                                    unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sld(vector signed int __a, vector signed int __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sld(
+    vector unsigned int __a, vector unsigned int __b, unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_sld(vector bool int __a,
+                                                       vector bool int __b,
+                                                       unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sld(vector float __a,
+                                                    vector float __b,
+                                                    unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_sld(vector bool long long __a, vector bool long long __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sld(vector signed long long __a, vector signed long long __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sld(vector unsigned long long __a, vector unsigned long long __b,
+        unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_sld(vector double __a,
+                                                     vector double __b,
+                                                     unsigned const int __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+#endif
+
+/* vec_sldw */
+static __inline__ vector signed char __ATTRS_o_ai vec_sldw(
+    vector signed char __a, vector signed char __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sldw(vector unsigned char __a, vector unsigned char __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai vec_sldw(
+    vector signed short __a, vector signed short __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sldw(vector unsigned short __a, vector unsigned short __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sldw(vector signed int __a, vector signed int __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_sldw(
+    vector unsigned int __a, vector unsigned int __b, unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sldw(vector signed long long __a, vector signed long long __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b,
+         unsigned const int __c) {
+  return vec_sld(__a, __b, ((__c << 2) & 0x0F));
+}
+#endif
+
+#ifdef __POWER9_VECTOR__
+/* vec_slv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slv(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vslv(__a, __b);
+}
+
+/* vec_srv */
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srv(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsrv(__a, __b);
+}
+#endif
+
+/* vec_vsldoi */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsldoi(vector signed char __a, vector signed char __b, unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai vec_vsldoi(
+    vector unsigned char __a, vector unsigned char __b, unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsldoi(vector short __a,
+                                                       vector short __b,
+                                                       unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai vec_vsldoi(
+    vector unsigned short __a, vector unsigned short __b, unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsldoi(vector pixel __a,
+                                                       vector pixel __b,
+                                                       unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsldoi(vector int __a,
+                                                     vector int __b,
+                                                     unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_vsldoi(
+    vector unsigned int __a, vector unsigned int __b, unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsldoi(vector float __a,
+                                                       vector float __b,
+                                                       unsigned char __c) {
+  unsigned char __d = __c & 0x0F;
+#ifdef __LITTLE_ENDIAN__
+  return vec_perm(
+      __b, __a, (vector unsigned char)(16 - __d, 17 - __d, 18 - __d, 19 - __d,
+                                       20 - __d, 21 - __d, 22 - __d, 23 - __d,
+                                       24 - __d, 25 - __d, 26 - __d, 27 - __d,
+                                       28 - __d, 29 - __d, 30 - __d, 31 - __d));
+#else
+  return vec_perm(
+      __a, __b,
+      (vector unsigned char)(__d, __d + 1, __d + 2, __d + 3, __d + 4, __d + 5,
+                             __d + 6, __d + 7, __d + 8, __d + 9, __d + 10,
+                             __d + 11, __d + 12, __d + 13, __d + 14, __d + 15));
+#endif
+}
+
+/* vec_sll */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sll(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sll(vector unsigned char __a, vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_sll(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                                    vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sll(vector short __a,
+                                                    vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sll(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_sll(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sll(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                                  vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sll(vector int __a,
+                                                  vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sll(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_sll(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vsl((vector int)__a,
+                                                        (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsl((vector int)__a,
+                                                          (vector int)__b);
+}
+#endif
+
+/* vec_vsl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsl(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsl((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsl(vector unsigned char __a, vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsl((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsl(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsl((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                                    vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsl(vector short __a,
+                                                    vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsl((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsl((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsl(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsl((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                                  vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsl(vector int __a,
+                                                  vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsl(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsl((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsl(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsl((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_slo */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_slo(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_slo(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                                    vector signed char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_slo(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_slo(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                                    vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_slo(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_slo(vector int __a,
+                                                  vector signed char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_slo(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_slo(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                                    vector signed char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_slo(vector float __a,
+                                                    vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector signed char __b) {
+  return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_slo(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vslo((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector signed char __b) {
+  return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+                                                           (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_slo(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vslo((vector int)__a,
+                                                           (vector int)__b);
+}
+#endif
+
+/* vec_vslo */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vslo(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vslo((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vslo(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vslo((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                                     vector signed char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vslo(vector short __a,
+                                                     vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vslo(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vslo((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                                     vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vslo(vector pixel __a,
+                                                     vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vslo(vector int __a,
+                                                   vector signed char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vslo(vector int __a,
+                                                   vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vslo(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vslo(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vslo((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                                     vector signed char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vslo(vector float __a,
+                                                     vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vslo((vector int)__a, (vector int)__b);
+}
+
+/* vec_splat */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_splat(vector signed char __a, unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_splat(vector unsigned char __a, unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_splat(vector bool char __a, unsigned const int __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b & 0x0F));
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_splat(vector signed short __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_splat(vector unsigned short __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_splat(vector bool short __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_splat(vector pixel __a,
+                                                      unsigned const int __b) {
+  unsigned char b0 = (__b & 0x07) * 2;
+  unsigned char b1 = b0 + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b0, b1, b0, b1, b0, b1, b0, b1,
+                                         b0, b1, b0, b1, b0, b1));
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_splat(vector signed int __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_splat(vector unsigned int __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_splat(vector bool int __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_splat(vector float __a,
+                                                      unsigned const int __b) {
+  unsigned char b0 = (__b & 0x03) * 4;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b0, b1, b2, b3, b0, b1,
+                                         b2, b3, b0, b1, b2, b3));
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_splat(vector double __a,
+                                                       unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
+}
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_splat(vector bool long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
+}
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_splat(vector signed long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
+}
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_splat(vector unsigned long long __a, unsigned const int __b) {
+  unsigned char b0 = (__b & 0x01) * 8;
+  unsigned char b1 = b0 + 1, b2 = b0 + 2, b3 = b0 + 3, b4 = b0 + 4, b5 = b0 + 5,
+                b6 = b0 + 6, b7 = b0 + 7;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(b0, b1, b2, b3, b4, b5, b6, b7, b0, b1,
+                                         b2, b3, b4, b5, b6, b7));
+}
+#endif
+
+/* vec_vspltb */
+
+#define __builtin_altivec_vspltb vec_vspltb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vspltb(vector signed char __a, unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vspltb(vector unsigned char __a, unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vspltb(vector bool char __a,
+                                                           unsigned char __b) {
+  return vec_perm(__a, __a, (vector unsigned char)(__b));
+}
+
+/* vec_vsplth */
+
+#define __builtin_altivec_vsplth vec_vsplth
+
+static __inline__ vector short __ATTRS_o_ai vec_vsplth(vector short __a,
+                                                       unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsplth(vector unsigned short __a, unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsplth(vector bool short __a, unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsplth(vector pixel __a,
+                                                       unsigned char __b) {
+  __b *= 2;
+  unsigned char b1 = __b + 1;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, __b, b1, __b, b1, __b, b1,
+                                         __b, b1, __b, b1, __b, b1, __b, b1));
+}
+
+/* vec_vspltw */
+
+#define __builtin_altivec_vspltw vec_vspltw
+
+static __inline__ vector int __ATTRS_o_ai vec_vspltw(vector int __a,
+                                                     unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vspltw(vector unsigned int __a, unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vspltw(vector bool int __a,
+                                                          unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vspltw(vector float __a,
+                                                       unsigned char __b) {
+  __b *= 4;
+  unsigned char b1 = __b + 1, b2 = __b + 2, b3 = __b + 3;
+  return vec_perm(__a, __a,
+                  (vector unsigned char)(__b, b1, b2, b3, __b, b1, b2, b3, __b,
+                                         b1, b2, b3, __b, b1, b2, b3));
+}
+
+/* vec_splat_s8 */
+
+#define __builtin_altivec_vspltisb vec_splat_s8
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector signed char __ATTRS_o_ai
+vec_splat_s8(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+/* vec_vspltisb */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vspltisb(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+/* vec_splat_s16 */
+
+#define __builtin_altivec_vspltish vec_splat_s16
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a) {
+  return (vector short)(__a);
+}
+
+/* vec_vspltish */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector short __ATTRS_o_ai vec_vspltish(signed char __a) {
+  return (vector short)(__a);
+}
+
+/* vec_splat_s32 */
+
+#define __builtin_altivec_vspltisw vec_splat_s32
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a) {
+  return (vector int)(__a);
+}
+
+/* vec_vspltisw */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector int __ATTRS_o_ai vec_vspltisw(signed char __a) {
+  return (vector int)(__a);
+}
+
+/* vec_splat_u8 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_splat_u8(unsigned char __a) {
+  return (vector unsigned char)(__a);
+}
+
+/* vec_splat_u16 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_splat_u16(signed char __a) {
+  return (vector unsigned short)(__a);
+}
+
+/* vec_splat_u32 */
+
+// FIXME: parameter should be treated as 5-bit signed literal
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_splat_u32(signed char __a) {
+  return (vector unsigned int)(__a);
+}
+
+/* vec_sr */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sr(vector signed char __a, vector unsigned char __b) {
+  vector unsigned char __res = (vector unsigned char)__a >> __b;
+  return (vector signed char)__res;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sr(vector unsigned char __a, vector unsigned char __b) {
+  return __a >> __b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_sr(vector signed short __a, vector unsigned short __b) {
+  vector unsigned short __res = (vector unsigned short)__a >> __b;
+  return (vector signed short)__res;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sr(vector unsigned short __a, vector unsigned short __b) {
+  return __a >> __b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sr(vector signed int __a, vector unsigned int __b) {
+  vector unsigned int __res = (vector unsigned int)__a >> __b;
+  return (vector signed int)__res;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sr(vector unsigned int __a, vector unsigned int __b) {
+  return __a >> __b;
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sr(vector signed long long __a, vector unsigned long long __b) {
+  vector unsigned long long __res = (vector unsigned long long)__a >> __b;
+  return (vector signed long long)__res;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sr(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a >> __b;
+}
+#endif
+
+/* vec_vsrb */
+
+#define __builtin_altivec_vsrb vec_vsrb
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsrb(vector signed char __a, vector unsigned char __b) {
+  return __a >> (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsrb(vector unsigned char __a, vector unsigned char __b) {
+  return __a >> __b;
+}
+
+/* vec_vsrh */
+
+#define __builtin_altivec_vsrh vec_vsrh
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vsrh(vector short __a, vector unsigned short __b) {
+  return __a >> (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsrh(vector unsigned short __a, vector unsigned short __b) {
+  return __a >> __b;
+}
+
+/* vec_vsrw */
+
+#define __builtin_altivec_vsrw vec_vsrw
+
+static __inline__ vector int __ATTRS_o_ai vec_vsrw(vector int __a,
+                                                   vector unsigned int __b) {
+  return __a >> (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsrw(vector unsigned int __a, vector unsigned int __b) {
+  return __a >> __b;
+}
+
+/* vec_sra */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sra(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sra(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sra(vector short __a,
+                                                    vector unsigned short __b) {
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sra(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sra(vector int __a,
+                                                  vector unsigned int __b) {
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sra(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sra(vector signed long long __a, vector unsigned long long __b) {
+  return __a >> __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)((vector signed long long)__a >> __b);
+}
+#endif
+
+/* vec_vsrab */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsrab(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsrab(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsrab((vector char)__a, __b);
+}
+
+/* vec_vsrah */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vsrah(vector short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsrah(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsrah(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsrah((vector short)__a, __b);
+}
+
+/* vec_vsraw */
+
+static __inline__ vector int __ATTRS_o_ai vec_vsraw(vector int __a,
+                                                    vector unsigned int __b) {
+  return __builtin_altivec_vsraw(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsraw(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsraw((vector int)__a, __b);
+}
+
+/* vec_srl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_srl(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_srl(vector unsigned char __a, vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_srl(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                                    vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_srl(vector short __a,
+                                                    vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_srl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_srl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_srl(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                                  vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_srl(vector int __a,
+                                                  vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_srl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_srl(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vsr((vector int)__a,
+                                                        (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsr((vector int)__a,
+                                                          (vector int)__b);
+}
+#endif
+
+/* vec_vsr */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsr(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_altivec_vsr((vector int)__a,
+                                                   (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned short __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsr(vector unsigned char __a, vector unsigned int __b) {
+  return (vector unsigned char)__builtin_altivec_vsr((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsr(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_altivec_vsr((vector int)__a,
+                                                 (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                                    vector unsigned short __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsr(vector short __a,
+                                                    vector unsigned int __b) {
+  return (vector short)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsr(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_altivec_vsr((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsr(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_altivec_vsr((vector int)__a,
+                                                  (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                                    vector unsigned short __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsr(vector pixel __a,
+                                                    vector unsigned int __b) {
+  return (vector pixel)__builtin_altivec_vsr((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                                  vector unsigned short __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsr(vector int __a,
+                                                  vector unsigned int __b) {
+  return (vector int)__builtin_altivec_vsr(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsr(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_altivec_vsr((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsr(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_altivec_vsr((vector int)__a,
+                                                (vector int)__b);
+}
+
+/* vec_sro */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sro(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sro(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                                    vector signed char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sro(vector short __a,
+                                                    vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sro(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                                    vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_sro(vector pixel __a,
+                                                    vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sro(vector int __a,
+                                                  vector signed char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sro(vector int __a,
+                                                  vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sro(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                                    vector signed char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_sro(vector float __a,
+                                                    vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector signed char __b) {
+  return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sro(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_altivec_vsro((vector int)__a,
+                                                         (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector signed char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+                                                           (vector int)__b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sro(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_altivec_vsro((vector int)__a,
+                                                           (vector int)__b);
+}
+#endif
+
+/* vec_vsro */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsro(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_altivec_vsro((vector int)__a,
+                                                    (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector signed char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsro(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)__builtin_altivec_vsro((vector int)__a,
+                                                      (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                                     vector signed char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsro(vector short __a,
+                                                     vector unsigned char __b) {
+  return (vector short)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector signed char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsro(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_altivec_vsro((vector int)__a,
+                                                       (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                                     vector signed char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_vsro(vector pixel __a,
+                                                     vector unsigned char __b) {
+  return (vector pixel)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsro(vector int __a,
+                                                   vector signed char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsro(vector int __a,
+                                                   vector unsigned char __b) {
+  return (vector int)__builtin_altivec_vsro(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector signed char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsro(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_altivec_vsro((vector int)__a,
+                                                     (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                                     vector signed char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsro(vector float __a,
+                                                     vector unsigned char __b) {
+  return (vector float)__builtin_altivec_vsro((vector int)__a, (vector int)__b);
+}
+
+/* vec_st */
+
+static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                           vector signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b,
+                                           signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                           vector unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned char __a, int __b,
+                                           unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                           signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                           unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool char __a, int __b,
+                                           vector bool char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector short __a, int __b,
+                                           vector short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector short __a, int __b,
+                                           short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                           vector unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned short __a, int __b,
+                                           unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                           short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                           unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool short __a, int __b,
+                                           vector bool short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                           short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                           unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector pixel __a, int __b,
+                                           vector pixel *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector int __a, int __b,
+                                           vector int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                           vector unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector unsigned int __a, int __b,
+                                           unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                           int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                           unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector bool int __a, int __b,
+                                           vector bool int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector float __a, int __b,
+                                           vector float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_st(vector float __a, int __b,
+                                           float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_stvx */
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                             vector signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector signed char __a, int __b,
+                                             signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                             vector unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned char __a, int __b,
+                                             unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                             signed char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                             unsigned char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool char __a, int __b,
+                                             vector bool char *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector short __a, int __b,
+                                             vector short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector short __a, int __b,
+                                             short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                             vector unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned short __a, int __b,
+                                             unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                             short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                             unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool short __a, int __b,
+                                             vector bool short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                             short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                             unsigned short *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector pixel __a, int __b,
+                                             vector pixel *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector int __a, int __b,
+                                             vector int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector int __a, int __b,
+                                             int *__c) {
+  __builtin_altivec_stvx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                             vector unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector unsigned int __a, int __b,
+                                             unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                             int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                             unsigned int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector bool int __a, int __b,
+                                             vector bool int *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector float __a, int __b,
+                                             vector float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvx(vector float __a, int __b,
+                                             float *__c) {
+  __builtin_altivec_stvx((vector int)__a, __b, __c);
+}
+
+/* vec_ste */
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector signed char __a, int __b,
+                                            signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector unsigned char __a, int __b,
+                                            unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                            signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool char __a, int __b,
+                                            unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector short __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector unsigned short __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool short __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool short __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector pixel __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector pixel __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector unsigned int __a, int __b,
+                                            unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool int __a, int __b,
+                                            int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector bool int __a, int __b,
+                                            unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_ste(vector float __a, int __b,
+                                            float *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stvebx */
+
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector signed char __a, int __b,
+                                               signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                               signed char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvebx(vector bool char __a, int __b,
+                                               unsigned char *__c) {
+  __builtin_altivec_stvebx((vector char)__a, __b, __c);
+}
+
+/* vec_stvehx */
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector short __a, int __b,
+                                               short *__c) {
+  __builtin_altivec_stvehx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                               short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector bool short __a, int __b,
+                                               unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b,
+                                               short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvehx(vector pixel __a, int __b,
+                                               unsigned short *__c) {
+  __builtin_altivec_stvehx((vector short)__a, __b, __c);
+}
+
+/* vec_stvewx */
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector int __a, int __b,
+                                               int *__c) {
+  __builtin_altivec_stvewx(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b,
+                                               int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector bool int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvewx(vector float __a, int __b,
+                                               float *__c) {
+  __builtin_altivec_stvewx((vector int)__a, __b, __c);
+}
+
+/* vec_stl */
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                            vector signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector signed char __a, int __b,
+                                            signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                            vector unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned char __a, int __b,
+                                            unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                            signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                            unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool char __a, int __b,
+                                            vector bool char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector short __a, int __b,
+                                            vector short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector short __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                            vector unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned short __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool short __a, int __b,
+                                            vector bool short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                            short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                            unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector pixel __a, int __b,
+                                            vector pixel *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector int __a, int __b,
+                                            vector int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector int __a, int __b, int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                            vector unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector unsigned int __a, int __b,
+                                            unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                            int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                            unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector bool int __a, int __b,
+                                            vector bool int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector float __a, int __b,
+                                            vector float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stl(vector float __a, int __b,
+                                            float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_stvxl */
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                              vector signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector signed char __a, int __b,
+                                              signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                              vector unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned char __a, int __b,
+                                              unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                              signed char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                              unsigned char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool char __a, int __b,
+                                              vector bool char *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector short __a, int __b,
+                                              vector short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector short __a, int __b,
+                                              short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned short __a,
+                                              int __b,
+                                              vector unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned short __a,
+                                              int __b, unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                              short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                              unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool short __a, int __b,
+                                              vector bool short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                              short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                              unsigned short *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector pixel __a, int __b,
+                                              vector pixel *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector int __a, int __b,
+                                              vector int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector int __a, int __b,
+                                              int *__c) {
+  __builtin_altivec_stvxl(__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                              vector unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector unsigned int __a, int __b,
+                                              unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                              int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                              unsigned int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector bool int __a, int __b,
+                                              vector bool int *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector float __a, int __b,
+                                              vector float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvxl(vector float __a, int __b,
+                                              float *__c) {
+  __builtin_altivec_stvxl((vector int)__a, __b, __c);
+}
+
+/* vec_sub */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector signed char __b) {
+  return __a - __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a - __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sub(vector signed char __a, vector bool char __b) {
+  return __a - (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector unsigned char __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sub(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a - __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_sub(vector unsigned char __a, vector bool char __b) {
+  return __a - (vector unsigned char)__b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sub(vector short __a,
+                                                    vector short __b) {
+  return __a - __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sub(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a - __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_sub(vector short __a,
+                                                    vector bool short __b) {
+  return __a - (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector unsigned short __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sub(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a - __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_sub(vector unsigned short __a, vector bool short __b) {
+  return __a - (vector unsigned short)__b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sub(vector int __a,
+                                                  vector int __b) {
+  return __a - __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sub(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a - __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sub(vector int __a,
+                                                  vector bool int __b) {
+  return __a - (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector unsigned int __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sub(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a - __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sub(vector unsigned int __a, vector bool int __b) {
+  return __a - (vector unsigned int)__b;
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_sub(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_sub(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a - __b;
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_sub(vector signed long long __a, vector signed long long __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_sub(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a - __b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_sub(vector double __a,
+                                                     vector double __b) {
+  return __a - __b;
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_sub(vector float __a,
+                                                    vector float __b) {
+  return __a - __b;
+}
+
+/* vec_vsububm */
+
+#define __builtin_altivec_vsububm vec_vsububm
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector signed char __b) {
+  return __a - __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a - __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsububm(vector signed char __a, vector bool char __b) {
+  return __a - (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector unsigned char __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a - __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububm(vector unsigned char __a, vector bool char __b) {
+  return __a - (vector unsigned char)__b;
+}
+
+/* vec_vsubuhm */
+
+#define __builtin_altivec_vsubuhm vec_vsubuhm
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                                        vector short __b) {
+  return __a - __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubuhm(vector bool short __a,
+                                                        vector short __b) {
+  return (vector short)__a - __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubuhm(vector short __a,
+                                                        vector bool short __b) {
+  return __a - (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector unsigned short __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a - __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhm(vector unsigned short __a, vector bool short __b) {
+  return __a - (vector unsigned short)__b;
+}
+
+/* vec_vsubuwm */
+
+#define __builtin_altivec_vsubuwm vec_vsubuwm
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubuwm(vector int __a,
+                                                      vector int __b) {
+  return __a - __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubuwm(vector bool int __a,
+                                                      vector int __b) {
+  return (vector int)__a - __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubuwm(vector int __a,
+                                                      vector bool int __b) {
+  return __a - (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector unsigned int __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a - __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuwm(vector unsigned int __a, vector bool int __b) {
+  return __a - (vector unsigned int)__b;
+}
+
+/* vec_vsubfp */
+
+#define __builtin_altivec_vsubfp vec_vsubfp
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vsubfp(vector float __a, vector float __b) {
+  return __a - __b;
+}
+
+/* vec_subc */
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subc(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_altivec_vsubcuw((vector unsigned int)__a,
+                                                      (vector unsigned int) __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_subc(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_subc(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+/* vec_vsubcuw */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vsubcuw(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubcuw(__a, __b);
+}
+
+/* vec_subs */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_subs(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_subs(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_subs(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_subs(vector short __a,
+                                                     vector short __b) {
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_subs(vector bool short __a,
+                                                     vector short __b) {
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_subs(vector short __a,
+                                                     vector bool short __b) {
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_subs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_subs(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_subs(vector int __a,
+                                                   vector int __b) {
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_subs(vector bool int __a,
+                                                   vector int __b) {
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_subs(vector int __a,
+                                                   vector bool int __b) {
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subs(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subs(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+/* vec_vsubsbs */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vsubsbs(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector bool char __a, vector signed char __b) {
+  return __builtin_altivec_vsubsbs((vector signed char)__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsubsbs(vector signed char __a, vector bool char __b) {
+  return __builtin_altivec_vsubsbs(__a, (vector signed char)__b);
+}
+
+/* vec_vsububs */
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsububs(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector bool char __a, vector unsigned char __b) {
+  return __builtin_altivec_vsububs((vector unsigned char)__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsububs(vector unsigned char __a, vector bool char __b) {
+  return __builtin_altivec_vsububs(__a, (vector unsigned char)__b);
+}
+
+/* vec_vsubshs */
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                                        vector short __b) {
+  return __builtin_altivec_vsubshs(__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubshs(vector bool short __a,
+                                                        vector short __b) {
+  return __builtin_altivec_vsubshs((vector short)__a, __b);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vsubshs(vector short __a,
+                                                        vector bool short __b) {
+  return __builtin_altivec_vsubshs(__a, (vector short)__b);
+}
+
+/* vec_vsubuhs */
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector bool short __a, vector unsigned short __b) {
+  return __builtin_altivec_vsubuhs((vector unsigned short)__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsubuhs(vector unsigned short __a, vector bool short __b) {
+  return __builtin_altivec_vsubuhs(__a, (vector unsigned short)__b);
+}
+
+/* vec_vsubsws */
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubsws(vector int __a,
+                                                      vector int __b) {
+  return __builtin_altivec_vsubsws(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubsws(vector bool int __a,
+                                                      vector int __b) {
+  return __builtin_altivec_vsubsws((vector int)__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vsubsws(vector int __a,
+                                                      vector bool int __b) {
+  return __builtin_altivec_vsubsws(__a, (vector int)__b);
+}
+
+/* vec_vsubuws */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubuws(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector bool int __a, vector unsigned int __b) {
+  return __builtin_altivec_vsubuws((vector unsigned int)__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsubuws(vector unsigned int __a, vector bool int __b) {
+  return __builtin_altivec_vsubuws(__a, (vector unsigned int)__b);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+/* vec_vsubuqm */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vsubuqm(vector signed __int128 __a, vector signed __int128 __b) {
+  return __a - __b;
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vsubuqm(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __a - __b;
+}
+
+/* vec_vsubeuqm */
+
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vsubeuqm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_sube(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_sube(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubeuqm(__a, __b, __c);
+}
+
+/* vec_vsubcuq */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vsubcuq(vector signed __int128 __a, vector signed __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vsubcuq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return __builtin_altivec_vsubcuq(__a, __b);
+}
+
+/* vec_vsubecuq */
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_vsubecuq(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_vsubecuq(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_subec(vector signed int __a, vector signed int __b,
+             vector signed int __c) {
+  return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_subec(vector unsigned int __a, vector unsigned int __b,
+             vector unsigned int __c) {
+  return vec_addec(__a, ~__b, __c);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_subec(vector signed __int128 __a, vector signed __int128 __b,
+             vector signed __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_subec(vector unsigned __int128 __a, vector unsigned __int128 __b,
+             vector unsigned __int128 __c) {
+  return __builtin_altivec_vsubecuq(__a, __b, __c);
+}
+#endif // defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_sube(vector signed int __a, vector signed int __b,
+         vector signed int __c) {
+  vector signed int __mask = {1, 1, 1, 1};
+  vector signed int __carry = __c & __mask;
+  return vec_adde(__a, ~__b, __carry);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sube(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned int __c) {
+  vector unsigned int __mask = {1, 1, 1, 1};
+  vector unsigned int __carry = __c & __mask;
+  return vec_adde(__a, ~__b, __carry);
+}
+/* vec_sum4s */
+
+static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a,
+                                                    vector int __b) {
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_sum4s(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed short __a,
+                                                    vector int __b) {
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_vsum4sbs */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vsum4sbs(vector signed char __a, vector int __b) {
+  return __builtin_altivec_vsum4sbs(__a, __b);
+}
+
+/* vec_vsum4ubs */
+
+static __inline__ vector unsigned int __attribute__((__always_inline__))
+vec_vsum4ubs(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_altivec_vsum4ubs(__a, __b);
+}
+
+/* vec_vsum4shs */
+
+static __inline__ vector int __attribute__((__always_inline__))
+vec_vsum4shs(vector signed short __a, vector int __b) {
+  return __builtin_altivec_vsum4shs(__a, __b);
+}
+
+/* vec_sum2s */
+
+/* The vsum2sws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian elements
+   1 and 3 (little-endian element 0 and 2).  For ease of porting the
+   programmer wants elements 1 and 3 in both cases, so for little
+   endian we must perform some permutes.  */
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_sum2s(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)vec_perm(
+      __b, __b, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)vec_perm(
+      __c, __c, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_vsum2sws */
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_vsum2sws(vector int __a, vector int __b) {
+#ifdef __LITTLE_ENDIAN__
+  vector int __c = (vector signed int)vec_perm(
+      __b, __b, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+  __c = __builtin_altivec_vsum2sws(__a, __c);
+  return (vector signed int)vec_perm(
+      __c, __c, (vector unsigned char)(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15,
+                                       8, 9, 10, 11));
+#else
+  return __builtin_altivec_vsum2sws(__a, __b);
+#endif
+}
+
+/* vec_sums */
+
+/* The vsumsws instruction has a big-endian bias, so that the second
+   input vector and the result always reference big-endian element 3
+   (little-endian element 0).  For ease of porting the programmer
+   wants element 3 in both cases, so for little endian we must perform
+   some permutes.  */
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_sums(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_vsumsws */
+
+static __inline__ vector signed int __attribute__((__always_inline__))
+vec_vsumsws(vector signed int __a, vector signed int __b) {
+#ifdef __LITTLE_ENDIAN__
+  __b = (vector signed int)vec_splat(__b, 3);
+  __b = __builtin_altivec_vsumsws(__a, __b);
+  return (vector signed int)(0, 0, 0, __b[0]);
+#else
+  return __builtin_altivec_vsumsws(__a, __b);
+#endif
+}
+
+/* vec_trunc */
+
+static __inline__ vector float __ATTRS_o_ai vec_trunc(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvrspiz(__a);
+#else
+  return __builtin_altivec_vrfiz(__a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ vector double __ATTRS_o_ai vec_trunc(vector double __a) {
+  return __builtin_vsx_xvrdpiz(__a);
+}
+#endif
+
+/* vec_vrfiz */
+
+static __inline__ vector float __attribute__((__always_inline__))
+vec_vrfiz(vector float __a) {
+  return __builtin_altivec_vrfiz(__a);
+}
+
+/* vec_unpackh */
+
+/* The vector unpack instructions all have a big-endian bias, so for
+   little endian we must reverse the meanings of "high" and "low."  */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_unpackh(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_unpackh(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_unpackh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_unpackh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unpackh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector long long __ATTRS_o_ai vec_unpackh(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsw(__a);
+#else
+  return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_unpackh(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackh(vector float __a) {
+  return (vector double)(__a[0], __a[1]);
+}
+#endif
+
+/* vec_vupkhsb */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vupkhsb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsb((vector char)__a);
+#else
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vupkhsb(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#endif
+}
+
+/* vec_vupkhsh */
+
+static __inline__ vector int __ATTRS_o_ai vec_vupkhsh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsh(__a);
+#else
+  return __builtin_altivec_vupkhsh(__a);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vupkhsh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vupkhsh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#endif
+}
+
+/* vec_vupkhsw */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector long long __ATTRS_o_ai vec_vupkhsw(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupklsw(__a);
+#else
+  return __builtin_altivec_vupkhsw(__a);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vupkhsw(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_unpackl */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_unpackl(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_unpackl(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_unpackl(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_unpackl(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_unpackl(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector long long __ATTRS_o_ai vec_unpackl(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsw(__a);
+#else
+  return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_unpackl(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_unpackl(vector float __a) {
+  return (vector double)(__a[2], __a[3]);
+}
+#endif
+
+/* vec_vupklsb */
+
+static __inline__ vector short __ATTRS_o_ai
+vec_vupklsb(vector signed char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return __builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vupklsb(vector bool char __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool short)__builtin_altivec_vupkhsb((vector char)__a);
+#else
+  return (vector bool short)__builtin_altivec_vupklsb((vector char)__a);
+#endif
+}
+
+/* vec_vupklsh */
+
+static __inline__ vector int __ATTRS_o_ai vec_vupklsh(vector short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsh(__a);
+#else
+  return __builtin_altivec_vupklsh(__a);
+#endif
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vupklsh(vector bool short __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool int)__builtin_altivec_vupkhsh((vector short)__a);
+#else
+  return (vector bool int)__builtin_altivec_vupklsh((vector short)__a);
+#endif
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vupklsh(vector pixel __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector unsigned int)__builtin_altivec_vupkhpx((vector short)__a);
+#else
+  return (vector unsigned int)__builtin_altivec_vupklpx((vector short)__a);
+#endif
+}
+
+/* vec_vupklsw */
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector long long __ATTRS_o_ai vec_vupklsw(vector int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vupkhsw(__a);
+#else
+  return __builtin_altivec_vupklsw(__a);
+#endif
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vupklsw(vector bool int __a) {
+#ifdef __LITTLE_ENDIAN__
+  return (vector bool long long)__builtin_altivec_vupkhsw((vector int)__a);
+#else
+  return (vector bool long long)__builtin_altivec_vupklsw((vector int)__a);
+#endif
+}
+#endif
+
+/* vec_vsx_ld */
+
+#ifdef __VSX__
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool int *__b) {
+  return (vector bool int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed int *__b) {
+  return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed int *__b) {
+  return (vector signed int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned int *__b) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned int *__b) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector float *__b) {
+  return (vector float)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vsx_ld(int __a,
+                                                       const float *__b) {
+  return (vector float)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed long long *__b) {
+  return (vector signed long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned long long *__b) {
+  return (vector unsigned long long)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector double *__b) {
+  return (vector double)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_vsx_ld(int __a, const double *__b) {
+  return (vector double)__builtin_vsx_lxvd2x(__a, __b);
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool short *__b) {
+  return (vector bool short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed short *__b) {
+  return (vector signed short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed short *__b) {
+  return (vector signed short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned short *__b) {
+  return (vector unsigned short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned short *__b) {
+  return (vector unsigned short)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector bool char *__b) {
+  return (vector bool char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector signed char *__b) {
+  return (vector signed char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vsx_ld(int __a, const signed char *__b) {
+  return (vector signed char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsx_ld(int __a, const vector unsigned char *__b) {
+  return (vector unsigned char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vsx_ld(int __a, const unsigned char *__b) {
+  return (vector unsigned char)__builtin_vsx_lxvw4x(__a, __b);
+}
+
+#endif
+
+/* vec_vsx_st */
+
+#ifdef __VSX__
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               vector bool int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
+                                               vector signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed int __a, int __b,
+                                               signed int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned int __a, int __b,
+                                               vector unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
+                                               vector float *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector float __a, int __b,
+                                               float *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed long long __a,
+                                               int __b,
+                                               vector signed long long *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned long long __a,
+                                               int __b,
+                                               vector unsigned long long *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
+                                               vector double *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector double __a, int __b,
+                                               double *__c) {
+  __builtin_vsx_stxvd2x((vector double)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               vector bool short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool short __a, int __b,
+                                               unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed short __a, int __b,
+                                               vector signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed short __a, int __b,
+                                               signed short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned short __a,
+                                               int __b,
+                                               vector unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               vector bool char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector bool char __a, int __b,
+                                               unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed char __a, int __b,
+                                               vector signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector signed char __a, int __b,
+                                               signed char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a,
+                                               int __b,
+                                               vector unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
+  __builtin_vsx_stxvw4x((vector int)__a, __b, __c);
+}
+
+#endif
+
+/* vec_xor */
+
+#define __builtin_altivec_vxor vec_xor
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector signed char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a ^ __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xor(vector signed char __a, vector bool char __b) {
+  return __a ^ (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector unsigned char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a ^ __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xor(vector unsigned char __a, vector bool char __b) {
+  return __a ^ (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_xor(vector bool char __a,
+                                                        vector bool char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_xor(vector short __a,
+                                                    vector short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_xor(vector bool short __a,
+                                                    vector short __b) {
+  return (vector short)__a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_xor(vector short __a,
+                                                    vector bool short __b) {
+  return __a ^ (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector unsigned short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a ^ __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xor(vector unsigned short __a, vector bool short __b) {
+  return __a ^ (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_xor(vector bool short __a, vector bool short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_xor(vector int __a,
+                                                  vector int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                  vector int __b) {
+  return (vector int)__a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_xor(vector int __a,
+                                                  vector bool int __b) {
+  return __a ^ (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector unsigned int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xor(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a ^ __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xor(vector unsigned int __a, vector bool int __b) {
+  return __a ^ (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                       vector bool int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xor(vector float __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xor(vector bool int __a,
+                                                    vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_xor(vector float __a,
+                                                    vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xor(vector signed long long __a, vector signed long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a ^ __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xor(vector signed long long __a, vector bool long long __b) {
+  return __a ^ (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a ^ __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xor(vector unsigned long long __a, vector bool long long __b) {
+  return __a ^ (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_xor(vector bool long long __a, vector bool long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xor(vector double __a,
+                                                     vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_xor(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+
+static __inline__ vector double __ATTRS_o_ai vec_xor(vector bool long long __a,
+                                                     vector double __b) {
+  return (vector double)((vector unsigned long long)__a ^
+                         (vector unsigned long long)__b);
+}
+#endif
+
+/* vec_vxor */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector signed char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector signed char __b) {
+  return (vector signed char)__a ^ __b;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vxor(vector signed char __a, vector bool char __b) {
+  return __a ^ (vector signed char)__b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector unsigned char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vxor(vector bool char __a, vector unsigned char __b) {
+  return (vector unsigned char)__a ^ __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vxor(vector unsigned char __a, vector bool char __b) {
+  return __a ^ (vector unsigned char)__b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_vxor(vector bool char __a,
+                                                         vector bool char __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vxor(vector short __a,
+                                                     vector short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vxor(vector bool short __a,
+                                                     vector short __b) {
+  return (vector short)__a ^ __b;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_vxor(vector short __a,
+                                                     vector bool short __b) {
+  return __a ^ (vector short)__b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector unsigned short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector unsigned short __b) {
+  return (vector unsigned short)__a ^ __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_vxor(vector unsigned short __a, vector bool short __b) {
+  return __a ^ (vector unsigned short)__b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_vxor(vector bool short __a, vector bool short __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vxor(vector int __a,
+                                                   vector int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                   vector int __b) {
+  return (vector int)__a ^ __b;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_vxor(vector int __a,
+                                                   vector bool int __b) {
+  return __a ^ (vector int)__b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector unsigned int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vxor(vector bool int __a, vector unsigned int __b) {
+  return (vector unsigned int)__a ^ __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_vxor(vector unsigned int __a, vector bool int __b) {
+  return __a ^ (vector unsigned int)__b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                        vector bool int __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vxor(vector float __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vxor(vector bool int __a,
+                                                     vector float __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_vxor(vector float __a,
+                                                     vector bool int __b) {
+  vector unsigned int __res =
+      (vector unsigned int)__a ^ (vector unsigned int)__b;
+  return (vector float)__res;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vxor(vector signed long long __a, vector signed long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector signed long long __b) {
+  return (vector signed long long)__a ^ __b;
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_vxor(vector signed long long __a, vector bool long long __b) {
+  return __a ^ (vector signed long long)__b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a ^ __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__a ^ __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_vxor(vector unsigned long long __a, vector bool long long __b) {
+  return __a ^ (vector unsigned long long)__b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_vxor(vector bool long long __a, vector bool long long __b) {
+  return __a ^ __b;
+}
+#endif
+
+/* ------------------------ extensions for CBEA ----------------------------- */
+
+/* vec_extract */
+
+static __inline__ signed char __ATTRS_o_ai vec_extract(vector signed char __a,
+                                                       int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned char __ATTRS_o_ai
+vec_extract(vector unsigned char __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
+                                                         int __b) {
+  return __a[__b];
+}
+
+static __inline__ signed short __ATTRS_o_ai vec_extract(vector signed short __a,
+                                                        int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned short __ATTRS_o_ai
+vec_extract(vector unsigned short __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
+                                                          int __b) {
+  return __a[__b];
+}
+
+static __inline__ signed int __ATTRS_o_ai vec_extract(vector signed int __a,
+                                                      int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector unsigned int __a,
+                                                        int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector bool int __a,
+                                                        int __b) {
+  return __a[__b];
+}
+
+#ifdef __VSX__
+static __inline__ signed long long __ATTRS_o_ai
+vec_extract(vector signed long long __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned long long __ATTRS_o_ai
+vec_extract(vector unsigned long long __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ unsigned long long __ATTRS_o_ai
+vec_extract(vector bool long long __a, int __b) {
+  return __a[__b];
+}
+
+static __inline__ double __ATTRS_o_ai vec_extract(vector double __a, int __b) {
+  return __a[__b];
+}
+#endif
+
+static __inline__ float __ATTRS_o_ai vec_extract(vector float __a, int __b) {
+  return __a[__b];
+}
+
+#ifdef __POWER9_VECTOR__
+
+#define vec_insert4b __builtin_vsx_insertword
+#define vec_extract4b __builtin_vsx_extractuword
+
+/* vec_extract_exp */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_exp(vector float __a) {
+  return __builtin_vsx_xvxexpsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_exp(vector double __a) {
+  return __builtin_vsx_xvxexpdp(__a);
+}
+
+/* vec_extract_sig */
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_extract_sig(vector float __a) {
+  return __builtin_vsx_xvxsigsp(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_extract_sig (vector double __a) {
+  return __builtin_vsx_xvxsigdp(__a);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shorth(vector unsigned short __a) {
+  vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+            __builtin_shufflevector(__a, __a, 0, -1, 1, -1, 2, -1, 3, -1);
+#else
+            __builtin_shufflevector(__a, __a, -1, 0, -1, 1, -1, 2, -1, 3);
+#endif
+  return __builtin_vsx_xvcvhpsp(__b);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_extract_fp32_from_shortl(vector unsigned short __a) {
+  vector unsigned short __b =
+#ifdef __LITTLE_ENDIAN__
+            __builtin_shufflevector(__a, __a, 4, -1, 5, -1, 6, -1, 7, -1);
+#else
+            __builtin_shufflevector(__a, __a, -1, 4, -1, 5, -1, 6, -1, 7);
+#endif
+  return __builtin_vsx_xvcvhpsp(__b);
+}
+#endif /* __POWER9_VECTOR__ */
+
+/* vec_insert */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_insert(signed char __a, vector signed char __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_insert(unsigned char __a, vector unsigned char __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
+                                                           vector bool char __b,
+                                                           int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_insert(signed short __a, vector signed short __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_insert(unsigned short __a, vector unsigned short __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_insert(unsigned short __a, vector bool short __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_insert(signed int __a, vector signed int __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_insert(unsigned int __a, vector unsigned int __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
+                                                          vector bool int __b,
+                                                          int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_insert(signed long long __a, vector signed long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector unsigned long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_insert(unsigned long long __a, vector bool long long __b, int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+static __inline__ vector double __ATTRS_o_ai vec_insert(double __a,
+                                                        vector double __b,
+                                                        int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_insert(float __a,
+                                                       vector float __b,
+                                                       int __c) {
+  __b[__c] = __a;
+  return __b;
+}
+
+/* vec_lvlx */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const signed char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlx(int __a, const vector signed char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool char *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const vector short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool short *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const vector pixel *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvlx(int __a, const int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvlx(int __a,
+                                                   const vector int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const unsigned int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlx(int __a, const vector unsigned int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvlx(int __a, const vector bool int *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const float *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvlx(int __a,
+                                                     const vector float *__b) {
+  return vec_perm(vec_ld(__a, __b), (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvlxl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const signed char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector signed char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector signed char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool char *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool char)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector short)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const vector short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool short *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool short)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const vector pixel *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector pixel)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvlxl(int __a, const int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector int)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvlxl(int __a,
+                                                    const vector int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const unsigned int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector unsigned int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector unsigned int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvlxl(int __a, const vector bool int *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector bool int)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      const float *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector float)(0), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvlxl(int __a,
+                                                      vector float *__b) {
+  return vec_perm(vec_ldl(__a, __b), (vector float)(0),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrx */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrx(int __a, const vector signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool char *__b) {
+  return vec_perm((vector bool char)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const short *__b) {
+  return vec_perm((vector short)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const vector short *__b) {
+  return vec_perm((vector short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool short *__b) {
+  return vec_perm((vector bool short)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const vector pixel *__b) {
+  return vec_perm((vector pixel)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvrx(int __a, const int *__b) {
+  return vec_perm((vector int)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvrx(int __a,
+                                                   const vector int *__b) {
+  return vec_perm((vector int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrx(int __a, const vector unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvrx(int __a, const vector bool int *__b) {
+  return vec_perm((vector bool int)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const float *__b) {
+  return vec_perm((vector float)(0), vec_ld(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvrx(int __a,
+                                                     const vector float *__b) {
+  return vec_perm((vector float)(0), vec_ld(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_lvrxl */
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector signed char *__b) {
+  return vec_perm((vector signed char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned char *__b) {
+  return vec_perm((vector unsigned char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool char *__b) {
+  return vec_perm((vector bool char)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const short *__b) {
+  return vec_perm((vector short)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const vector short *__b) {
+  return vec_perm((vector short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned short *__b) {
+  return vec_perm((vector unsigned short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool short *__b) {
+  return vec_perm((vector bool short)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector pixel __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const vector pixel *__b) {
+  return vec_perm((vector pixel)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvrxl(int __a, const int *__b) {
+  return vec_perm((vector int)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_lvrxl(int __a,
+                                                    const vector int *__b) {
+  return vec_perm((vector int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, __b));
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector unsigned int *__b) {
+  return vec_perm((vector unsigned int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_lvrxl(int __a, const vector bool int *__b) {
+  return vec_perm((vector bool int)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const float *__b) {
+  return vec_perm((vector float)(0), vec_ldl(__a, __b), vec_lvsl(__a, __b));
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_lvrxl(int __a,
+                                                      const vector float *__b) {
+  return vec_perm((vector float)(0), vec_ldl(__a, __b),
+                  vec_lvsl(__a, (unsigned char *)__b));
+}
+
+/* vec_stvlx */
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                              signed char *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector signed char __a, int __b,
+                                              vector signed char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                              unsigned char *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned char __a, int __b,
+                                              vector unsigned char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector bool char __a, int __b,
+                                              vector bool char *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector short __a, int __b,
+                                              short *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector short __a, int __b,
+                                              vector short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned short __a,
+                                              int __b, unsigned short *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned short __a,
+                                              int __b,
+                                              vector unsigned short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector bool short __a, int __b,
+                                              vector bool short *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector pixel __a, int __b,
+                                              vector pixel *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector int __a, int __b,
+                                              int *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector int __a, int __b,
+                                              vector int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                              unsigned int *__c) {
+  return vec_st(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector unsigned int __a, int __b,
+                                              vector unsigned int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector bool int __a, int __b,
+                                              vector bool int *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlx(vector float __a, int __b,
+                                              vector float *__c) {
+  return vec_st(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvlxl */
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                               signed char *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector signed char __a, int __b,
+                                               vector signed char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned char __a,
+                                               int __b,
+                                               vector unsigned char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector bool char __a, int __b,
+                                               vector bool char *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b,
+                                               short *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector short __a, int __b,
+                                               vector short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned short __a,
+                                               int __b,
+                                               vector unsigned short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector bool short __a, int __b,
+                                               vector bool short *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector pixel __a, int __b,
+                                               vector pixel *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b,
+                                               int *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector int __a, int __b,
+                                               vector int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
+  return vec_stl(vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector unsigned int __a, int __b,
+                                               vector unsigned int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector bool int __a, int __b,
+                                               vector bool int *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvlxl(vector float __a, int __b,
+                                               vector float *__c) {
+  return vec_stl(
+      vec_perm(vec_lvrx(__b, __c), __a, vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvrx */
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                              signed char *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector signed char __a, int __b,
+                                              vector signed char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                              unsigned char *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned char __a, int __b,
+                                              vector unsigned char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector bool char __a, int __b,
+                                              vector bool char *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector short __a, int __b,
+                                              short *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector short __a, int __b,
+                                              vector short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned short __a,
+                                              int __b, unsigned short *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned short __a,
+                                              int __b,
+                                              vector unsigned short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector bool short __a, int __b,
+                                              vector bool short *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector pixel __a, int __b,
+                                              vector pixel *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector int __a, int __b,
+                                              int *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector int __a, int __b,
+                                              vector int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                              unsigned int *__c) {
+  return vec_st(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector unsigned int __a, int __b,
+                                              vector unsigned int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector bool int __a, int __b,
+                                              vector bool int *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrx(vector float __a, int __b,
+                                              vector float *__c) {
+  return vec_st(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_stvrxl */
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                               signed char *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector signed char __a, int __b,
+                                               vector signed char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a,
+                                               int __b, unsigned char *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned char __a,
+                                               int __b,
+                                               vector unsigned char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector bool char __a, int __b,
+                                               vector bool char *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b,
+                                               short *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector short __a, int __b,
+                                               vector short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a,
+                                               int __b, unsigned short *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned short __a,
+                                               int __b,
+                                               vector unsigned short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector bool short __a, int __b,
+                                               vector bool short *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector pixel __a, int __b,
+                                               vector pixel *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b,
+                                               int *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector int __a, int __b,
+                                               vector int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                               unsigned int *__c) {
+  return vec_stl(vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, __c)), __b,
+                 __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector unsigned int __a, int __b,
+                                               vector unsigned int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector bool int __a, int __b,
+                                               vector bool int *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+static __inline__ void __ATTRS_o_ai vec_stvrxl(vector float __a, int __b,
+                                               vector float *__c) {
+  return vec_stl(
+      vec_perm(__a, vec_lvlx(__b, __c), vec_lvsr(__b, (unsigned char *)__c)),
+      __b, __c);
+}
+
+/* vec_promote */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_promote(signed char __a,
+                                                              int __b) {
+  vector signed char __res = (vector signed char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_promote(unsigned char __a, int __b) {
+  vector unsigned char __res = (vector unsigned char)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_promote(short __a, int __b) {
+  vector short __res = (vector short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_promote(unsigned short __a, int __b) {
+  vector unsigned short __res = (vector unsigned short)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_promote(int __a, int __b) {
+  vector int __res = (vector int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai vec_promote(unsigned int __a,
+                                                               int __b) {
+  vector unsigned int __res = (vector unsigned int)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+static __inline__ vector float __ATTRS_o_ai vec_promote(float __a, int __b) {
+  vector float __res = (vector float)(0);
+  __res[__b] = __a;
+  return __res;
+}
+
+/* vec_splats */
+
+static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a) {
+  return (vector signed char)(__a);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_splats(unsigned char __a) {
+  return (vector unsigned char)(__a);
+}
+
+static __inline__ vector short __ATTRS_o_ai vec_splats(short __a) {
+  return (vector short)(__a);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_splats(unsigned short __a) {
+  return (vector unsigned short)(__a);
+}
+
+static __inline__ vector int __ATTRS_o_ai vec_splats(int __a) {
+  return (vector int)(__a);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_splats(unsigned int __a) {
+  return (vector unsigned int)(__a);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_splats(signed long long __a) {
+  return (vector signed long long)(__a);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_splats(unsigned long long __a) {
+  return (vector unsigned long long)(__a);
+}
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_splats(signed __int128 __a) {
+  return (vector signed __int128)(__a);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_splats(unsigned __int128 __a) {
+  return (vector unsigned __int128)(__a);
+}
+
+#endif
+
+static __inline__ vector double __ATTRS_o_ai vec_splats(double __a) {
+  return (vector double)(__a);
+}
+#endif
+
+static __inline__ vector float __ATTRS_o_ai vec_splats(float __a) {
+  return (vector float)(__a);
+}
+
+/* ----------------------------- predicates --------------------------------- */
+
+/* vec_all_eq */
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector pixel __a,
+                                              vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                              vector long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
+                                      (vector long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT, __a, __b);
+}
+#endif
+
+/* vec_all_ge */
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, (vector signed char)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __b, (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, (vector short)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, (vector int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __b, (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __b, __a);
+}
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, (vector signed long long)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_ge(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __a, __b);
+}
+#endif
+
+/* vec_all_gt */
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __a, (vector signed char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __a, (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __a, (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a, __b);
+}
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_gt(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __a, __b);
+}
+#endif
+
+/* vec_all_in */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_all_in(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ, __a, __b);
+}
+
+/* vec_all_le */
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ, __a, (vector signed char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, __a, (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, __a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, __a, (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ, (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_le(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_LT, __b, __a);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_le(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_LT, __b, __a);
+}
+#endif
+
+/* vec_all_lt */
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT, (vector signed char)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, __b, (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT, (vector short)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, __b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT, (vector int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, __b, (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT, (vector signed long long)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT, (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_LT, __b, __a);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_lt(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_LT, __b, __a);
+}
+#endif
+
+/* vec_all_nan */
+
+static __inline__ int __ATTRS_o_ai vec_all_nan(vector float __a) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __a);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_nan(vector double __a) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __a);
+}
+#endif
+
+/* vec_all_ne */
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector pixel __a,
+                                              vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ, (vector signed long long)__a,
+                                      (vector signed long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_nge */
+
+static __inline__ int __ATTRS_o_ai vec_all_nge(vector float __a,
+                                               vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_nge(vector double __a,
+                                               vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_ngt */
+
+static __inline__ int __ATTRS_o_ai vec_all_ngt(vector float __a,
+                                               vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_all_ngt(vector double __a,
+                                               vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ, __a, __b);
+}
+#endif
+
+/* vec_all_nle */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_all_nle(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_nlt */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_all_nlt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ, __b, __a);
+}
+
+/* vec_all_numeric */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_all_numeric(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT, __a, __a);
+}
+
+/* vec_any_eq */
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_EQ_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector pixel __a,
+                                              vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_EQ_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_EQ_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_EQ_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_ge */
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, (vector signed char)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, (vector short)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, (vector int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_ge(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_gt */
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __a,
+                                      (vector signed char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_gt(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __a, __b);
+}
+#endif
+
+/* vec_any_le */
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_LT_REV, __a,
+                                      (vector signed char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, __a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_LT_REV, (vector unsigned char)__a,
+                                      (vector unsigned char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, __a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_LT_REV, (vector unsigned short)__a,
+                                      (vector unsigned short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, __a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_LT_REV, (vector unsigned int)__a,
+                                      (vector unsigned int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV, __a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_LT_REV,
+                                      (vector unsigned long long)__a,
+                                      (vector unsigned long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_le(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgesp_p(__CR6_EQ_REV, __b, __a);
+#else
+  return __builtin_altivec_vcmpgefp_p(__CR6_EQ_REV, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_le(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgedp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
+/* vec_any_lt */
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtsb_p(__CR6_EQ_REV, (vector signed char)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpgtub_p(__CR6_EQ_REV, (vector unsigned char)__b,
+                                      (vector unsigned char)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtsh_p(__CR6_EQ_REV, (vector short)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpgtuh_p(__CR6_EQ_REV, (vector unsigned short)__b,
+                                      (vector unsigned short)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtsw_p(__CR6_EQ_REV, (vector int)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned int)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpgtuw_p(__CR6_EQ_REV, (vector unsigned int)__b,
+                                      (vector unsigned int)__a);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtsd_p(__CR6_EQ_REV,
+                                      (vector signed long long)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b, __a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV, __b,
+                                      (vector unsigned long long)__a);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpgtud_p(__CR6_EQ_REV,
+                                      (vector unsigned long long)__b,
+                                      (vector unsigned long long)__a);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpgtsp_p(__CR6_EQ_REV, __b, __a);
+#else
+  return __builtin_altivec_vcmpgtfp_p(__CR6_EQ_REV, __b, __a);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_lt(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpgtdp_p(__CR6_EQ_REV, __b, __a);
+}
+#endif
+
+/* vec_any_nan */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_nan(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __a);
+}
+
+/* vec_any_ne */
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                              vector signed char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                              vector unsigned char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool char __a,
+                                              vector bool char __b) {
+  return __builtin_altivec_vcmpequb_p(__CR6_LT_REV, (vector char)__a,
+                                      (vector char)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, __a, (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                              vector short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                              vector unsigned short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool short __a,
+                                              vector bool short __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector pixel __a,
+                                              vector pixel __b) {
+  return __builtin_altivec_vcmpequh_p(__CR6_LT_REV, (vector short)__a,
+                                      (vector short)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector int __a, vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, __a, (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                              vector int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                              vector unsigned int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
+                                              vector bool int __b) {
+  return __builtin_altivec_vcmpequw_p(__CR6_LT_REV, (vector int)__a,
+                                      (vector int)__b);
+}
+
+#ifdef __POWER8_VECTOR__
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a,
+                                      (vector long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
+                                      (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                              vector signed long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                              vector unsigned long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
+                                              vector bool long long __b) {
+  return __builtin_altivec_vcmpequd_p(
+      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+}
+#endif
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector float __a,
+                                              vector float __b) {
+#ifdef __VSX__
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_LT_REV, __a, __b);
+#else
+  return __builtin_altivec_vcmpeqfp_p(__CR6_LT_REV, __a, __b);
+#endif
+}
+
+#ifdef __VSX__
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector double __a,
+                                              vector double __b) {
+  return __builtin_vsx_xvcmpeqdp_p(__CR6_LT_REV, __a, __b);
+}
+#endif
+
+/* vec_any_nge */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_nge(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_ngt */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_ngt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __a, __b);
+}
+
+/* vec_any_nle */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_nle(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgefp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_nlt */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_nlt(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpgtfp_p(__CR6_LT_REV, __b, __a);
+}
+
+/* vec_any_numeric */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_numeric(vector float __a) {
+  return __builtin_altivec_vcmpeqfp_p(__CR6_EQ_REV, __a, __a);
+}
+
+/* vec_any_out */
+
+static __inline__ int __attribute__((__always_inline__))
+vec_any_out(vector float __a, vector float __b) {
+  return __builtin_altivec_vcmpbfp_p(__CR6_EQ_REV, __a, __b);
+}
+
+/* Power 8 Crypto functions
+Note: We diverge from the current GCC implementation with regard
+to cryptography and related functions as follows:
+- Only the SHA and AES instructions and builtins are disabled by -mno-crypto
+- The remaining ones are only available on Power8 and up so
+  require -mpower8-vector
+The justification for this is that export requirements require that
+Category:Vector.Crypto is optional (i.e. compliant hardware may not provide
+support). As a result, we need to be able to turn off support for those.
+The remaining ones (currently controlled by -mcrypto for GCC) still
+need to be provided on compliant hardware even if Vector.Crypto is not
+provided.
+*/
+#ifdef __CRYPTO__
+#define vec_sbox_be __builtin_altivec_crypto_vsbox
+#define vec_cipher_be __builtin_altivec_crypto_vcipher
+#define vec_cipherlast_be __builtin_altivec_crypto_vcipherlast
+#define vec_ncipher_be __builtin_altivec_crypto_vncipher
+#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vsbox(vector unsigned long long __a) {
+  return __builtin_altivec_crypto_vsbox(__a);
+}
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipher(vector unsigned long long __a,
+                         vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vcipher(__a, __b);
+}
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vcipherlast(vector unsigned long long __a,
+                             vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vcipherlast(__a, __b);
+}
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipher(vector unsigned long long __a,
+                          vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vncipher(__a, __b);
+}
+
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+__builtin_crypto_vncipherlast(vector unsigned long long __a,
+                              vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vncipherlast(__a, __b);
+}
+
+#define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
+#define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
+
+#define vec_shasigma_be(X, Y, Z)                                               \
+  _Generic((X), vector unsigned int                                            \
+           : __builtin_crypto_vshasigmaw, vector unsigned long long            \
+           : __builtin_crypto_vshasigmad)((X), (Y), (Z))
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool char __ATTRS_o_ai
+vec_permxor(vector bool char __a, vector bool char __b,
+            vector bool char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_permxor(vector signed char __a, vector signed char __b,
+            vector signed char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_permxor(vector unsigned char __a, vector unsigned char __b,
+            vector unsigned char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned char __a, vector unsigned char __b,
+                          vector unsigned char __c) {
+  return __builtin_altivec_crypto_vpermxor(__a, __b, __c);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned short __a, vector unsigned short __b,
+                          vector unsigned short __c) {
+  return (vector unsigned short)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai __builtin_crypto_vpermxor(
+    vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) {
+  return (vector unsigned int)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpermxor(vector unsigned long long __a,
+                          vector unsigned long long __b,
+                          vector unsigned long long __c) {
+  return (vector unsigned long long)__builtin_altivec_crypto_vpermxor(
+      (vector unsigned char)__a, (vector unsigned char)__b,
+      (vector unsigned char)__c);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_crypto_vpmsumb(__a, __b);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_altivec_crypto_vpmsumh(__a, __b);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_altivec_crypto_vpmsumw(__a, __b);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+__builtin_crypto_vpmsumb(vector unsigned long long __a,
+                         vector unsigned long long __b) {
+  return __builtin_altivec_crypto_vpmsumd(__a, __b);
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_vgbbd(vector signed char __a) {
+  return __builtin_altivec_vgbbd((vector unsigned char)__a);
+}
+
+#define vec_pmsum_be __builtin_crypto_vpmsumb
+#define vec_gb __builtin_altivec_vgbbd
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_vgbbd(vector unsigned char __a) {
+  return __builtin_altivec_vgbbd(__a);
+}
+
+static __inline__ vector long long __ATTRS_o_ai
+vec_vbpermq(vector signed char __a, vector signed char __b) {
+  return __builtin_altivec_vbpermq((vector unsigned char)__a,
+                                   (vector unsigned char)__b);
+}
+
+static __inline__ vector long long __ATTRS_o_ai
+vec_vbpermq(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermq(__a, __b);
+}
+
+#ifdef __powerpc64__
+static __inline__ vector unsigned long long __attribute__((__always_inline__))
+vec_bperm(vector unsigned __int128 __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermq((vector unsigned char)__a,
+                                   (vector unsigned char)__b);
+}
+#endif
+#endif
+
+
+/* vec_reve */
+
+static inline __ATTRS_o_ai vector bool char vec_reve(vector bool char __a) {
+  return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                                 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed char vec_reve(vector signed char __a) {
+  return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                                 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_reve(vector unsigned char __a) {
+  return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                                 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool int vec_reve(vector bool int __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int vec_reve(vector signed int __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_reve(vector unsigned int __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector bool short vec_reve(vector bool short __a) {
+  return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_reve(vector signed short __a) {
+  return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_reve(vector unsigned short __a) {
+  return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector float vec_reve(vector float __a) {
+  return __builtin_shufflevector(__a, __a, 3, 2, 1, 0);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector bool long long
+vec_reve(vector bool long long __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_reve(vector signed long long __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_reve(vector unsigned long long __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+
+static inline __ATTRS_o_ai vector double vec_reve(vector double __a) {
+  return __builtin_shufflevector(__a, __a, 1, 0);
+}
+#endif
+
+/* vec_revb */
+static __inline__ vector bool char __ATTRS_o_ai
+vec_revb(vector bool char __a) {
+  return __a;
+}
+
+static __inline__ vector signed char __ATTRS_o_ai
+vec_revb(vector signed char __a) {
+  return __a;
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_revb(vector unsigned char __a) {
+  return __a;
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_revb(vector bool short __a) {
+  vector unsigned char __indices =
+      { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed short __ATTRS_o_ai
+vec_revb(vector signed short __a) {
+  vector unsigned char __indices =
+      { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_revb(vector unsigned short __a) {
+  vector unsigned char __indices =
+     { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_revb(vector bool int __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_revb(vector signed int __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_revb(vector unsigned int __a) {
+  vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_revb(vector float __a) {
+ vector unsigned char __indices =
+      { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return vec_perm(__a, __a, __indices);
+}
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_revb(vector bool long long __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_revb(vector signed long long __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_revb(vector unsigned long long __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_revb(vector double __a) {
+  vector unsigned char __indices =
+      { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+  return vec_perm(__a, __a, __indices);
+}
+#endif /* End __VSX__ */
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_revb(vector signed __int128 __a) {
+  vector unsigned char __indices =
+      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  return (vector signed __int128)vec_perm((vector signed int)__a,
+                                          (vector signed int)__a,
+                                           __indices);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_revb(vector unsigned __int128 __a) {
+  vector unsigned char __indices =
+      { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+  return (vector unsigned __int128)vec_perm((vector signed int)__a,
+                                            (vector signed int)__a,
+                                             __indices);
+}
+#endif /* END __POWER8_VECTOR__ && __powerpc64__ */
+
+/* vec_xl */
+
+static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset,
+                                                     signed char *__ptr) {
+  return *(vector signed char *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xl(signed long long __offset, unsigned char *__ptr) {
+  return *(vector unsigned char *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset,
+                                                      signed short *__ptr) {
+  return *(vector signed short *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xl(signed long long __offset, unsigned short *__ptr) {
+  return *(vector unsigned short *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset,
+                                                    signed int *__ptr) {
+  return *(vector signed int *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset,
+                                                      unsigned int *__ptr) {
+  return *(vector unsigned int *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset,
+                                               float *__ptr) {
+  return *(vector float *)(__ptr + __offset);
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai vector signed long long
+vec_xl(signed long long __offset, signed long long *__ptr) {
+  return *(vector signed long long *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xl(signed long long __offset, unsigned long long *__ptr) {
+  return *(vector unsigned long long *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset,
+                                                double *__ptr) {
+  return *(vector double *)(__ptr + __offset);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai vector signed __int128
+vec_xl(signed long long __offset, signed __int128 *__ptr) {
+  return *(vector signed __int128 *)(__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned __int128
+vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
+  return *(vector unsigned __int128 *)(__ptr + __offset);
+}
+#endif
+
+/* vec_xl_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ vector signed char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed char *__ptr) {
+  vector signed char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                                 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned char *__ptr) {
+  vector unsigned char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                                 13, 12, 11, 10, 9, 8);
+}
+
+static __inline__ vector signed short  __ATTRS_o_ai
+vec_xl_be(signed long long __offset, signed short *__ptr) {
+  vector signed short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector unsigned short __ATTRS_o_ai
+vec_xl_be(signed long long __offset, unsigned short *__ptr) {
+  vector unsigned short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+}
+
+static __inline__ vector signed int __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, signed int *__ptr) {
+  return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned int __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, unsigned int *__ptr) {
+  return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+static __inline__ vector float __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, float *__ptr) {
+  return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ vector signed long long __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, signed long long *__ptr) {
+  return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, unsigned long long *__ptr) {
+  return (vector unsigned long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+
+static __inline__ vector double __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, double *__ptr) {
+  return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, signed __int128 *__ptr) {
+  return vec_xl(__offset, __ptr);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_be(signed long long  __offset, unsigned __int128 *__ptr) {
+  return vec_xl(__offset, __ptr);
+}
+#endif
+#else
+  #define vec_xl_be vec_xl
+#endif
+
+/* vec_xst */
+
+static inline __ATTRS_o_ai void vec_xst(vector signed char __vec,
+                                        signed long long __offset,
+                                        signed char *__ptr) {
+  *(vector signed char *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned char __vec,
+                                        signed long long __offset,
+                                        unsigned char *__ptr) {
+  *(vector unsigned char *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector signed short __vec,
+                                        signed long long __offset,
+                                        signed short *__ptr) {
+  *(vector signed short *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned short __vec,
+                                        signed long long __offset,
+                                        unsigned short *__ptr) {
+  *(vector unsigned short *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector signed int __vec,
+                                        signed long long __offset,
+                                        signed int *__ptr) {
+  *(vector signed int *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned int __vec,
+                                        signed long long __offset,
+                                        unsigned int *__ptr) {
+  *(vector unsigned int *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector float __vec,
+                                        signed long long __offset,
+                                        float *__ptr) {
+  *(vector float *)(__ptr + __offset) = __vec;
+}
+
+#ifdef __VSX__
+static inline __ATTRS_o_ai void vec_xst(vector signed long long __vec,
+                                        signed long long __offset,
+                                        signed long long *__ptr) {
+  *(vector signed long long *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned long long __vec,
+                                        signed long long __offset,
+                                        unsigned long long *__ptr) {
+  *(vector unsigned long long *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector double __vec,
+                                        signed long long __offset,
+                                        double *__ptr) {
+  *(vector double *)(__ptr + __offset) = __vec;
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static inline __ATTRS_o_ai void vec_xst(vector signed __int128 __vec,
+                                        signed long long __offset,
+                                        signed __int128 *__ptr) {
+  *(vector signed __int128 *)(__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec,
+                                        signed long long __offset,
+                                        unsigned __int128 *__ptr) {
+  *(vector unsigned __int128 *)(__ptr + __offset) = __vec;
+}
+#endif
+
+/* vec_xst_be */
+
+#ifdef __LITTLE_ENDIAN__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed char __vec,
+                                               signed long long  __offset,
+                                               signed char *__ptr) {
+  vector signed char __tmp =
+     __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                             13, 12, 11, 10, 9, 8);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec,
+                                               signed long long  __offset,
+                                               unsigned char *__ptr) {
+  vector unsigned char __tmp =
+     __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
+                             13, 12, 11, 10, 9, 8);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec,
+                                               signed long long  __offset,
+                                               signed short *__ptr) {
+  vector signed short __tmp =
+     __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec,
+                                               signed long long  __offset,
+                                               unsigned short *__ptr) {
+  vector unsigned short __tmp =
+     __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
+  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec,
+                                               signed long long  __offset,
+                                               signed int *__ptr) {
+  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned int __vec,
+                                               signed long long  __offset,
+                                               unsigned int *__ptr) {
+  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector float __vec,
+                                               signed long long  __offset,
+                                               float *__ptr) {
+  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+}
+
+#ifdef __VSX__
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed long long __vec,
+                                               signed long long  __offset,
+                                               signed long long *__ptr) {
+  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned long long __vec,
+                                               signed long long  __offset,
+                                               unsigned long long *__ptr) {
+  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector double __vec,
+                                               signed long long  __offset,
+                                               double *__ptr) {
+  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+}
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed __int128 __vec,
+                                               signed long long  __offset,
+                                               signed __int128 *__ptr) {
+  vec_xst(__vec, __offset, __ptr);
+}
+
+static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned __int128 __vec,
+                                               signed long long  __offset,
+                                               unsigned __int128 *__ptr) {
+  vec_xst(__vec, __offset, __ptr);
+}
+#endif
+#else
+  #define vec_xst_be vec_xst
+#endif
+
+#ifdef __POWER9_VECTOR__
+#define vec_test_data_class(__a, __b)                                      \
+        _Generic((__a),                                                    \
+           vector float:                                                   \
+             (vector bool int)__builtin_vsx_xvtstdcsp((__a), (__b)),       \
+           vector double:                                                  \
+             (vector bool long long)__builtin_vsx_xvtstdcdp((__a), (__b))  \
+        )
+
+#endif /* #ifdef __POWER9_VECTOR__ */
+
+static vector float __ATTRS_o_ai vec_neg(vector float __a) {
+  return -__a;
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_neg(vector double __a) {
+  return -__a;
+}
+
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector long long __ATTRS_o_ai vec_neg(vector long long __a) {
+  return -__a;
+}
+#endif
+
+static vector signed int __ATTRS_o_ai vec_neg(vector signed int __a) {
+  return -__a;
+}
+
+static vector signed short __ATTRS_o_ai vec_neg(vector signed short __a) {
+  return -__a;
+}
+
+static vector signed char __ATTRS_o_ai vec_neg(vector signed char __a) {
+  return -__a;
+}
+
+static vector float __ATTRS_o_ai vec_nabs(vector float __a) {
+  return - vec_abs(__a);
+}
+
+#ifdef __VSX__
+static vector double __ATTRS_o_ai vec_nabs(vector double __a) {
+  return - vec_abs(__a);
+}
+
+#endif
+
+#if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
+static vector long long __ATTRS_o_ai vec_nabs(vector long long __a) {
+  return __builtin_altivec_vminsd(__a, -__a);
+}
+#endif
+
+static vector signed int __ATTRS_o_ai vec_nabs(vector signed int __a) {
+  return __builtin_altivec_vminsw(__a, -__a);
+}
+
+static vector signed short __ATTRS_o_ai vec_nabs(vector signed short __a) {
+  return __builtin_altivec_vminsh(__a, -__a);
+}
+
+static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) {
+  return __builtin_altivec_vminsb(__a, -__a);
+}
+#undef __ATTRS_o_ai
+
+#endif /* __ALTIVEC_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/ammintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/ammintrin.h
new file mode 100644
index 000000000..2843a7a26
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/ammintrin.h
@@ -0,0 +1,193 @@
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param x
+///    The value from which bits are extracted.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter \a x are extracted. If the length is zero but the
+///    index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
+///    extracted from the source operand.
+#define _mm_extracti_si64(x, len, idx) \
+  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+                                  (char)(len), (char)(idx)))
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index and of the length specified by
+///    \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param __x
+///    The value from which bits are extracted.
+/// \param __y
+///    Specifies the index of the least significant bit at [13:8] and the
+///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
+///    length is interpreted as 64. If the sum of the index and length is
+///    greater than 64, the result is undefined. If the length and index are
+///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
+///    from the source operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    \a y into the lower 64 bits of the destination integer vector \a x at
+///    the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
+/// const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length \a len and by the index \a idx specifying the
+///    least significant bit.
+/// \param y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand \a y of length \a len.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter \a y are inserted into parameter \a x. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand \a x with the specified bitfields replaced by the
+///    lower bits of source operand \a y. The upper 64 bits of the return value
+///    are undefined.
+#define _mm_inserti_si64(x, y, len, idx) \
+  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+                                    (__v2di)(__m128i)(y), \
+                                    (char)(len), (char)(idx)))
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    \a __y into the lower 64 bits of the destination integer vector \a __x
+///    at the index and of the length specified by \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param __x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length and by the index of the least significant bit
+///    specified by operand \a __y.
+/// \param __y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand \a __y with length
+///    specified by bits [69:64]. These are inserted into the destination at the
+///    index specified by bits [77:72]; all other bits are ignored. If bits
+///    [69:64] are zero, the length is interpreted as 64. If the sum of the
+///    index and length is greater than 64, the result is undefined. If the
+///    length and index are both zero, bits [63:0] of parameter \a __y are
+///    inserted into parameter \a __x. If the length is zero but the index is
+///    non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand \a __x with the specified bitfields replaced by the
+///    lower bits of source operand \a __y. The upper 64 bits of the return
+///    value are undefined.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
+
+/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
+///
+/// \param __p
+///    The 64-bit memory location used to store the register value.
+/// \param __a
+///    The 64-bit double-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_sd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
+///    memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
+///
+/// \param __p
+///    The 32-bit memory location used to store the register value.
+/// \param __a
+///    The 32-bit single-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ss(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AMMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/arm_acle.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/arm_acle.h
new file mode 100644
index 000000000..8423e62a3
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/arm_acle.h
@@ -0,0 +1,312 @@
+/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_ACLE_H
+#define __ARM_ACLE_H
+
+#ifndef __ARM_ACLE
+#error "ACLE intrinsics support not enabled."
+#endif
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
+/* 8.3 Memory barriers */
+#if !defined(_MSC_VER)
+#define __dmb(i) __builtin_arm_dmb(i)
+#define __dsb(i) __builtin_arm_dsb(i)
+#define __isb(i) __builtin_arm_isb(i)
+#endif
+
+/* 8.4 Hints */
+
+#if !defined(_MSC_VER)
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
+  __builtin_arm_wfi();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
+  __builtin_arm_wfe();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
+  __builtin_arm_sev();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
+  __builtin_arm_sevl();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
+  __builtin_arm_yield();
+}
+#endif
+
+#if __ARM_32BIT_STATE
+#define __dbg(t) __builtin_arm_dbg(t)
+#endif
+
+/* 8.5 Swap */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__swp(uint32_t __x, volatile uint32_t *__p) {
+  uint32_t v;
+  do
+    v = __builtin_arm_ldrex(__p);
+  while (__builtin_arm_strex(__x, __p));
+  return v;
+}
+
+/* 8.6 Memory prefetch intrinsics */
+/* 8.6.1 Data prefetch */
+#define __pld(addr) __pldx(0, 0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, 1)
+#else
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
+#endif
+
+/* 8.6.2 Instruction prefetch */
+#define __pli(addr) __plix(0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, 0)
+#else
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
+#endif
+
+/* 8.7 NOP */
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
+  __builtin_arm_nop();
+}
+
+/* 9 DATA-PROCESSING INTRINSICS */
+/* 9.2 Miscellaneous data-processing intrinsics */
+/* ROR */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__ror(uint32_t __x, uint32_t __y) {
+  __y %= 32;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (32 - __y));
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rorll(uint64_t __x, uint32_t __y) {
+  __y %= 64;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (64 - __y));
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rorl(unsigned long __x, uint32_t __y) {
+#if __SIZEOF_LONG__ == 4
+  return __ror(__x, __y);
+#else
+  return __rorll(__x, __y);
+#endif
+}
+
+
+/* CLZ */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clz(uint32_t __t) {
+  return __builtin_clz(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__clzl(unsigned long __t) {
+  return __builtin_clzl(__t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__clzll(uint64_t __t) {
+  return __builtin_clzll(__t);
+}
+
+/* REV */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rev(uint32_t __t) {
+  return __builtin_bswap32(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__revl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_bswap32(__t);
+#else
+  return __builtin_bswap64(__t);
+#endif
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__revll(uint64_t __t) {
+  return __builtin_bswap64(__t);
+}
+
+/* REV16 */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rev16(uint32_t __t) {
+  return __ror(__rev(__t), 16);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rev16ll(uint64_t __t) {
+  return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rev16l(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+    return __rev16(__t);
+#else
+    return __rev16ll(__t);
+#endif
+}
+
+/* REVSH */
+static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
+__revsh(int16_t __t) {
+  return __builtin_bswap16(__t);
+}
+
+/* RBIT */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rbit(uint32_t __t) {
+  return __builtin_arm_rbit(__t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rbitll(uint64_t __t) {
+#if __ARM_32BIT_STATE
+  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
+         __builtin_arm_rbit(__t >> 32);
+#else
+  return __builtin_arm_rbit64(__t);
+#endif
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rbitl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __rbit(__t);
+#else
+  return __rbitll(__t);
+#endif
+}
+
+/*
+ * 9.4 Saturating intrinsics
+ *
+ * FIXME: Change guard to their corrosponding __ARM_FEATURE flag when Q flag
+ * intrinsics are implemented and the flag is enabled.
+ */
+/* 9.4.1 Width-specified saturation intrinsics */
+#if __ARM_32BIT_STATE
+#define __ssat(x, y) __builtin_arm_ssat(x, y)
+#define __usat(x, y) __builtin_arm_usat(x, y)
+#endif
+
+/* 9.4.2 Saturating addition and subtraction intrinsics */
+#if __ARM_32BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qadd(int32_t __t, int32_t __v) {
+  return __builtin_arm_qadd(__t, __v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qsub(int32_t __t, int32_t __v) {
+  return __builtin_arm_qsub(__t, __v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qdbl(int32_t __t) {
+  return __builtin_arm_qadd(__t, __t);
+}
+#endif
+
+/* 9.7 CRC32 intrinsics */
+#if __ARM_FEATURE_CRC32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32b(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32b(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32h(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32h(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32w(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32w(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32d(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32d(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cb(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32cb(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32ch(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32ch(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cw(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32cw(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cd(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32cd(__a, __b);
+}
+#endif
+
+/* 10.1 Special register intrinsics */
+#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
+#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
+#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
+#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
+#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
+#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __ARM_ACLE_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/arm_neon.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/arm_neon.h
new file mode 100644
index 000000000..f5ca59bb3
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/arm_neon.h
@@ -0,0 +1,69231 @@
+/*===---- arm_neon.h - ARM Neon intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_NEON_H
+#define __ARM_NEON_H
+
+#if !defined(__ARM_NEON)
+#error "NEON support not enabled"
+#endif
+
+#include <stdint.h>
+
+typedef float float32_t;
+typedef __fp16 float16_t;
+#ifdef __aarch64__
+typedef double float64_t;
+#endif
+
+#ifdef __aarch64__
+typedef uint8_t poly8_t;
+typedef uint16_t poly16_t;
+typedef uint64_t poly64_t;
+typedef __uint128_t poly128_t;
+#else
+typedef int8_t poly8_t;
+typedef int16_t poly16_t;
+#endif
+typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
+typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
+typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
+typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
+typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
+typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
+typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
+typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
+typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
+typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
+typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
+typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
+typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
+typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
+typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
+typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
+typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
+typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
+typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
+#ifdef __aarch64__
+typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
+typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
+#endif
+typedef __attribute__((neon_polyvector_type(8))) poly8_t poly8x8_t;
+typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
+typedef __attribute__((neon_polyvector_type(4))) poly16_t poly16x4_t;
+typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t;
+#ifdef __aarch64__
+typedef __attribute__((neon_polyvector_type(1))) poly64_t poly64x1_t;
+typedef __attribute__((neon_polyvector_type(2))) poly64_t poly64x2_t;
+#endif
+
+typedef struct int8x8x2_t {
+  int8x8_t val[2];
+} int8x8x2_t;
+
+typedef struct int8x16x2_t {
+  int8x16_t val[2];
+} int8x16x2_t;
+
+typedef struct int16x4x2_t {
+  int16x4_t val[2];
+} int16x4x2_t;
+
+typedef struct int16x8x2_t {
+  int16x8_t val[2];
+} int16x8x2_t;
+
+typedef struct int32x2x2_t {
+  int32x2_t val[2];
+} int32x2x2_t;
+
+typedef struct int32x4x2_t {
+  int32x4_t val[2];
+} int32x4x2_t;
+
+typedef struct int64x1x2_t {
+  int64x1_t val[2];
+} int64x1x2_t;
+
+typedef struct int64x2x2_t {
+  int64x2_t val[2];
+} int64x2x2_t;
+
+typedef struct uint8x8x2_t {
+  uint8x8_t val[2];
+} uint8x8x2_t;
+
+typedef struct uint8x16x2_t {
+  uint8x16_t val[2];
+} uint8x16x2_t;
+
+typedef struct uint16x4x2_t {
+  uint16x4_t val[2];
+} uint16x4x2_t;
+
+typedef struct uint16x8x2_t {
+  uint16x8_t val[2];
+} uint16x8x2_t;
+
+typedef struct uint32x2x2_t {
+  uint32x2_t val[2];
+} uint32x2x2_t;
+
+typedef struct uint32x4x2_t {
+  uint32x4_t val[2];
+} uint32x4x2_t;
+
+typedef struct uint64x1x2_t {
+  uint64x1_t val[2];
+} uint64x1x2_t;
+
+typedef struct uint64x2x2_t {
+  uint64x2_t val[2];
+} uint64x2x2_t;
+
+typedef struct float16x4x2_t {
+  float16x4_t val[2];
+} float16x4x2_t;
+
+typedef struct float16x8x2_t {
+  float16x8_t val[2];
+} float16x8x2_t;
+
+typedef struct float32x2x2_t {
+  float32x2_t val[2];
+} float32x2x2_t;
+
+typedef struct float32x4x2_t {
+  float32x4_t val[2];
+} float32x4x2_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x2_t {
+  float64x1_t val[2];
+} float64x1x2_t;
+
+typedef struct float64x2x2_t {
+  float64x2_t val[2];
+} float64x2x2_t;
+
+#endif
+typedef struct poly8x8x2_t {
+  poly8x8_t val[2];
+} poly8x8x2_t;
+
+typedef struct poly8x16x2_t {
+  poly8x16_t val[2];
+} poly8x16x2_t;
+
+typedef struct poly16x4x2_t {
+  poly16x4_t val[2];
+} poly16x4x2_t;
+
+typedef struct poly16x8x2_t {
+  poly16x8_t val[2];
+} poly16x8x2_t;
+
+#ifdef __aarch64__
+typedef struct poly64x1x2_t {
+  poly64x1_t val[2];
+} poly64x1x2_t;
+
+typedef struct poly64x2x2_t {
+  poly64x2_t val[2];
+} poly64x2x2_t;
+
+#endif
+typedef struct int8x8x3_t {
+  int8x8_t val[3];
+} int8x8x3_t;
+
+typedef struct int8x16x3_t {
+  int8x16_t val[3];
+} int8x16x3_t;
+
+typedef struct int16x4x3_t {
+  int16x4_t val[3];
+} int16x4x3_t;
+
+typedef struct int16x8x3_t {
+  int16x8_t val[3];
+} int16x8x3_t;
+
+typedef struct int32x2x3_t {
+  int32x2_t val[3];
+} int32x2x3_t;
+
+typedef struct int32x4x3_t {
+  int32x4_t val[3];
+} int32x4x3_t;
+
+typedef struct int64x1x3_t {
+  int64x1_t val[3];
+} int64x1x3_t;
+
+typedef struct int64x2x3_t {
+  int64x2_t val[3];
+} int64x2x3_t;
+
+typedef struct uint8x8x3_t {
+  uint8x8_t val[3];
+} uint8x8x3_t;
+
+typedef struct uint8x16x3_t {
+  uint8x16_t val[3];
+} uint8x16x3_t;
+
+typedef struct uint16x4x3_t {
+  uint16x4_t val[3];
+} uint16x4x3_t;
+
+typedef struct uint16x8x3_t {
+  uint16x8_t val[3];
+} uint16x8x3_t;
+
+typedef struct uint32x2x3_t {
+  uint32x2_t val[3];
+} uint32x2x3_t;
+
+typedef struct uint32x4x3_t {
+  uint32x4_t val[3];
+} uint32x4x3_t;
+
+typedef struct uint64x1x3_t {
+  uint64x1_t val[3];
+} uint64x1x3_t;
+
+typedef struct uint64x2x3_t {
+  uint64x2_t val[3];
+} uint64x2x3_t;
+
+typedef struct float16x4x3_t {
+  float16x4_t val[3];
+} float16x4x3_t;
+
+typedef struct float16x8x3_t {
+  float16x8_t val[3];
+} float16x8x3_t;
+
+typedef struct float32x2x3_t {
+  float32x2_t val[3];
+} float32x2x3_t;
+
+typedef struct float32x4x3_t {
+  float32x4_t val[3];
+} float32x4x3_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x3_t {
+  float64x1_t val[3];
+} float64x1x3_t;
+
+typedef struct float64x2x3_t {
+  float64x2_t val[3];
+} float64x2x3_t;
+
+#endif
+typedef struct poly8x8x3_t {
+  poly8x8_t val[3];
+} poly8x8x3_t;
+
+typedef struct poly8x16x3_t {
+  poly8x16_t val[3];
+} poly8x16x3_t;
+
+typedef struct poly16x4x3_t {
+  poly16x4_t val[3];
+} poly16x4x3_t;
+
+typedef struct poly16x8x3_t {
+  poly16x8_t val[3];
+} poly16x8x3_t;
+
+#ifdef __aarch64__
+typedef struct poly64x1x3_t {
+  poly64x1_t val[3];
+} poly64x1x3_t;
+
+typedef struct poly64x2x3_t {
+  poly64x2_t val[3];
+} poly64x2x3_t;
+
+#endif
+typedef struct int8x8x4_t {
+  int8x8_t val[4];
+} int8x8x4_t;
+
+typedef struct int8x16x4_t {
+  int8x16_t val[4];
+} int8x16x4_t;
+
+typedef struct int16x4x4_t {
+  int16x4_t val[4];
+} int16x4x4_t;
+
+typedef struct int16x8x4_t {
+  int16x8_t val[4];
+} int16x8x4_t;
+
+typedef struct int32x2x4_t {
+  int32x2_t val[4];
+} int32x2x4_t;
+
+typedef struct int32x4x4_t {
+  int32x4_t val[4];
+} int32x4x4_t;
+
+typedef struct int64x1x4_t {
+  int64x1_t val[4];
+} int64x1x4_t;
+
+typedef struct int64x2x4_t {
+  int64x2_t val[4];
+} int64x2x4_t;
+
+typedef struct uint8x8x4_t {
+  uint8x8_t val[4];
+} uint8x8x4_t;
+
+typedef struct uint8x16x4_t {
+  uint8x16_t val[4];
+} uint8x16x4_t;
+
+typedef struct uint16x4x4_t {
+  uint16x4_t val[4];
+} uint16x4x4_t;
+
+typedef struct uint16x8x4_t {
+  uint16x8_t val[4];
+} uint16x8x4_t;
+
+typedef struct uint32x2x4_t {
+  uint32x2_t val[4];
+} uint32x2x4_t;
+
+typedef struct uint32x4x4_t {
+  uint32x4_t val[4];
+} uint32x4x4_t;
+
+typedef struct uint64x1x4_t {
+  uint64x1_t val[4];
+} uint64x1x4_t;
+
+typedef struct uint64x2x4_t {
+  uint64x2_t val[4];
+} uint64x2x4_t;
+
+typedef struct float16x4x4_t {
+  float16x4_t val[4];
+} float16x4x4_t;
+
+typedef struct float16x8x4_t {
+  float16x8_t val[4];
+} float16x8x4_t;
+
+typedef struct float32x2x4_t {
+  float32x2_t val[4];
+} float32x2x4_t;
+
+typedef struct float32x4x4_t {
+  float32x4_t val[4];
+} float32x4x4_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x4_t {
+  float64x1_t val[4];
+} float64x1x4_t;
+
+typedef struct float64x2x4_t {
+  float64x2_t val[4];
+} float64x2x4_t;
+
+#endif
+typedef struct poly8x8x4_t {
+  poly8x8_t val[4];
+} poly8x8x4_t;
+
+typedef struct poly8x16x4_t {
+  poly8x16_t val[4];
+} poly8x16x4_t;
+
+typedef struct poly16x4x4_t {
+  poly16x4_t val[4];
+} poly16x4x4_t;
+
+typedef struct poly16x8x4_t {
+  poly16x8_t val[4];
+} poly16x8x4_t;
+
+#ifdef __aarch64__
+typedef struct poly64x1x4_t {
+  poly64x1_t val[4];
+} poly64x1x4_t;
+
+typedef struct poly64x2x4_t {
+  poly64x2_t val[4];
+} poly64x2x4_t;
+
+#endif
+
+#define __ai static inline __attribute__((__always_inline__, __nodebug__))
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vabdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vabdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x16_t __noswap_vabdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabdq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vabdq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vabdq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabdq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vabdq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vabdq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vabdq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vabdq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x16_t __noswap_vabdq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vabdq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vabdq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabdq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vabdq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vabdq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabdq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vabdq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vabdq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vabd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vabd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vabd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vabd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vabd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vabd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vabd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vabd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vabd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vabd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vabd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vabd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vabd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vabd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vabd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vabd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vabd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vabd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vabd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vabd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vabsq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vabsq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vabsq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vabsq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabsq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vabsq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabsq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vabsq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vabs_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabs_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vabs_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vabs_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vabs_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vabs_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vabs_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabs_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vabs_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vabs_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabs_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vabs_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vaddq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float32x4_t vaddq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vadd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float32x2_t vadd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vaddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vaddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vaddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vaddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vaddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vaddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vaddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vaddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vaddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vaddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vaddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vaddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vaddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vaddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vaddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vaddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vaddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vaddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vaddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vandq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vandq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vandq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vandq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vandq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vandq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vandq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vandq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vandq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vandq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vandq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vandq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vandq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vandq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vandq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vandq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vand_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vand_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vand_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vand_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vand_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vand_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vand_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vand_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vand_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vand_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vand_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vand_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vand_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vand_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vand_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 & __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vand_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 & __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vbicq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vbicq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vbicq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vbicq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vbicq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vbicq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vbicq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vbicq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vbicq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vbicq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vbicq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vbicq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vbicq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vbicq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vbicq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vbicq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vbic_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vbic_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vbic_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vbic_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vbic_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vbic_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vbic_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vbic_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vbic_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vbic_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vbic_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vbic_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vbic_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vbic_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vbic_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 & ~__p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vbic_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 & ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vbsl_p8(uint8x8_t __p0, poly8x8_t __p1, poly8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vbsl_p8(uint8x8_t __p0, poly8x8_t __p1, poly8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vbsl_p16(uint16x4_t __p0, poly16x4_t __p1, poly16x4_t __p2) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 5);
+  return __ret;
+}
+#else
+__ai poly16x4_t vbsl_p16(uint16x4_t __p0, poly16x4_t __p1, poly16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vbslq_p8(uint8x16_t __p0, poly8x16_t __p1, poly8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vbslq_p8(uint8x16_t __p0, poly8x16_t __p1, poly8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vbslq_p16(uint16x8_t __p0, poly16x8_t __p1, poly16x8_t __p2) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 37);
+  return __ret;
+}
+#else
+__ai poly16x8_t vbslq_p16(uint16x8_t __p0, poly16x8_t __p1, poly16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 37);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vbslq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vbslq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vbslq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vbslq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vbslq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vbslq_u64(uint64x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vbslq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vbslq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vbslq_s8(uint8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vbslq_s8(uint8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vbslq_f32(uint32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vbslq_f32(uint32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vbslq_s32(uint32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vbslq_s32(uint32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vbslq_s64(uint64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vbslq_s64(uint64x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vbslq_s16(uint16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vbslq_s16(uint16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vbsl_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vbsl_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vbsl_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vbsl_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vbsl_u64(uint64x1_t __p0, uint64x1_t __p1, uint64x1_t __p2) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vbsl_u64(uint64x1_t __p0, uint64x1_t __p1, uint64x1_t __p2) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vbsl_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vbsl_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vbsl_s8(uint8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vbsl_s8(uint8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vbsl_f32(uint32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vbsl_f32(uint32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vbsl_s32(uint32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vbsl_s32(uint32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vbsl_s64(uint64x1_t __p0, int64x1_t __p1, int64x1_t __p2) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vbsl_s64(uint64x1_t __p0, int64x1_t __p1, int64x1_t __p2) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vbsl_s16(uint16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vbsl_s16(uint16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcageq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcageq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcageq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcageq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcage_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcage_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcage_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcagtq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcagtq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcagtq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcagtq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcagt_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcagt_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcagt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcaleq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcaleq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcaleq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcaleq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcale_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcale_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcale_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcaltq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcaltq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcaltq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcaltq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcalt_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcalt_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcalt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceq_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceq_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceq_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceq_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceq_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceq_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceq_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceq_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceq_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceq_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceq_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceq_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceq_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceq_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceq_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceq_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgeq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgeq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgeq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgeq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgeq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgeq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgeq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgeq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgeq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgeq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgeq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgeq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgeq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgeq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcge_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcge_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcge_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcge_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcge_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcge_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcge_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcge_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcge_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcge_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcge_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcge_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcge_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcge_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgtq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgtq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgtq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgtq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgtq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgtq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgtq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgtq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcgt_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcgt_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgt_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgt_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgt_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgt_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcgt_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcgt_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgt_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgt_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgt_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgt_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgt_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgt_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcleq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcleq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcleq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcleq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcleq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcleq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcleq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcleq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcleq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcleq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcleq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcleq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcleq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcleq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcle_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcle_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcle_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcle_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcle_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcle_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcle_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcle_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcle_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcle_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcle_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcle_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcle_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcle_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vclsq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vclsq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vclsq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vclsq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vclsq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vclsq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vclsq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vclsq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vcls_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vcls_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vcls_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcls_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcls_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcls_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vcls_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcls_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vcls_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vcls_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcltq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcltq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcltq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcltq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcltq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcltq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltq_f32(float32x4_t __p0, float32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcltq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcltq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vclt_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vclt_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclt_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclt_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclt_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclt_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vclt_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint8x8_t vclt_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclt_f32(float32x2_t __p0, float32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclt_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclt_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclt_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclt_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclt_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vclzq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vclzq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vclzq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vclzq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vclzq_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vclzq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vclzq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vclzq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vclzq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vclzq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vclzq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vclzq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vclzq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vclzq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vclz_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vclz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vclz_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclz_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclz_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclz_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclz_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vclz_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vclz_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vclz_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vclz_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vclz_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vclz_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vclz_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vclz_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vclz_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vclz_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vcnt_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vcnt_v((int8x8_t)__p0, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vcnt_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vcnt_v((int8x8_t)__rev0, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vcntq_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vcntq_v((int8x16_t)__p0, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vcntq_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vcntq_v((int8x16_t)__rev0, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcntq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcntq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcntq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcntq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vcntq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vcntq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vcntq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vcntq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcnt_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcnt_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcnt_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcnt_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vcnt_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vcnt_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vcnt_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vcnt_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vcombine_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai poly8x16_t vcombine_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vcombine_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai poly16x8_t vcombine_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x16_t __noswap_vcombine_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vcombine_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcombine_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcombine_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcombine_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcombine_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vcombine_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vcombine_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai int8x16_t vcombine_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x16_t __noswap_vcombine_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcombine_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai float32x4_t vcombine_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float32x4_t __noswap_vcombine_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vcombine_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai float16x8_t vcombine_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x8_t __noswap_vcombine_f16(float16x4_t __p0, float16x4_t __p1) {
+  float16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcombine_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai int32x4_t vcombine_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vcombine_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcombine_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  return __ret;
+}
+#else
+__ai int64x2_t vcombine_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vcombine_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai int16x8_t vcombine_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vcombine_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vcreate_p8(uint64_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vcreate_p8(uint64_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vcreate_p16(uint64_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vcreate_p16(uint64_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcreate_u8(uint64_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcreate_u8(uint64_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcreate_u32(uint64_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcreate_u32(uint64_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcreate_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcreate_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcreate_u16(uint64_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcreate_u16(uint64_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vcreate_s8(uint64_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vcreate_s8(uint64_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcreate_f32(uint64_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vcreate_f32(uint64_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vcreate_f16(uint64_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vcreate_f16(uint64_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcreate_s32(uint64_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vcreate_s32(uint64_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcreate_s64(uint64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vcreate_s64(uint64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vcreate_s16(uint64_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vcreate_s16(uint64_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvtq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai float32x4_t vcvtq_f32_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvtq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai float32x4_t vcvtq_f32_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvtq_f32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcvt_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai float32x2_t vcvt_f32_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcvt_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai float32x2_t vcvt_f32_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vcvtq_n_f32_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vcvt_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vcvt_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vcvt_n_f32_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vcvtq_n_s32_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vcvtq_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vcvtq_n_s32_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vcvt_n_s32_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vcvt_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vcvt_n_s32_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vcvtq_n_u32_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vcvtq_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vcvtq_n_u32_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vcvt_n_u32_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vcvt_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vcvt_n_u32_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvt_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvt_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvt_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvt_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvt_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvt_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvt_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvt_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vdup_n_p8(poly8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly8x8_t vdup_n_p8(poly8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vdup_n_p16(poly16_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly16x4_t vdup_n_p16(poly16_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vdupq_n_p8(poly8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly8x16_t vdupq_n_p8(poly8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vdupq_n_p16(poly16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly16x8_t vdupq_n_p16(poly16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vdupq_n_u8(uint8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint8x16_t vdupq_n_u8(uint8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vdupq_n_u32(uint32_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint32x4_t vdupq_n_u32(uint32_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vdupq_n_u64(uint64_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai uint64x2_t vdupq_n_u64(uint64_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vdupq_n_u16(uint16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint16x8_t vdupq_n_u16(uint16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vdupq_n_s8(int8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int8x16_t vdupq_n_s8(int8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vdupq_n_f32(float32_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai float32x4_t vdupq_n_f32(float32_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vdupq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vdupq_n_s32(int32_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int32x4_t vdupq_n_s32(int32_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vdupq_n_s64(int64_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai int64x2_t vdupq_n_s64(int64_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vdupq_n_s16(int16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int16x8_t vdupq_n_s16(int16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vdup_n_u8(uint8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint8x8_t vdup_n_u8(uint8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vdup_n_u32(uint32_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai uint32x2_t vdup_n_u32(uint32_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vdup_n_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai uint64x1_t vdup_n_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vdup_n_u16(uint16_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint16x4_t vdup_n_u16(uint16_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vdup_n_s8(int8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int8x8_t vdup_n_s8(int8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vdup_n_f32(float32_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai float32x2_t vdup_n_f32(float32_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vdup_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vdup_n_s32(int32_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai int32x2_t vdup_n_s32(int32_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vdup_n_s64(int64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai int64x1_t vdup_n_s64(int64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vdup_n_s16(int16_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int16x4_t vdup_n_s16(int16_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t veorq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t veorq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t veorq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t veorq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t veorq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t veorq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t veorq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t veorq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t veorq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t veorq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t veorq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t veorq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t veorq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t veorq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t veorq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t veorq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t veor_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t veor_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t veor_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t veor_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t veor_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t veor_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t veor_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t veor_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t veor_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t veor_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t veor_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t veor_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t veor_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t veor_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t veor_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 ^ __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t veor_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 ^ __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
+  __ret; \
+})
+#else
+#define vext_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
+  __ret; \
+})
+#else
+#define vext_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
+  __ret; \
+})
+#else
+#define vextq_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
+  __ret; \
+})
+#else
+#define vextq_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vextq_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vextq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vextq_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vextq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vextq_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 41); \
+  __ret; \
+})
+#else
+#define vextq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vextq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vextq_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vextq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vext_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vext_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vext_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vext_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vext_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 9); \
+  __ret; \
+})
+#else
+#define vext_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vext_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vext_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vext_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vget_high_p8(poly8x16_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai poly8x8_t vget_high_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai poly8x8_t __noswap_vget_high_p8(poly8x16_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vget_high_p16(poly16x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai poly16x4_t vget_high_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vget_high_u8(uint8x16_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai uint8x8_t vget_high_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vget_high_u8(uint8x16_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vget_high_u32(uint32x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#else
+__ai uint32x2_t vget_high_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vget_high_u32(uint32x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vget_high_u64(uint64x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vget_high_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vget_high_u16(uint16x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai uint16x4_t vget_high_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vget_high_u16(uint16x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vget_high_s8(int8x16_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#else
+__ai int8x8_t vget_high_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 8, 9, 10, 11, 12, 13, 14, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vget_high_s8(int8x16_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 8, 9, 10, 11, 12, 13, 14, 15);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vget_high_f32(float32x4_t __p0) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#else
+__ai float32x2_t vget_high_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vget_high_f32(float32x4_t __p0) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vget_high_f16(float16x8_t __p0) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai float16x4_t vget_high_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x4_t __noswap_vget_high_f16(float16x8_t __p0) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vget_high_s32(int32x4_t __p0) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#else
+__ai int32x2_t vget_high_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vget_high_s32(int32x4_t __p0) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 2, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vget_high_s64(int64x2_t __p0) {
+  int64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#else
+__ai int64x1_t vget_high_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vget_high_s16(int16x8_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai int16x4_t vget_high_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vget_high_s16(int16x8_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 4, 5, 6, 7);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vget_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vget_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vgetq_lane_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vgetq_lane_f32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vgetq_lane_f32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vgetq_lane_f32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vgetq_lane_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vget_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vget_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vget_lane_i32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vget_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vget_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vget_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vget_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vget_lane_f32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vget_lane_f32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vget_lane_f32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vget_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vget_lane_i32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vget_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vget_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vget_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vget_low_p8(poly8x16_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai poly8x8_t vget_low_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vget_low_p16(poly16x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai poly16x4_t vget_low_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vget_low_u8(uint8x16_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai uint8x8_t vget_low_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vget_low_u32(uint32x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1);
+  return __ret;
+}
+#else
+__ai uint32x2_t vget_low_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vget_low_u64(uint64x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vget_low_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vget_low_u16(uint16x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai uint16x4_t vget_low_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vget_low_s8(int8x16_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __ret;
+}
+#else
+__ai int8x8_t vget_low_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3, 4, 5, 6, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vget_low_f32(float32x4_t __p0) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1);
+  return __ret;
+}
+#else
+__ai float32x2_t vget_low_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vget_low_f16(float16x8_t __p0) {
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai float16x4_t vget_low_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vget_low_s32(int32x4_t __p0) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1);
+  return __ret;
+}
+#else
+__ai int32x2_t vget_low_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vget_low_s64(int64x2_t __p0) {
+  int64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0);
+  return __ret;
+}
+#else
+__ai int64x1_t vget_low_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vget_low_s16(int16x8_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0, 1, 2, 3);
+  return __ret;
+}
+#else
+__ai int16x4_t vget_low_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0, 1, 2, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vhadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vhadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vhadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vhadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vhadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vhadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vhsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vhsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vhsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vhsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vhsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vhsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vhsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vhsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vhsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vhsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vhsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vhsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vhsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vhsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vhsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vhsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vhsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vhsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vhsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vhsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vhsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vhsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vhsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vhsub_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vhsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8(__p0) __extension__ ({ \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_v(__p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8(__p0) __extension__ ({ \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_v(__p0, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16(__p0) __extension__ ({ \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_v(__p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16(__p0) __extension__ ({ \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_v(__p0, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8(__p0) __extension__ ({ \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_v(__p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8(__p0) __extension__ ({ \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_v(__p0, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16(__p0) __extension__ ({ \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_v(__p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16(__p0) __extension__ ({ \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_v(__p0, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8(__p0) __extension__ ({ \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_v(__p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8(__p0) __extension__ ({ \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_v(__p0, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32(__p0) __extension__ ({ \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_v(__p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32(__p0) __extension__ ({ \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_v(__p0, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64(__p0) __extension__ ({ \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_v(__p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64(__p0) __extension__ ({ \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_v(__p0, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16(__p0) __extension__ ({ \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_v(__p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16(__p0) __extension__ ({ \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_v(__p0, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8(__p0) __extension__ ({ \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_v(__p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8(__p0) __extension__ ({ \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_v(__p0, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32(__p0) __extension__ ({ \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_v(__p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32(__p0) __extension__ ({ \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_v(__p0, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16(__p0) __extension__ ({ \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_v(__p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16(__p0) __extension__ ({ \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_v(__p0, 40); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32(__p0) __extension__ ({ \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_v(__p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32(__p0) __extension__ ({ \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_v(__p0, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64(__p0) __extension__ ({ \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_v(__p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64(__p0) __extension__ ({ \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_v(__p0, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16(__p0) __extension__ ({ \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_v(__p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16(__p0) __extension__ ({ \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_v(__p0, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8(__p0) __extension__ ({ \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_v(__p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8(__p0) __extension__ ({ \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_v(__p0, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32(__p0) __extension__ ({ \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_v(__p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32(__p0) __extension__ ({ \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_v(__p0, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64(__p0) __extension__ ({ \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_v(__p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64(__p0) __extension__ ({ \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_v(__p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16(__p0) __extension__ ({ \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_v(__p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16(__p0) __extension__ ({ \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_v(__p0, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8(__p0) __extension__ ({ \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_v(__p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8(__p0) __extension__ ({ \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_v(__p0, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32(__p0) __extension__ ({ \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_v(__p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32(__p0) __extension__ ({ \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_v(__p0, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16(__p0) __extension__ ({ \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_v(__p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16(__p0) __extension__ ({ \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_v(__p0, 8); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32(__p0) __extension__ ({ \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_v(__p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32(__p0) __extension__ ({ \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_v(__p0, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64(__p0) __extension__ ({ \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_v(__p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64(__p0) __extension__ ({ \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_v(__p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16(__p0) __extension__ ({ \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_v(__p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16(__p0) __extension__ ({ \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_v(__p0, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_p8(__p0) __extension__ ({ \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_dup_v(__p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_dup_p8(__p0) __extension__ ({ \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_dup_v(__p0, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_p16(__p0) __extension__ ({ \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_dup_v(__p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_dup_p16(__p0) __extension__ ({ \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_dup_v(__p0, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_p8(__p0) __extension__ ({ \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_dup_v(__p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_dup_p8(__p0) __extension__ ({ \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_dup_v(__p0, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_p16(__p0) __extension__ ({ \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_dup_v(__p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_dup_p16(__p0) __extension__ ({ \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_dup_v(__p0, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_u8(__p0) __extension__ ({ \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_dup_v(__p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_dup_u8(__p0) __extension__ ({ \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_dup_v(__p0, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_u32(__p0) __extension__ ({ \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_dup_v(__p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_dup_u32(__p0) __extension__ ({ \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_dup_v(__p0, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_u64(__p0) __extension__ ({ \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_dup_v(__p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_dup_u64(__p0) __extension__ ({ \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_dup_v(__p0, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_u16(__p0) __extension__ ({ \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_dup_v(__p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_dup_u16(__p0) __extension__ ({ \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_dup_v(__p0, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_s8(__p0) __extension__ ({ \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_dup_v(__p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_dup_s8(__p0) __extension__ ({ \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_dup_v(__p0, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_f32(__p0) __extension__ ({ \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_dup_v(__p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_dup_f32(__p0) __extension__ ({ \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_dup_v(__p0, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_f16(__p0) __extension__ ({ \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_dup_v(__p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_dup_f16(__p0) __extension__ ({ \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_dup_v(__p0, 40); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_s32(__p0) __extension__ ({ \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_dup_v(__p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_dup_s32(__p0) __extension__ ({ \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_dup_v(__p0, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_s64(__p0) __extension__ ({ \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_dup_v(__p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_dup_s64(__p0) __extension__ ({ \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_dup_v(__p0, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_s16(__p0) __extension__ ({ \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_dup_v(__p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_dup_s16(__p0) __extension__ ({ \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_dup_v(__p0, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_u8(__p0) __extension__ ({ \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_dup_v(__p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_dup_u8(__p0) __extension__ ({ \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_dup_v(__p0, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_u32(__p0) __extension__ ({ \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_dup_v(__p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_dup_u32(__p0) __extension__ ({ \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_dup_v(__p0, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_u64(__p0) __extension__ ({ \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_dup_v(__p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_dup_u64(__p0) __extension__ ({ \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_dup_v(__p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_u16(__p0) __extension__ ({ \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_dup_v(__p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_dup_u16(__p0) __extension__ ({ \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_dup_v(__p0, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_s8(__p0) __extension__ ({ \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_dup_v(__p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_dup_s8(__p0) __extension__ ({ \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_dup_v(__p0, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_f32(__p0) __extension__ ({ \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_dup_v(__p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_dup_f32(__p0) __extension__ ({ \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_dup_v(__p0, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_f16(__p0) __extension__ ({ \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_dup_v(__p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_dup_f16(__p0) __extension__ ({ \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_dup_v(__p0, 8); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_s32(__p0) __extension__ ({ \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_dup_v(__p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_dup_s32(__p0) __extension__ ({ \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_dup_v(__p0, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_s64(__p0) __extension__ ({ \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_dup_v(__p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_dup_s64(__p0) __extension__ ({ \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_dup_v(__p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_s16(__p0) __extension__ ({ \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_dup_v(__p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_dup_s16(__p0) __extension__ ({ \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_dup_v(__p0, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 4); \
+  __ret; \
+})
+#else
+#define vld1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 5); \
+  __ret; \
+})
+#else
+#define vld1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 36); \
+  __ret; \
+})
+#else
+#define vld1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 37); \
+  __ret; \
+})
+#else
+#define vld1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vld1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vld1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vld1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vld1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vld1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 41); \
+  __ret; \
+})
+#else
+#define vld1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 40); \
+  __ret; \
+})
+#else
+#define vld1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 40); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vld1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vld1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vld1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vld1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vld1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vld1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vld1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vld1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 9); \
+  __ret; \
+})
+#else
+#define vld1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 8); \
+  __ret; \
+})
+#else
+#define vld1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 8); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vld1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vld1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vld1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_p8(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld2_p8(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_p16(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld2_p16(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld2q_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld2q_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld2q_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld2q_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld2q_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld2q_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld2q_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld2q_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld2q_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld2q_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_u8(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld2_u8(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_u32(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld2_u32(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_u64(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld2_u64(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_u16(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld2_u16(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_s8(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld2_s8(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_f32(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld2_f32(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_f16(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld2_f16(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_s32(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld2_s32(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_s64(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld2_s64(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_s16(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld2_s16(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_p8(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld2_dup_p8(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_p16(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld2_dup_p16(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_u8(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld2_dup_u8(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_u32(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld2_dup_u32(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_u64(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld2_dup_u64(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_u16(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld2_dup_u16(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_s8(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld2_dup_s8(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_f32(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld2_dup_f32(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_f16(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld2_dup_f16(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_s32(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld2_dup_s32(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_s64(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld2_dup_s64(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_s16(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld2_dup_s16(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 4); \
+  __ret; \
+})
+#else
+#define vld2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 5); \
+  __ret; \
+})
+#else
+#define vld2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 37); \
+  __ret; \
+})
+#else
+#define vld2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 50); \
+  __ret; \
+})
+#else
+#define vld2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 49); \
+  __ret; \
+})
+#else
+#define vld2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 41); \
+  __ret; \
+})
+#else
+#define vld2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 40); \
+  __ret; \
+})
+#else
+#define vld2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 34); \
+  __ret; \
+})
+#else
+#define vld2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 33); \
+  __ret; \
+})
+#else
+#define vld2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 16); \
+  __ret; \
+})
+#else
+#define vld2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 18); \
+  __ret; \
+})
+#else
+#define vld2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 17); \
+  __ret; \
+})
+#else
+#define vld2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 0); \
+  __ret; \
+})
+#else
+#define vld2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 9); \
+  __ret; \
+})
+#else
+#define vld2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 8); \
+  __ret; \
+})
+#else
+#define vld2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 2); \
+  __ret; \
+})
+#else
+#define vld2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 1); \
+  __ret; \
+})
+#else
+#define vld2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_p8(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld3_p8(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_p16(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld3_p16(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld3q_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld3q_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld3q_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld3q_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld3q_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld3q_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld3q_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld3q_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld3q_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld3q_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_u8(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld3_u8(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_u32(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld3_u32(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_u64(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld3_u64(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_u16(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld3_u16(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_s8(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld3_s8(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_f32(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld3_f32(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_f16(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld3_f16(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_s32(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld3_s32(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_s64(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld3_s64(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_s16(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld3_s16(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_p8(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld3_dup_p8(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_p16(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld3_dup_p16(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_u8(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld3_dup_u8(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_u32(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld3_dup_u32(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_u64(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld3_dup_u64(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_u16(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld3_dup_u16(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_s8(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld3_dup_s8(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_f32(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld3_dup_f32(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_f16(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld3_dup_f16(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_s32(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld3_dup_s32(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_s64(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld3_dup_s64(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_s16(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld3_dup_s16(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 4); \
+  __ret; \
+})
+#else
+#define vld3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 5); \
+  __ret; \
+})
+#else
+#define vld3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 37); \
+  __ret; \
+})
+#else
+#define vld3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 50); \
+  __ret; \
+})
+#else
+#define vld3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 49); \
+  __ret; \
+})
+#else
+#define vld3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 41); \
+  __ret; \
+})
+#else
+#define vld3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 40); \
+  __ret; \
+})
+#else
+#define vld3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 34); \
+  __ret; \
+})
+#else
+#define vld3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 33); \
+  __ret; \
+})
+#else
+#define vld3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 16); \
+  __ret; \
+})
+#else
+#define vld3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 18); \
+  __ret; \
+})
+#else
+#define vld3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 17); \
+  __ret; \
+})
+#else
+#define vld3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 0); \
+  __ret; \
+})
+#else
+#define vld3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 9); \
+  __ret; \
+})
+#else
+#define vld3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 8); \
+  __ret; \
+})
+#else
+#define vld3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 2); \
+  __ret; \
+})
+#else
+#define vld3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 1); \
+  __ret; \
+})
+#else
+#define vld3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_p8(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld4_p8(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_p16(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld4_p16(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld4q_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld4q_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld4q_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld4q_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld4q_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld4q_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld4q_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld4q_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld4q_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld4q_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_u8(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld4_u8(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_u32(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld4_u32(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_u64(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld4_u64(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_u16(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld4_u16(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_s8(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld4_s8(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_f32(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld4_f32(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_f16(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld4_f16(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_s32(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld4_s32(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_s64(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld4_s64(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_s16(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld4_s16(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_p8(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld4_dup_p8(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_p16(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld4_dup_p16(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_u8(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld4_dup_u8(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_u32(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld4_dup_u32(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_u64(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld4_dup_u64(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_u16(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld4_dup_u16(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_s8(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld4_dup_s8(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_f32(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld4_dup_f32(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_f16(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld4_dup_f16(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_s32(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld4_dup_s32(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_s64(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld4_dup_s64(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_s16(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld4_dup_s16(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 4); \
+  __ret; \
+})
+#else
+#define vld4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 5); \
+  __ret; \
+})
+#else
+#define vld4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 37); \
+  __ret; \
+})
+#else
+#define vld4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 50); \
+  __ret; \
+})
+#else
+#define vld4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 49); \
+  __ret; \
+})
+#else
+#define vld4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 41); \
+  __ret; \
+})
+#else
+#define vld4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 40); \
+  __ret; \
+})
+#else
+#define vld4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 34); \
+  __ret; \
+})
+#else
+#define vld4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 33); \
+  __ret; \
+})
+#else
+#define vld4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 16); \
+  __ret; \
+})
+#else
+#define vld4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 18); \
+  __ret; \
+})
+#else
+#define vld4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 17); \
+  __ret; \
+})
+#else
+#define vld4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 0); \
+  __ret; \
+})
+#else
+#define vld4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 9); \
+  __ret; \
+})
+#else
+#define vld4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 8); \
+  __ret; \
+})
+#else
+#define vld4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 2); \
+  __ret; \
+})
+#else
+#define vld4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 1); \
+  __ret; \
+})
+#else
+#define vld4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmax_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vmax_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmax_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vmax_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmax_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vmax_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmax_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vmax_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vminq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vminq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vminq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vminq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vminq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vminq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vminq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vminq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmin_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vmin_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmin_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vmin_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmin_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vmin_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmin_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vmin_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmlaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint8x16_t vmlaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmlaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int8x16_t vmlaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float32x4_t vmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int32x4_t vmlaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int16x8_t vmlaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmla_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint8x8_t vmla_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmla_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint32x2_t vmla_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmla_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint16x4_t vmla_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmla_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int8x8_t vmla_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float32x2_t vmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmla_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int32x2_t vmla_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmla_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int16x4_t vmla_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x8_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlaq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __p1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlaq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __rev1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlaq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + __p1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlaq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __rev1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmlaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __ret;
+  __ret = __p0 + __p1 * (float32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai float32x4_t vmlaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 + __rev1 * (float32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlaq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __p1 * (int32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int32x4_t vmlaq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __rev1 * (int32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlaq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + __p1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int16x8_t vmlaq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __rev1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmla_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 + __p1 * (uint32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai uint32x2_t vmla_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 + __rev1 * (uint32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmla_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 + __p1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint16x4_t vmla_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 + __rev1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmla_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __ret;
+  __ret = __p0 + __p1 * (float32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai float32x2_t vmla_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 + __rev1 * (float32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmla_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 + __p1 * (int32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai int32x2_t vmla_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 + __rev1 * (int32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmla_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 + __p1 * (int16x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int16x4_t vmla_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 + __rev1 * (int16x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmlsq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint8x16_t vmlsq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlsq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlsq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmlsq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int8x16_t vmlsq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmlsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float32x4_t vmlsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlsq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int16x8_t vmlsq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmls_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint8x8_t vmls_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmls_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint32x2_t vmls_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmls_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai uint16x4_t vmls_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmls_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int8x8_t vmls_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmls_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float32x2_t vmls_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmls_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int32x2_t vmls_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmls_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai int16x4_t vmls_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x8_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - __p1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsq_n_u32(uint32x4_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __rev1 * (uint32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlsq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 - __p1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlsq_n_u16(uint16x8_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __rev1 * (uint16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmlsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __ret;
+  __ret = __p0 - __p1 * (float32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai float32x4_t vmlsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 - __rev1 * (float32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - __p1 * (int32x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsq_n_s32(int32x4_t __p0, int32x4_t __p1, int32_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __rev1 * (int32x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlsq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 - __p1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int16x8_t vmlsq_n_s16(int16x8_t __p0, int16x8_t __p1, int16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __rev1 * (int16x8_t) {__p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmls_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 - __p1 * (uint32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai uint32x2_t vmls_n_u32(uint32x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 - __rev1 * (uint32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmls_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 - __p1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai uint16x4_t vmls_n_u16(uint16x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 - __rev1 * (uint16x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmls_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __ret;
+  __ret = __p0 - __p1 * (float32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai float32x2_t vmls_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 - __rev1 * (float32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmls_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 - __p1 * (int32x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai int32x2_t vmls_n_s32(int32x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 - __rev1 * (int32x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmls_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 - __p1 * (int16x4_t) {__p2, __p2, __p2, __p2};
+  return __ret;
+}
+#else
+__ai int16x4_t vmls_n_s16(int16x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 - __rev1 * (int16x4_t) {__p2, __p2, __p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vmov_n_p8(poly8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly8x8_t vmov_n_p8(poly8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vmov_n_p16(poly16_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly16x4_t vmov_n_p16(poly16_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vmovq_n_p8(poly8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly8x16_t vmovq_n_p8(poly8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vmovq_n_p16(poly16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai poly16x8_t vmovq_n_p16(poly16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmovq_n_u8(uint8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint8x16_t vmovq_n_u8(uint8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmovq_n_u32(uint32_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint32x4_t vmovq_n_u32(uint32_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmovq_n_u64(uint64_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai uint64x2_t vmovq_n_u64(uint64_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmovq_n_u16(uint16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint16x8_t vmovq_n_u16(uint16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmovq_n_s8(int8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int8x16_t vmovq_n_s8(int8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmovq_n_f32(float32_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai float32x4_t vmovq_n_f32(float32_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmovq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vmovq_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmovq_n_s32(int32_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int32x4_t vmovq_n_s32(int32_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmovq_n_s64(int64_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai int64x2_t vmovq_n_s64(int64_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmovq_n_s16(int16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int16x8_t vmovq_n_s16(int16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmov_n_u8(uint8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint8x8_t vmov_n_u8(uint8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmov_n_u32(uint32_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai uint32x2_t vmov_n_u32(uint32_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vmov_n_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai uint64x1_t vmov_n_u64(uint64_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmov_n_u16(uint16_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai uint16x4_t vmov_n_u16(uint16_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmov_n_s8(int8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int8x8_t vmov_n_s8(int8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmov_n_f32(float32_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai float32x2_t vmov_n_f32(float32_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmov_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret; \
+})
+#else
+#define vmov_n_f16(__p0) __extension__ ({ \
+  float16_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmov_n_s32(int32_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai int32x2_t vmov_n_s32(int32_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vmov_n_s64(int64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai int64x1_t vmov_n_s64(int64_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmov_n_s16(int16_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
+  return __ret;
+}
+#else
+__ai int16x4_t vmov_n_s16(int16_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) {__p0, __p0, __p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmovl_u8(uint8x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmovl_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vmovl_u8(uint8x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 49);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmovl_u32(uint32x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmovl_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmovl_u32(uint32x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 51);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmovl_u16(uint16x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmovl_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmovl_u16(uint16x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 50);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmovl_s8(int8x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vmovl_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vmovl_s8(int8x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmovl_s32(int32x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vmovl_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmovl_s32(int32x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmovl_s16(int16x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vmovl_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmovl_v((int8x8_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmovl_s16(int16x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmovl_v((int8x8_t)__p0, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmovn_u32(uint32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vmovn_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vmovn_u32(uint32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmovn_u64(uint64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vmovn_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vmovn_u64(uint64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmovn_u16(uint16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vmovn_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vmovn_u16(uint16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmovn_s32(int32x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vmovn_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vmovn_s32(int32x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmovn_s64(int64x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vmovn_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vmovn_s64(int64x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmovn_s16(int16x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vmovn_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmovn_v((int8x16_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vmovn_s16(int16x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vmovn_v((int8x16_t)__p0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmulq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vmulq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmulq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vmulq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmulq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vmulq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmulq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vmulq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmulq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float32x4_t vmulq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmulq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vmulq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmulq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vmulq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmul_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vmul_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmul_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vmul_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmul_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vmul_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmul_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vmul_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmul_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float32x2_t vmul_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmul_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vmul_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmul_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vmul_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vmul_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vmul_v((int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vmul_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vmul_v((int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vmulq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vmulq_v((int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vmulq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vmulq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmulq_n_u32(uint32x4_t __p0, uint32_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 * (uint32x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai uint32x4_t vmulq_n_u32(uint32x4_t __p0, uint32_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 * (uint32x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmulq_n_u16(uint16x8_t __p0, uint16_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 * (uint16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai uint16x8_t vmulq_n_u16(uint16x8_t __p0, uint16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 * (uint16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmulq_n_f32(float32x4_t __p0, float32_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 * (float32x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai float32x4_t vmulq_n_f32(float32x4_t __p0, float32_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 * (float32x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmulq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 * (int32x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai int32x4_t vmulq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 * (int32x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmulq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 * (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai int16x8_t vmulq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 * (int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmul_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 * (uint32x2_t) {__p1, __p1};
+  return __ret;
+}
+#else
+__ai uint32x2_t vmul_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 * (uint32x2_t) {__p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmul_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 * (uint16x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai uint16x4_t vmul_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 * (uint16x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmul_n_f32(float32x2_t __p0, float32_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 * (float32x2_t) {__p1, __p1};
+  return __ret;
+}
+#else
+__ai float32x2_t vmul_n_f32(float32x2_t __p0, float32_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 * (float32x2_t) {__p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmul_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 * (int32x2_t) {__p1, __p1};
+  return __ret;
+}
+#else
+__ai int32x2_t vmul_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 * (int32x2_t) {__p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmul_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 * (int16x4_t) {__p1, __p1, __p1, __p1};
+  return __ret;
+}
+#else
+__ai int16x4_t vmul_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 * (int16x4_t) {__p1, __p1, __p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vmull_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 37);
+  return __ret;
+}
+#else
+__ai poly16x8_t vmull_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 37);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai poly16x8_t __noswap_vmull_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 37);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmull_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmull_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vmull_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 49);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmull_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmull_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmull_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 51);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmull_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmull_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmull_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 50);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmull_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vmull_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vmull_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = vmull_u32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __noswap_vmull_u32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = vmull_u16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __noswap_vmull_u16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vmull_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vmull_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vmull_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vmull_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmull_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(uint32x2_t) {__p1, __p1}, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmull_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)(uint32x2_t) {__p1, __p1}, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmull_n_u32(uint32x2_t __p0, uint32_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(uint32x2_t) {__p1, __p1}, 51);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmull_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(uint16x4_t) {__p1, __p1, __p1, __p1}, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmull_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)(uint16x4_t) {__p1, __p1, __p1, __p1}, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmull_n_u16(uint16x4_t __p0, uint16_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(uint16x4_t) {__p1, __p1, __p1, __p1}, 50);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__rev0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vmull_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vmvn_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai poly8x8_t vmvn_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vmvnq_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai poly8x16_t vmvnq_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmvnq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint8x16_t vmvnq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmvnq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint32x4_t vmvnq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmvnq_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint16x8_t vmvnq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmvnq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int8x16_t vmvnq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmvnq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int32x4_t vmvnq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmvnq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int16x8_t vmvnq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vmvn_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint8x8_t vmvn_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vmvn_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint32x2_t vmvn_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vmvn_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai uint16x4_t vmvn_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vmvn_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int8x8_t vmvn_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vmvn_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int32x2_t vmvn_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vmvn_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = ~__p0;
+  return __ret;
+}
+#else
+__ai int16x4_t vmvn_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = ~__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vnegq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int8x16_t vnegq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vnegq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float32x4_t vnegq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vnegq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int32x4_t vnegq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vnegq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int16x8_t vnegq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vneg_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int8x8_t vneg_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vneg_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float32x2_t vneg_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vneg_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int32x2_t vneg_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vneg_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int16x4_t vneg_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vornq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vornq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vornq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vornq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vornq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vornq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vornq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vornq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vornq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vornq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vornq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vornq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vornq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vornq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vornq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vornq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vorn_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vorn_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vorn_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vorn_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vorn_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vorn_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vorn_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vorn_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vorn_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vorn_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vorn_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vorn_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vorn_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vorn_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vorn_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 | ~__p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vorn_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 | ~__rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vorrq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vorrq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vorrq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vorrq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vorrq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vorrq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vorrq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vorrq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vorrq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vorrq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vorrq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vorrq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vorrq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vorrq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vorrq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vorrq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vorr_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vorr_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vorr_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vorr_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vorr_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vorr_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vorr_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vorr_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vorr_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vorr_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vorr_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vorr_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vorr_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vorr_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vorr_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 | __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vorr_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 | __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpadalq_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpadalq_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vpadalq_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vpadalq_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpadalq_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpadalq_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpadalq_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpadalq_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vpadalq_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vpadalq_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpadalq_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpadalq_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpadalq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpadal_u8(uint16x4_t __p0, uint8x8_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpadal_u8(uint16x4_t __p0, uint8x8_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vpadal_u32(uint64x1_t __p0, uint32x2_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vpadal_u32(uint64x1_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__rev1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpadal_u16(uint32x2_t __p0, uint16x4_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpadal_u16(uint32x2_t __p0, uint16x4_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpadal_s8(int16x4_t __p0, int8x8_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpadal_s8(int16x4_t __p0, int8x8_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vpadal_s32(int64x1_t __p0, int32x2_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vpadal_s32(int64x1_t __p0, int32x2_t __p1) {
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__rev1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpadal_s16(int32x2_t __p0, int16x4_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpadal_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpadal_s16(int32x2_t __p0, int16x4_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpadal_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vpadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vpadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vpadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vpadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpadd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpadd_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpaddlq_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpaddlq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vpaddlq_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vpaddlq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpaddlq_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpaddlq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpaddlq_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpaddlq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vpaddlq_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vpaddlq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpaddlq_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpaddlq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpaddl_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpaddl_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vpaddl_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vpaddl_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpaddl_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpaddl_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpaddl_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpaddl_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vpaddl_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vpaddl_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpaddl_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpaddl_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpaddl_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vpmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vpmax_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpmax_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpmax_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vpmax_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vpmax_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpmax_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpmax_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpmax_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpmax_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpmax_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpmax_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vpmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vpmin_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vpmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vpmin_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vpmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vpmin_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vpmin_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vpmin_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpmin_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpmin_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vpmin_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vpmin_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vpmin_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vpmin_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqabsq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqabsq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqabsq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqabsq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqabsq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqabsq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqabs_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqabs_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqabs_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqabs_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqabs_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqabs_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqabs_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqabs_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqabs_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlal_s32(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlal_s32(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlal_s16(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlal_s16(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlal_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlsl_s32(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlsl_s32(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlsl_s16(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlsl_s16(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int32x2_t) {__p2, __p2}, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__rev0, (int8x8_t)__rev1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmlsl_v((int8x16_t)__p0, (int8x8_t)__p1, (int8x8_t)(int16x4_t) {__p2, __p2, __p2, __p2}, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vqdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmulhq_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmulhq_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = vqdmulhq_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqdmulhq_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = vqdmulh_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqdmulh_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = vqdmulh_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqdmulh_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)(int32x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)(int32x4_t) {__p1, __p1, __p1, __p1}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__p0, (int8x16_t)(int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1}, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqdmulhq_v((int8x16_t)__rev0, (int8x16_t)(int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1}, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)(int32x2_t) {__p1, __p1}, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqdmulh_v((int8x8_t)__rev0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmull_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmull_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vqdmull_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmull_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmull_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmull_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vqdmull_n_s32(int32x2_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 35);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__rev0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqdmull_n_s16(int16x4_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqdmull_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqmovn_u32(uint32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqmovn_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vqmovn_u32(uint32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqmovn_u64(uint64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqmovn_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vqmovn_u64(uint64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqmovn_u16(uint16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqmovn_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vqmovn_u16(uint16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqmovn_s32(int32x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqmovn_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqmovn_s32(int32x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqmovn_s64(int64x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqmovn_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqmovn_s64(int64x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqmovn_s16(int16x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqmovn_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vqmovn_s16(int16x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqmovn_v((int8x16_t)__p0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqmovun_s32(int32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqmovun_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovun_v((int8x16_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vqmovun_s32(int32x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqmovun_s64(int64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqmovun_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovun_v((int8x16_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vqmovun_s64(int64x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqmovun_s16(int16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqmovun_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovun_v((int8x16_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vqmovun_s16(int16x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqmovun_v((int8x16_t)__p0, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqnegq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqnegq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqnegq_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqnegq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqnegq_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqnegq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqneg_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqneg_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqneg_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqneg_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqneg_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqneg_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqneg_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqneg_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqneg_v((int8x8_t)__rev0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqrdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqrdmulhq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqrdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vqrdmulhq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqrdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqrdmulh_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqrdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqrdmulh_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqrdmulhq_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulhq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqrdmulhq_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = vqrdmulhq_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulhq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqrdmulhq_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = vqrdmulh_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulh_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqrdmulh_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = vqrdmulh_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulh_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqrdmulh_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)(int32x4_t) {__p1, __p1, __p1, __p1}, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqrdmulhq_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)(int32x4_t) {__p1, __p1, __p1, __p1}, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__p0, (int8x16_t)(int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1}, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqrdmulhq_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrdmulhq_v((int8x16_t)__rev0, (int8x16_t)(int16x8_t) {__p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1}, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)(int32x2_t) {__p1, __p1}, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqrdmulh_n_s32(int32x2_t __p0, int32_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)(int32x2_t) {__p1, __p1}, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__p0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqrdmulh_n_s16(int16x4_t __p0, int16_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrdmulh_v((int8x8_t)__rev0, (int8x8_t)(int16x4_t) {__p1, __p1, __p1, __p1}, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vqrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vqrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vqrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vqrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vqrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vqrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqrshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqrshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqrshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqrshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqrshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqrshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vqrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqrshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqrshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqrshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqrshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqrshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqrshrun_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vqshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vqshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vqshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vqshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vqshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vqshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vqshlq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vqshlq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vqshlq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vqshlq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 32); \
+  __ret; \
+})
+#else
+#define vqshlq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vqshlq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vqshlq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vqshlq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vqshlq_n_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqshl_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqshl_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vqshl_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqshl_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vqshl_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vqshl_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vqshl_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshl_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vqshl_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshl_n_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vqshluq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vqshluq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vqshluq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vqshluq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vqshluq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vqshluq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlu_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqshlu_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshlu_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlu_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqshlu_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshlu_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlu_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vqshlu_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlu_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshlu_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqshlu_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshlu_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vqshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vqshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vqshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vqshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vqshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrun_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrun_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vqshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrun_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrun_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vqshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrun_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vqshrun_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vqshrun_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vqsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vqsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vqsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vqsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vqsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vqsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqsubq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqsubq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vqsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vqsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vqsubq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vqsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vqsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vqsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vqsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vqsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vqsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vqsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vqsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqsub_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqsub_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vqsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqsub_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vqsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vqsub_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vraddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vraddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vraddhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vraddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vraddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vraddhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vraddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vraddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vraddhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vraddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vraddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vraddhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vraddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vraddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vraddhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vraddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vraddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vraddhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vraddhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrecpeq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrecpeq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrecpeq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrecpeq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrecpe_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrecpe_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrecpe_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrecpe_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrecpsq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrecpsq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrecps_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrecps_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrecps_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vrev16_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai poly8x8_t vrev16_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vrev16q_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  return __ret;
+}
+#else
+__ai poly8x16_t vrev16q_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrev16q_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrev16q_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrev16q_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  return __ret;
+}
+#else
+__ai int8x16_t vrev16q_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrev16_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrev16_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrev16_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai int8x8_t vrev16_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vrev32_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vrev32_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vrev32_p16(poly16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai poly16x4_t vrev32_p16(poly16x4_t __p0) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vrev32q_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  return __ret;
+}
+#else
+__ai poly8x16_t vrev32q_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vrev32q_p16(poly16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai poly16x8_t vrev32q_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrev32q_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrev32q_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrev32q_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai uint16x8_t vrev32q_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrev32q_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  return __ret;
+}
+#else
+__ai int8x16_t vrev32q_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrev32q_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2, 5, 4, 7, 6);
+  return __ret;
+}
+#else
+__ai int16x8_t vrev32q_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2, 5, 4, 7, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrev32_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrev32_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrev32_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrev32_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrev32_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai int8x8_t vrev32_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrev32_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai int16x4_t vrev32_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vrev64_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vrev64_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vrev64_p16(poly16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vrev64_p16(poly16x4_t __p0) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vrev64q_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  return __ret;
+}
+#else
+__ai poly8x16_t vrev64q_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vrev64q_p16(poly16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai poly16x8_t vrev64q_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrev64q_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrev64q_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrev64q_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrev64q_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrev64q_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai uint16x8_t vrev64q_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrev64q_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  return __ret;
+}
+#else
+__ai int8x16_t vrev64q_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrev64q_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai float32x4_t vrev64q_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vrev64q_s32(int32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0, 3, 2);
+  return __ret;
+}
+#else
+__ai int32x4_t vrev64q_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0, 3, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrev64q_s16(int16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4);
+  return __ret;
+}
+#else
+__ai int16x8_t vrev64q_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrev64_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrev64_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrev64_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrev64_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrev64_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrev64_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrev64_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrev64_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrev64_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0);
+  return __ret;
+}
+#else
+__ai float32x2_t vrev64_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vrev64_s32(int32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1, 0);
+  return __ret;
+}
+#else
+__ai int32x2_t vrev64_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrev64_s16(int16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  return __ret;
+}
+#else
+__ai int16x4_t vrev64_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrhaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrhaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vrhaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vrhaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vrhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vrhaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vrhaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vrhaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrhadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrhadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrhadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrhadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrhadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vrhadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vrhadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrhadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vrhadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrhadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vrshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vrshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vrshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vrshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vrshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vrshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vrshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vrshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vrshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vrshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vrshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vrshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vrshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vrshrq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vrshrq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vrshrq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vrshrq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 32); \
+  __ret; \
+})
+#else
+#define vrshrq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vrshrq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vrshrq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vrshrq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vrshrq_n_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vrshr_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vrshr_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vrshr_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vrshr_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vrshr_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vrshr_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vrshr_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshr_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vrshr_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshr_n_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vrshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrsqrteq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vrsqrteq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrsqrteq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrsqrteq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrsqrte_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrsqrte_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrsqrte_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrsqrte_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrsqrtsq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrsqrtsq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrsqrts_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrsqrts_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrsqrts_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vrsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vrsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vrsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vrsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vrsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vrsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vrsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vrsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vrsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vrsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vrsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vrsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vrsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vrsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vrsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vrsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vrsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vrsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vrsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vrsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vrsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vrsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vrsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vrsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vrsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vrsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vrsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vrsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vrsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vrsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vrsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vrsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsetq_lane_i8(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vsetq_lane_f32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vsetq_lane_f32(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vsetq_lane_f32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsetq_lane_i16(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#define __noswap_vset_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vset_lane_i8(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vset_lane_f32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vset_lane_f32(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vset_lane_f32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vset_lane_i32(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#define __noswap_vset_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vset_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vset_lane_i16(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vshlq_u8(uint8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vshlq_u32(uint32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vshlq_u64(uint64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vshlq_u16(uint16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vshlq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vshlq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vshlq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vshlq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vshlq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vshlq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vshl_u8(uint8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vshl_u32(uint32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vshl_u64(uint64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vshl_u16(uint16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vshl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vshl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vshl_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vshl_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vshl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vshl_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vshlq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vshlq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vshlq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vshlq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 32); \
+  __ret; \
+})
+#else
+#define vshlq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vshlq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vshlq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshlq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vshlq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshlq_n_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vshl_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vshl_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vshl_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vshl_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vshl_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vshl_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vshl_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshl_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vshl_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshl_n_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vshll_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 49); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vshll_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 51); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vshll_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 50); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vshll_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 33); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vshll_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 35); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vshll_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshll_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 34); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 48); \
+  __ret; \
+})
+#else
+#define vshrq_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 50); \
+  __ret; \
+})
+#else
+#define vshrq_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vshrq_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 49); \
+  __ret; \
+})
+#else
+#define vshrq_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 32); \
+  __ret; \
+})
+#else
+#define vshrq_n_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 34); \
+  __ret; \
+})
+#else
+#define vshrq_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vshrq_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 33); \
+  __ret; \
+})
+#else
+#define vshrq_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vshrq_n_v((int8x16_t)__rev0, __p1, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vshr_n_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vshr_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vshr_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vshr_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vshr_n_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vshr_n_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vshr_n_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshr_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vshr_n_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshr_n_v((int8x8_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#else
+#define vshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 17); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#else
+#define vshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 18); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#else
+#define vshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 16); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#else
+#define vshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#else
+#define vshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#else
+#define vshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__rev0, __p1, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vshrn_n_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
+  __ret; \
+})
+#else
+#define vsli_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
+  __ret; \
+})
+#else
+#define vsli_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
+  __ret; \
+})
+#else
+#define vsliq_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
+  __ret; \
+})
+#else
+#define vsliq_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vsliq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vsliq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vsliq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vsliq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vsliq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vsliq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vsliq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vsliq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vsli_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vsli_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vsli_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vsli_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vsli_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vsli_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vsli_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vsli_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsli_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsraq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsra_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
+  __ret; \
+})
+#else
+#define vsri_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = (poly8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 4); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
+  __ret; \
+})
+#else
+#define vsri_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = (poly16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 5); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
+  __ret; \
+})
+#else
+#define vsriq_n_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = (poly8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
+  __ret; \
+})
+#else
+#define vsriq_n_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = (poly16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
+  __ret; \
+})
+#else
+#define vsriq_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = (uint8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
+  __ret; \
+})
+#else
+#define vsriq_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = (uint32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 50); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
+  __ret; \
+})
+#else
+#define vsriq_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
+  __ret; \
+})
+#else
+#define vsriq_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = (uint16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \
+  __ret; \
+})
+#else
+#define vsriq_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = (int8x16_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
+  __ret; \
+})
+#else
+#define vsriq_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = (int32x4_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 34); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
+  __ret; \
+})
+#else
+#define vsriq_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
+  __ret; \
+})
+#else
+#define vsriq_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = (int16x8_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
+  __ret; \
+})
+#else
+#define vsri_n_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = (uint8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 16); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
+  __ret; \
+})
+#else
+#define vsri_n_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = (uint32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 18); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#else
+#define vsri_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64x1_t __s1 = __p1; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
+  __ret; \
+})
+#else
+#define vsri_n_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = (uint16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 17); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 0); \
+  __ret; \
+})
+#else
+#define vsri_n_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = (int8x8_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 0); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
+  __ret; \
+})
+#else
+#define vsri_n_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __ret; \
+  __ret = (int32x2_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#else
+#define vsri_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64x1_t __s1 = __p1; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
+  __ret; \
+})
+#else
+#define vsri_n_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = (int16x4_t) __builtin_neon_vsri_n_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 4); \
+})
+#else
+#define vst1_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 5); \
+})
+#else
+#define vst1_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 36); \
+})
+#else
+#define vst1q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 37); \
+})
+#else
+#define vst1q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 48); \
+})
+#else
+#define vst1q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 50); \
+})
+#else
+#define vst1q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 51); \
+})
+#else
+#define vst1q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 49); \
+})
+#else
+#define vst1q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 32); \
+})
+#else
+#define vst1q_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 41); \
+})
+#else
+#define vst1q_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 40); \
+})
+#else
+#define vst1q_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 34); \
+})
+#else
+#define vst1q_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 35); \
+})
+#else
+#define vst1q_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 33); \
+})
+#else
+#define vst1q_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 16); \
+})
+#else
+#define vst1_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 18); \
+})
+#else
+#define vst1_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 19); \
+})
+#else
+#define vst1_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 17); \
+})
+#else
+#define vst1_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 0); \
+})
+#else
+#define vst1_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 9); \
+})
+#else
+#define vst1_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 8); \
+})
+#else
+#define vst1_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 2); \
+})
+#else
+#define vst1_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 3); \
+})
+#else
+#define vst1_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 1); \
+})
+#else
+#define vst1_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__rev1, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 4); \
+})
+#else
+#define vst1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8_t __s1 = __p1; \
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 5); \
+})
+#else
+#define vst1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4_t __s1 = __p1; \
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 36); \
+})
+#else
+#define vst1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16_t __s1 = __p1; \
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 37); \
+})
+#else
+#define vst1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8_t __s1 = __p1; \
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 48); \
+})
+#else
+#define vst1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16_t __s1 = __p1; \
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 50); \
+})
+#else
+#define vst1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 51); \
+})
+#else
+#define vst1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2_t __s1 = __p1; \
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 49); \
+})
+#else
+#define vst1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 32); \
+})
+#else
+#define vst1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16_t __s1 = __p1; \
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 41); \
+})
+#else
+#define vst1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 40); \
+})
+#else
+#define vst1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8_t __s1 = __p1; \
+  float16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 34); \
+})
+#else
+#define vst1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 35); \
+})
+#else
+#define vst1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2_t __s1 = __p1; \
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 33); \
+})
+#else
+#define vst1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 16); \
+})
+#else
+#define vst1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8_t __s1 = __p1; \
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 18); \
+})
+#else
+#define vst1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
+})
+#else
+#define vst1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 17); \
+})
+#else
+#define vst1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 0); \
+})
+#else
+#define vst1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8_t __s1 = __p1; \
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 9); \
+})
+#else
+#define vst1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 8); \
+})
+#else
+#define vst1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4_t __s1 = __p1; \
+  float16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 2); \
+})
+#else
+#define vst1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
+})
+#else
+#define vst1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 1); \
+})
+#else
+#define vst1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__rev1, __p2, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
+})
+#else
+#define vst2_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \
+})
+#else
+#define vst2_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \
+})
+#else
+#define vst2q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \
+})
+#else
+#define vst2q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \
+})
+#else
+#define vst2q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \
+})
+#else
+#define vst2q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \
+})
+#else
+#define vst2q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \
+})
+#else
+#define vst2q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 41); \
+})
+#else
+#define vst2q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 40); \
+})
+#else
+#define vst2q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 34); \
+})
+#else
+#define vst2q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 33); \
+})
+#else
+#define vst2q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \
+})
+#else
+#define vst2_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \
+})
+#else
+#define vst2_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#else
+#define vst2_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \
+})
+#else
+#define vst2_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_s8(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \
+})
+#else
+#define vst2_s8(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_f32(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 9); \
+})
+#else
+#define vst2_f32(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_v(__p0, __rev1.val[0], __rev1.val[1], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_f16(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 8); \
+})
+#else
+#define vst2_f16(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, __rev1.val[0], __rev1.val[1], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_s32(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 2); \
+})
+#else
+#define vst2_s32(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_v(__p0, __rev1.val[0], __rev1.val[1], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_s64(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#else
+#define vst2_s64(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_s16(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 1); \
+})
+#else
+#define vst2_s16(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_v(__p0, __rev1.val[0], __rev1.val[1], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 4); \
+})
+#else
+#define vst2_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 5); \
+})
+#else
+#define vst2_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 37); \
+})
+#else
+#define vst2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 50); \
+})
+#else
+#define vst2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 49); \
+})
+#else
+#define vst2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 41); \
+})
+#else
+#define vst2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 40); \
+})
+#else
+#define vst2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 34); \
+})
+#else
+#define vst2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 33); \
+})
+#else
+#define vst2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 16); \
+})
+#else
+#define vst2_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 18); \
+})
+#else
+#define vst2_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 17); \
+})
+#else
+#define vst2_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 0); \
+})
+#else
+#define vst2_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], __p2, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 9); \
+})
+#else
+#define vst2_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 8); \
+})
+#else
+#define vst2_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 2); \
+})
+#else
+#define vst2_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 1); \
+})
+#else
+#define vst2_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst2_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \
+})
+#else
+#define vst3_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \
+})
+#else
+#define vst3_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \
+})
+#else
+#define vst3q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \
+})
+#else
+#define vst3q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \
+})
+#else
+#define vst3q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \
+})
+#else
+#define vst3q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \
+})
+#else
+#define vst3q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \
+})
+#else
+#define vst3q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 41); \
+})
+#else
+#define vst3q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 40); \
+})
+#else
+#define vst3q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 34); \
+})
+#else
+#define vst3q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 33); \
+})
+#else
+#define vst3q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \
+})
+#else
+#define vst3_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \
+})
+#else
+#define vst3_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#else
+#define vst3_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \
+})
+#else
+#define vst3_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_s8(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \
+})
+#else
+#define vst3_s8(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_f32(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 9); \
+})
+#else
+#define vst3_f32(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_f16(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 8); \
+})
+#else
+#define vst3_f16(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_s32(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 2); \
+})
+#else
+#define vst3_s32(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_s64(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#else
+#define vst3_s64(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_s16(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 1); \
+})
+#else
+#define vst3_s16(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 4); \
+})
+#else
+#define vst3_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 5); \
+})
+#else
+#define vst3_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 37); \
+})
+#else
+#define vst3q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 50); \
+})
+#else
+#define vst3q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 49); \
+})
+#else
+#define vst3q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 41); \
+})
+#else
+#define vst3q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 40); \
+})
+#else
+#define vst3q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 34); \
+})
+#else
+#define vst3q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 33); \
+})
+#else
+#define vst3q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 16); \
+})
+#else
+#define vst3_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 18); \
+})
+#else
+#define vst3_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 17); \
+})
+#else
+#define vst3_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 0); \
+})
+#else
+#define vst3_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], __p2, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 9); \
+})
+#else
+#define vst3_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 8); \
+})
+#else
+#define vst3_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 2); \
+})
+#else
+#define vst3_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 1); \
+})
+#else
+#define vst3_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst3_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \
+})
+#else
+#define vst4_p8(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \
+})
+#else
+#define vst4_p16(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \
+})
+#else
+#define vst4q_p8(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \
+})
+#else
+#define vst4q_p16(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \
+})
+#else
+#define vst4q_u8(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \
+})
+#else
+#define vst4q_u32(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \
+})
+#else
+#define vst4q_u16(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \
+})
+#else
+#define vst4q_s8(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 41); \
+})
+#else
+#define vst4q_f32(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 40); \
+})
+#else
+#define vst4q_f16(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 34); \
+})
+#else
+#define vst4q_s32(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 33); \
+})
+#else
+#define vst4q_s16(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \
+})
+#else
+#define vst4_u8(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \
+})
+#else
+#define vst4_u32(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#else
+#define vst4_u64(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \
+})
+#else
+#define vst4_u16(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_s8(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \
+})
+#else
+#define vst4_s8(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_f32(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 9); \
+})
+#else
+#define vst4_f32(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_f16(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 8); \
+})
+#else
+#define vst4_f16(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_s32(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 2); \
+})
+#else
+#define vst4_s32(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_s64(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#else
+#define vst4_s64(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_s16(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 1); \
+})
+#else
+#define vst4_s16(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 4); \
+})
+#else
+#define vst4_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 5); \
+})
+#else
+#define vst4_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 37); \
+})
+#else
+#define vst4q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 50); \
+})
+#else
+#define vst4q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 49); \
+})
+#else
+#define vst4q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 41); \
+})
+#else
+#define vst4q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 40); \
+})
+#else
+#define vst4q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 34); \
+})
+#else
+#define vst4q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 33); \
+})
+#else
+#define vst4q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 16); \
+})
+#else
+#define vst4_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 18); \
+})
+#else
+#define vst4_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 17); \
+})
+#else
+#define vst4_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 0); \
+})
+#else
+#define vst4_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], __p2, 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 9); \
+})
+#else
+#define vst4_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 8); \
+})
+#else
+#define vst4_lane_f16(__p0, __p1, __p2) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 2); \
+})
+#else
+#define vst4_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 1); \
+})
+#else
+#define vst4_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst4_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint8x16_t vsubq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int8x16_t vsubq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vsubq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float32x4_t vsubq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int32x4_t vsubq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int64x2_t vsubq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int16x8_t vsubq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint8x8_t vsub_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint32x2_t vsub_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint64x1_t vsub_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai uint16x4_t vsub_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int8x8_t vsub_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vsub_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float32x2_t vsub_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int32x2_t vsub_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vsub_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int64x1_t vsub_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai int16x4_t vsub_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x4_t __noswap_vsubhn_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 17);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint32x2_t __noswap_vsubhn_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 18);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint8x8_t __noswap_vsubhn_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 16);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x4_t __noswap_vsubhn_s32(int32x4_t __p0, int32x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int32x2_t __noswap_vsubhn_s64(int64x2_t __p0, int64x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__rev0, (int8x16_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int8x8_t __noswap_vsubhn_s16(int16x8_t __p0, int16x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vsubhn_v((int8x16_t)__p0, (int8x16_t)__p1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmovl_u8(__p0) - vmovl_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmovl_u8(__rev0) - __noswap_vmovl_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmovl_u32(__p0) - vmovl_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmovl_u32(__rev0) - __noswap_vmovl_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmovl_u16(__p0) - vmovl_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmovl_u16(__rev0) - __noswap_vmovl_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = vmovl_s8(__p0) - vmovl_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vsubl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmovl_s8(__rev0) - __noswap_vmovl_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = vmovl_s32(__p0) - vmovl_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vsubl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmovl_s32(__rev0) - __noswap_vmovl_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = vmovl_s16(__p0) - vmovl_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vsubl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmovl_s16(__rev0) - __noswap_vmovl_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubw_u8(uint16x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 - vmovl_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubw_u8(uint16x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __noswap_vmovl_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubw_u32(uint64x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 - vmovl_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubw_u32(uint64x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __noswap_vmovl_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubw_u16(uint32x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 - vmovl_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubw_u16(uint32x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __noswap_vmovl_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubw_s8(int16x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 - vmovl_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vsubw_s8(int16x8_t __p0, int8x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __noswap_vmovl_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubw_s32(int64x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 - vmovl_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vsubw_s32(int64x2_t __p0, int32x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __noswap_vmovl_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubw_s16(int32x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 - vmovl_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vsubw_s16(int32x4_t __p0, int16x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __noswap_vmovl_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbl1_p8(poly8x8_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbl1_p8(poly8x8_t __p0, uint8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbl1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbl1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbl1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbl1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl1_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbl2_p8(poly8x8x2_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbl2_p8(poly8x8x2_t __p0, uint8x8_t __p1) {
+  poly8x8x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbl2_u8(uint8x8x2_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbl2_u8(uint8x8x2_t __p0, uint8x8_t __p1) {
+  uint8x8x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbl2_s8(int8x8x2_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbl2_s8(int8x8x2_t __p0, int8x8_t __p1) {
+  int8x8x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl2_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbl3_p8(poly8x8x3_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbl3_p8(poly8x8x3_t __p0, uint8x8_t __p1) {
+  poly8x8x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbl3_u8(uint8x8x3_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbl3_u8(uint8x8x3_t __p0, uint8x8_t __p1) {
+  uint8x8x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbl3_s8(int8x8x3_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbl3_s8(int8x8x3_t __p0, int8x8_t __p1) {
+  int8x8x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl3_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbl4_p8(poly8x8x4_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p0.val[3], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbl4_p8(poly8x8x4_t __p0, uint8x8_t __p1) {
+  poly8x8x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev0.val[3], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbl4_u8(uint8x8x4_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p0.val[3], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbl4_u8(uint8x8x4_t __p0, uint8x8_t __p1) {
+  uint8x8x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev0.val[3], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbl4_s8(int8x8x4_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__p0.val[0], (int8x8_t)__p0.val[1], (int8x8_t)__p0.val[2], (int8x8_t)__p0.val[3], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbl4_s8(int8x8x4_t __p0, int8x8_t __p1) {
+  int8x8x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbl4_v((int8x8_t)__rev0.val[0], (int8x8_t)__rev0.val[1], (int8x8_t)__rev0.val[2], (int8x8_t)__rev0.val[3], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbx1_p8(poly8x8_t __p0, poly8x8_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbx1_p8(poly8x8_t __p0, poly8x8_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbx1_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbx1_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbx1_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbx1_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx1_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbx2_p8(poly8x8_t __p0, poly8x8x2_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbx2_p8(poly8x8_t __p0, poly8x8x2_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbx2_u8(uint8x8_t __p0, uint8x8x2_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbx2_u8(uint8x8_t __p0, uint8x8x2_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbx2_s8(int8x8_t __p0, int8x8x2_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbx2_s8(int8x8_t __p0, int8x8x2_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx2_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbx3_p8(poly8x8_t __p0, poly8x8x3_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbx3_p8(poly8x8_t __p0, poly8x8x3_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbx3_u8(uint8x8_t __p0, uint8x8x3_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbx3_u8(uint8x8_t __p0, uint8x8x3_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbx3_s8(int8x8_t __p0, int8x8x3_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbx3_s8(int8x8_t __p0, int8x8x3_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx3_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtbx4_p8(poly8x8_t __p0, poly8x8x4_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p1.val[3], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtbx4_p8(poly8x8_t __p0, poly8x8x4_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtbx4_u8(uint8x8_t __p0, uint8x8x4_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p1.val[3], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtbx4_u8(uint8x8_t __p0, uint8x8x4_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtbx4_s8(int8x8_t __p0, int8x8x4_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__p0, (int8x8_t)__p1.val[0], (int8x8_t)__p1.val[1], (int8x8_t)__p1.val[2], (int8x8_t)__p1.val[3], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vtbx4_s8(int8x8_t __p0, int8x8x4_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vtbx4_v((int8x8_t)__rev0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8x2_t vtrn_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8x2_t vtrn_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4x2_t vtrn_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 5);
+  return __ret;
+}
+#else
+__ai poly16x4x2_t vtrn_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 5);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16x2_t vtrnq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16x2_t vtrnq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8x2_t vtrnq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 37);
+  return __ret;
+}
+#else
+__ai poly16x8x2_t vtrnq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 37);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16x2_t vtrnq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16x2_t vtrnq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4x2_t vtrnq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4x2_t vtrnq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8x2_t vtrnq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8x2_t vtrnq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16x2_t vtrnq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16x2_t vtrnq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4x2_t vtrnq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4x2_t vtrnq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4x2_t vtrnq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4x2_t vtrnq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8x2_t vtrnq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8x2_t vtrnq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8x2_t __ret;
+  __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8x2_t vtrn_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8x2_t vtrn_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2x2_t vtrn_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2x2_t vtrn_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4x2_t vtrn_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4x2_t vtrn_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8x2_t vtrn_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8x2_t vtrn_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2x2_t vtrn_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2x2_t vtrn_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2x2_t vtrn_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2x2_t vtrn_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4x2_t vtrn_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4x2_t vtrn_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4x2_t __ret;
+  __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtst_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtst_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtst_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtst_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtstq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtstq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtstq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtstq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtstq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtstq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vtstq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vtstq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtstq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtstq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtstq_s8(int8x16_t __p0, int8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtstq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vtstq_s32(int32x4_t __p0, int32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vtstq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtstq_s16(int16x8_t __p0, int16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtstq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtst_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtst_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vtst_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vtst_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtst_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtst_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtst_s8(int8x8_t __p0, int8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtst_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vtst_s32(int32x2_t __p0, int32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vtst_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtst_s16(int16x4_t __p0, int16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtst_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vtst_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8x2_t vuzp_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8x2_t vuzp_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4x2_t vuzp_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 5);
+  return __ret;
+}
+#else
+__ai poly16x4x2_t vuzp_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 5);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16x2_t vuzpq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16x2_t vuzpq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8x2_t vuzpq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 37);
+  return __ret;
+}
+#else
+__ai poly16x8x2_t vuzpq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 37);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16x2_t vuzpq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16x2_t vuzpq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4x2_t vuzpq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4x2_t vuzpq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8x2_t vuzpq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8x2_t vuzpq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16x2_t vuzpq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16x2_t vuzpq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4x2_t vuzpq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4x2_t vuzpq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4x2_t vuzpq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4x2_t vuzpq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8x2_t vuzpq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8x2_t vuzpq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8x2_t __ret;
+  __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8x2_t vuzp_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8x2_t vuzp_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2x2_t vuzp_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2x2_t vuzp_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4x2_t vuzp_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4x2_t vuzp_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8x2_t vuzp_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8x2_t vuzp_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2x2_t vuzp_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2x2_t vuzp_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2x2_t vuzp_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2x2_t vuzp_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4x2_t vuzp_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4x2_t vuzp_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4x2_t __ret;
+  __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8x2_t vzip_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8x2_t vzip_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 4);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4x2_t vzip_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 5);
+  return __ret;
+}
+#else
+__ai poly16x4x2_t vzip_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 5);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16x2_t vzipq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16x2_t vzipq_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8x2_t vzipq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 37);
+  return __ret;
+}
+#else
+__ai poly16x8x2_t vzipq_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 37);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16x2_t vzipq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16x2_t vzipq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4x2_t vzipq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4x2_t vzipq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8x2_t vzipq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8x2_t vzipq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16x2_t vzipq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16x2_t vzipq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4x2_t vzipq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4x2_t vzipq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4x2_t vzipq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4x2_t vzipq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8x2_t vzipq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8x2_t vzipq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8x2_t __ret;
+  __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8x2_t vzip_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8x2_t vzip_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2x2_t vzip_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2x2_t vzip_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4x2_t vzip_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4x2_t vzip_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8x2_t vzip_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8x2_t vzip_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2x2_t vzip_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2x2_t vzip_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2x2_t vzip_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2x2_t vzip_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4x2_t vzip_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4x2_t vzip_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4x2_t __ret;
+  __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0);
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#if !defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#endif
+#if (__ARM_FP & 2)
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vcvt_f16_f32(float32x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 8);
+  return __ret;
+}
+#else
+__ai float16x4_t vcvt_f16_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__rev0, 8);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float16x4_t __noswap_vcvt_f16_f32(float32x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t) __builtin_neon_vcvt_f16_f32((int8x16_t)__p0, 8);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvt_f32_f16(float16x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vcvt_f32_f16(float16x4_t __p0) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float32x4_t __noswap_vcvt_f32_f16(float16x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vcvt_f32_f16((int8x8_t)__p0, 41);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtaq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtaq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtaq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtaq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvta_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvta_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvta_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvta_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtaq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtaq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtaq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtaq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvta_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvta_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvta_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvta_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtmq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtmq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtmq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtmq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvtm_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtm_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvtm_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtm_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtmq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtmq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtmq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtmq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvtm_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtm_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvtm_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtm_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtnq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtnq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtnq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtnq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvtn_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtn_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvtn_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtn_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtnq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtnq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtnq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtnq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvtn_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtn_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvtn_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtn_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vcvtpq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtpq_s32_v((int8x16_t)__p0, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vcvtpq_s32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vcvtpq_s32_v((int8x16_t)__rev0, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vcvtp_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtp_s32_v((int8x8_t)__p0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vcvtp_s32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vcvtp_s32_v((int8x8_t)__rev0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcvtpq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtpq_u32_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcvtpq_u32_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcvtpq_u32_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcvtp_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtp_u32_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcvtp_u32_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcvtp_u32_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrnd_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrnd_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrnd_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndaq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndaq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrnda_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrnda_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrnda_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndmq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndmq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndm_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndm_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndm_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndnq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndnq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndn_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndn_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndn_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndpq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndpq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndp_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndp_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndp_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndxq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndxq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndx_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndx_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndx_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vminnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vminnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtaq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtaq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtaq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtaq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvta_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvta_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvta_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvta_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtaq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtaq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtaq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtaq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvta_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvta_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvta_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvta_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtmq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtmq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtmq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtmq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvtm_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtm_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvtm_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtm_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtmq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtmq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtmq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtmq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvtm_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtm_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvtm_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtm_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtnq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtnq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtnq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtnq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvtn_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtn_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvtn_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtn_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtnq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtnq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtnq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtnq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvtn_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtn_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvtn_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtn_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtpq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtpq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtpq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtpq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvtp_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtp_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvtp_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvtp_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtpq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtpq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtpq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtpq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvtp_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtp_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvtp_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvtp_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_p64(poly64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_p64(poly64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u8(uint8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u32(uint32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u64(uint64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_u16(uint16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s8(int8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f64(float64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f64(float64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f32(float32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_f16(float16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s32(int32x2_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s64(int64x1_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_p8(poly8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_p8(poly8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_p16(poly16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_p16(poly16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_u8(uint8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_u8(uint8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_u32(uint32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_u32(uint32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_u64(uint64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_u64(uint64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_u16(uint16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_u16(uint16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_s8(int8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_s8(int8x8_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_f64(float64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_f64(float64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_f32(float32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_f32(float32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_f16(float16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_f16(float16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_s32(int32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_s32(int32x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_s64(int64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_s64(int64x1_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vreinterpret_p64_s16(int16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vreinterpret_p64_s16(int16x4_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_p8(poly8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_p64(poly64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_p64(poly64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u8(uint8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u32(uint32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u64(uint64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_u16(uint16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s8(int8x8_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f64(float64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f64(float64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f32(float32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_f16(float16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s32(int32x2_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s64(int64x1_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) {
+  poly16x4_t __ret;
+  __ret = (poly16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_p128(poly128_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_p128(poly128_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_p64(poly64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_p64(poly64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_p16(poly16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u8(uint8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u32(uint32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u64(uint64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_u16(uint16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s8(int8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f64(float64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f64(float64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f32(float32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_f16(float16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s32(int32x4_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s64(int64x2_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_p8(poly8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_p8(poly8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_p64(poly64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_p64(poly64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_p16(poly16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_p16(poly16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_u8(uint8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_u8(uint8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_u32(uint32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_u32(uint32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_u64(uint64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_u64(uint64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_u16(uint16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_u16(uint16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_s8(int8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_s8(int8x16_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_f64(float64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_f64(float64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_f32(float32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_f32(float32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_f16(float16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_f16(float16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_s32(int32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_s32(int32x4_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_s64(int64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_s64(int64x2_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vreinterpretq_p128_s16(int16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly128_t vreinterpretq_p128_s16(int16x8_t __p0) {
+  poly128_t __ret;
+  __ret = (poly128_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_p8(poly8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_p8(poly8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_p128(poly128_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_p128(poly128_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_p16(poly16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_p16(poly16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_u8(uint8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_u8(uint8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_u32(uint32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_u32(uint32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_u64(uint64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_u64(uint64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_u16(uint16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_u16(uint16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_s8(int8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_s8(int8x16_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_f64(float64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_f64(float64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_f32(float32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_f32(float32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_f16(float16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_f16(float16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_s32(int32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_s32(int32x4_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_s64(int64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_s64(int64x2_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vreinterpretq_p64_s16(int16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x2_t vreinterpretq_p64_s16(int16x8_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_p8(poly8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_p128(poly128_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_p128(poly128_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_p64(poly64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_p64(poly64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u8(uint8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u32(uint32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u64(uint64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_u16(uint16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s8(int8x16_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f64(float64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f64(float64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f32(float32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_f16(float16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s32(int32x4_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s64(int64x2_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) {
+  poly16x8_t __ret;
+  __ret = (poly16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p128(poly128_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p128(poly128_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p64(poly64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p64(poly64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_p16(poly16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u32(uint32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u64(uint64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_u16(uint16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f64(float64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f64(float64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f32(float32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_f16(float16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s32(int32x4_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s64(int64x2_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p8(poly8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p128(poly128_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p128(poly128_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p64(poly64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p64(poly64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_p16(poly16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u8(uint8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u64(uint64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_u16(uint16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s8(int8x16_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f64(float64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f64(float64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_f16(float16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s64(int64x2_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p8(poly8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p128(poly128_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p128(poly128_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p64(poly64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p64(poly64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_p16(poly16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u32(uint32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_u16(uint16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s8(int8x16_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f32(float32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_f16(float16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s32(int32x4_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p8(poly8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p128(poly128_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p128(poly128_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p64(poly64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p64(poly64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u8(uint8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u32(uint32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_u64(uint64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s8(int8x16_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f64(float64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f64(float64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f32(float32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_f16(float16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s32(int32x4_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s64(int64x2_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p8(poly8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p128(poly128_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p128(poly128_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p64(poly64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p64(poly64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_p16(poly16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u8(uint8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u32(uint32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u64(uint64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_u16(uint16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f64(float64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f64(float64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f32(float32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_f16(float16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s32(int32x4_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s64(int64x2_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_p8(poly8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_p8(poly8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_p128(poly128_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_p128(poly128_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_p64(poly64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_p64(poly64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_p16(poly16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_p16(poly16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_u8(uint8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_u8(uint8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_u32(uint32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_u32(uint32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_u64(uint64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_u64(uint64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_u16(uint16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_u16(uint16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_s8(int8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_s8(int8x16_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_f32(float32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_f32(float32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_f16(float16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_f16(float16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_s32(int32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_s32(int32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_s64(int64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_s64(int64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vreinterpretq_f64_s16(int16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x2_t vreinterpretq_f64_s16(int16x8_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p8(poly8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p128(poly128_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p128(poly128_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p64(poly64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p64(poly64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_p16(poly16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u8(uint8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u32(uint32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u64(uint64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_u16(uint16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s8(int8x16_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_f64(float64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_f64(float64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s32(int32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s64(int64x2_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p8(poly8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p128(poly128_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p128(poly128_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p64(poly64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p64(poly64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_p16(poly16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u8(uint8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u32(uint32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u64(uint64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_u16(uint16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s8(int8x16_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_f64(float64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_f64(float64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_f32(float32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s32(int32x4_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s64(int64x2_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) {
+  float16x8_t __ret;
+  __ret = (float16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p8(poly8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p128(poly128_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p128(poly128_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p64(poly64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p64(poly64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_p16(poly16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u8(uint8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u32(uint32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u64(uint64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_u16(uint16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s8(int8x16_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f64(float64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f64(float64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f32(float32x4_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_f16(float16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s64(int64x2_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p8(poly8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p128(poly128_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p128(poly128_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p64(poly64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p64(poly64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_p16(poly16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u8(uint8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u32(uint32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u64(uint64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_u16(uint16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s8(int8x16_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f32(float32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_f16(float16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s32(int32x4_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p128(poly128_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p128(poly128_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p64(poly64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p64(poly64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_p16(poly16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u8(uint8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u32(uint32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u64(uint64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_u16(uint16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s8(int8x16_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f64(float64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f64(float64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f32(float32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_f16(float16x8_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s32(int32x4_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x8_t vreinterpretq_s16_s64(int64x2_t __p0) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p64(poly64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p64(poly64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_p16(poly16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u32(uint32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u64(uint64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_u16(uint16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f64(float64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f64(float64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f32(float32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_f16(float16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s32(int32x2_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s64(int64x1_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p8(poly8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p64(poly64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p64(poly64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_p16(poly16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u8(uint8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u64(uint64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_u16(uint16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s8(int8x8_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f64(float64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f64(float64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_f16(float16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s64(int64x1_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p8(poly8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p64(poly64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p64(poly64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_p16(poly16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u8(uint8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u32(uint32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_u16(uint16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s8(int8x8_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f32(float32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_f16(float16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s32(int32x2_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p8(poly8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p64(poly64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p64(poly64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u8(uint8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u32(uint32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_u64(uint64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s8(int8x8_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f64(float64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f64(float64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f32(float32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_f16(float16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s32(int32x2_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s64(int64x1_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p8(poly8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p64(poly64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p64(poly64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_p16(poly16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u8(uint8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u32(uint32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u64(uint64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_u16(uint16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f64(float64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f64(float64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f32(float32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_f16(float16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s32(int32x2_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s64(int64x1_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#else
+__ai int8x8_t vreinterpret_s8_s16(int16x4_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_p8(poly8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_p8(poly8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_p64(poly64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_p64(poly64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_p16(poly16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_p16(poly16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_u8(uint8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_u8(uint8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_u32(uint32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_u32(uint32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_u64(uint64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_u64(uint64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_u16(uint16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_u16(uint16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_s8(int8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_s8(int8x8_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_f32(float32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_f32(float32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_f16(float16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_f16(float16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_s32(int32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_s32(int32x2_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_s64(int64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_s64(int64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vreinterpret_f64_s16(int16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vreinterpret_f64_s16(int16x4_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p8(poly8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p64(poly64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p64(poly64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_p16(poly16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u8(uint8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u32(uint32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u64(uint64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_u16(uint16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s8(int8x8_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_f64(float64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_f64(float64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_f16(float16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s32(int32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s64(int64x1_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai float32x2_t vreinterpret_f32_s16(int16x4_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p8(poly8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p64(poly64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p64(poly64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_p16(poly16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u8(uint8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u32(uint32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u64(uint64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_u16(uint16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s8(int8x8_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_f64(float64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_f64(float64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_f32(float32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s32(int32x2_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s64(int64x1_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai float16x4_t vreinterpret_f16_s16(int16x4_t __p0) {
+  float16x4_t __ret;
+  __ret = (float16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p8(poly8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p64(poly64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p64(poly64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_p16(poly16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u8(uint8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u32(uint32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u64(uint64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_u16(uint16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s8(int8x8_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f64(float64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f64(float64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f32(float32x2_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_f16(float16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s64(int64x1_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#else
+__ai int32x2_t vreinterpret_s32_s16(int16x4_t __p0) {
+  int32x2_t __ret;
+  __ret = (int32x2_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p8(poly8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p64(poly64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p64(poly64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_p16(poly16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u8(uint8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u32(uint32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u64(uint64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_u16(uint16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s8(int8x8_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f32(float32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_f16(float16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s32(int32x2_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai int64x1_t vreinterpret_s64_s16(int16x4_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p64(poly64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p64(poly64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_p16(poly16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u8(uint8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u32(uint32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u64(uint64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_u16(uint16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s8(int8x8_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f64(float64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f64(float64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f32(float32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_f16(float16x4_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s32(int32x2_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#else
+__ai int16x4_t vreinterpret_s16_s64(int64x1_t __p0) {
+  int16x4_t __ret;
+  __ret = (int16x4_t)(__p0);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__aarch64__) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrnd_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrnd_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndaq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndaq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrnda_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrnda_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndiq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndiq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vrndiq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vrndiq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndi_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndi_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vrndi_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vrndi_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndmq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndmq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndm_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndm_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndnq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndnq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndn_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndn_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndpq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndpq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndp_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndp_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrndxq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrndxq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrndx_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrndx_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_ARCH >= 8 && defined(__aarch64__) && defined(__ARM_FEATURE_NUMERIC_MAXMIN)
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmaxnm_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vmaxnm_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vminnm_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vminnm_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#endif
+#if __ARM_FEATURE_CRYPTO
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vsha1h_u32(uint32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vsha1h_u32(uint32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if defined(__ARM_FEATURE_FMA)
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vfmaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vfmaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float32x4_t __noswap_vfmaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vfma_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __ret;
+  __ret = vfmaq_f32(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float32x4_t vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vfmaq_f32(__rev0, -__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __ret;
+  __ret = vfma_f32(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float32x2_t __ret;
+  __ret = __noswap_vfma_f32(__rev0, -__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#if defined(__ARM_FEATURE_QRDMX)
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrdmlahq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = vqaddq_s32(__p0, vqrdmulhq_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vqrdmlahq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqaddq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrdmlahq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = vqaddq_s16(__p0, vqrdmulhq_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vqrdmlahq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vqaddq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrdmlah_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = vqadd_s32(__p0, vqrdmulh_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x2_t vqrdmlah_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __noswap_vqadd_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrdmlah_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = vqadd_s16(__p0, vqrdmulh_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x4_t vqrdmlah_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __noswap_vqadd_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqaddq_s32(__s0, vqrdmulhq_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlahq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqaddq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = vqaddq_s16(__s0, vqrdmulhq_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlahq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqaddq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlah_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = vqadd_s32(__s0, vqrdmulh_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlah_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqadd_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlah_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = vqadd_s16(__s0, vqrdmulh_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlah_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqadd_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqrdmlshq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = vqsubq_s32(__p0, vqrdmulhq_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vqrdmlshq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqsubq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqrdmlshq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = vqsubq_s16(__p0, vqrdmulhq_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vqrdmlshq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vqsubq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vqrdmlsh_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = vqsub_s32(__p0, vqrdmulh_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x2_t vqrdmlsh_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __noswap_vqsub_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vqrdmlsh_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = vqsub_s16(__p0, vqrdmulh_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x4_t vqrdmlsh_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __noswap_vqsub_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqsubq_s32(__s0, vqrdmulhq_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlshq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqsubq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = vqsubq_s16(__s0, vqrdmulhq_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlshq_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqsubq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlsh_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = vqsub_s32(__s0, vqrdmulh_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlsh_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqsub_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlsh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = vqsub_s16(__s0, vqrdmulh_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlsh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqsub_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#endif
+#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqaddq_s32(__s0, vqrdmulhq_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlahq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqaddq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = vqaddq_s16(__s0, vqrdmulhq_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlahq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqaddq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlah_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = vqadd_s32(__s0, vqrdmulh_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlah_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqadd_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlah_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = vqadd_s16(__s0, vqrdmulh_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlah_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqadd_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqsubq_s32(__s0, vqrdmulhq_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlshq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqsubq_s32(__rev0, __noswap_vqrdmulhq_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = vqsubq_s16(__s0, vqrdmulhq_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlshq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqsubq_s16(__rev0, __noswap_vqrdmulhq_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlsh_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = vqsub_s32(__s0, vqrdmulh_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlsh_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqsub_s32(__rev0, __noswap_vqrdmulh_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlsh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = vqsub_s16(__s0, vqrdmulh_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3))); \
+  __ret; \
+})
+#else
+#define vqrdmlsh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqsub_s16(__rev0, __noswap_vqrdmulh_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3))); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#endif
+#if defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vabdq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vabdq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vabd_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vabd_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vabdd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vabdd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64_t vabdd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vabdd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vabds_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vabds_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float32_t vabds_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vabds_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vabsq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vabsq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabsq_s64(int64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vabsq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vabs_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vabs_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vabs_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vabs_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vabs_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vabsd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vabsd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vabsd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vabsd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vaddq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float64x2_t vaddq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 + __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vadd_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#else
+__ai float64x1_t vadd_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 + __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vaddhn_u32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vaddhn_u32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vaddhn_u64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vaddhn_u64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vaddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vaddhn_u16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint8x16_t vaddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vaddhn_u16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vaddhn_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vaddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vaddhn_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vaddhn_s64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vaddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vaddhn_s64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vaddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vaddhn_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int8x16_t vaddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vaddhn_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vaddlvq_u8(uint8x16_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddlvq_u8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vaddlvq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddlvq_u8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vaddlvq_u32(uint32x4_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddlvq_u32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vaddlvq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddlvq_u32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vaddlvq_u16(uint16x8_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddlvq_u16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vaddlvq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddlvq_u16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vaddlvq_s8(int8x16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddlvq_s8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vaddlvq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddlvq_s8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vaddlvq_s32(int32x4_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddlvq_s32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int64_t vaddlvq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddlvq_s32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vaddlvq_s16(int16x8_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddlvq_s16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vaddlvq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddlvq_s16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vaddlv_u8(uint8x8_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddlv_u8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vaddlv_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddlv_u8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vaddlv_u32(uint32x2_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddlv_u32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vaddlv_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddlv_u32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vaddlv_u16(uint16x4_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddlv_u16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vaddlv_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddlv_u16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vaddlv_s8(int8x8_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddlv_s8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vaddlv_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddlv_s8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vaddlv_s32(int32x2_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddlv_s32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int64_t vaddlv_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddlv_s32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vaddlv_s16(int16x4_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddlv_s16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vaddlv_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddlv_s16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vaddvq_u8(uint8x16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vaddvq_u8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vaddvq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vaddvq_u8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vaddvq_u32(uint32x4_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddvq_u32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vaddvq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddvq_u32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vaddvq_u64(uint64x2_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddvq_u64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vaddvq_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vaddvq_u64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vaddvq_u16(uint16x8_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddvq_u16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vaddvq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddvq_u16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vaddvq_s8(int8x16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vaddvq_s8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vaddvq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vaddvq_s8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vaddvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vaddvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vaddvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vaddvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vaddvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vaddvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vaddvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vaddvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vaddvq_s32(int32x4_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddvq_s32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vaddvq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddvq_s32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vaddvq_s64(int64x2_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddvq_s64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int64_t vaddvq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vaddvq_s64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vaddvq_s16(int16x8_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddvq_s16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vaddvq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddvq_s16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vaddv_u8(uint8x8_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vaddv_u8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vaddv_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vaddv_u8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vaddv_u32(uint32x2_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddv_u32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vaddv_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vaddv_u32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vaddv_u16(uint16x4_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddv_u16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vaddv_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vaddv_u16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vaddv_s8(int8x8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vaddv_s8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vaddv_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vaddv_s8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vaddv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vaddv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vaddv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vaddv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vaddv_s32(int32x2_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddv_s32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vaddv_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vaddv_s32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vaddv_s16(int16x4_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddv_s16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vaddv_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vaddv_s16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vbsl_p64(uint64x1_t __p0, poly64x1_t __p1, poly64x1_t __p2) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 6);
+  return __ret;
+}
+#else
+__ai poly64x1_t vbsl_p64(uint64x1_t __p0, poly64x1_t __p1, poly64x1_t __p2) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 6);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vbslq_p64(uint64x2_t __p0, poly64x2_t __p1, poly64x2_t __p2) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 38);
+  return __ret;
+}
+#else
+__ai poly64x2_t vbslq_p64(uint64x2_t __p0, poly64x2_t __p1, poly64x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 38);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vbslq_f64(uint64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vbslq_f64(uint64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vbsl_f64(uint64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vbsl_f64(uint64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcageq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcageq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcageq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcageq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcage_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcage_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcaged_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaged_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcaged_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaged_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcages_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcages_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcages_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcages_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcagtq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcagtq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcagtq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcagtq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcagt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcagt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcagtd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcagtd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcagtd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcagtd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcagts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcagts_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcagts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcagts_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcaleq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcaleq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcaleq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcaleq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcale_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcale_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcaled_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaled_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcaled_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaled_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcales_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcales_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcales_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcales_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcaltq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcaltq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcaltq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcaltq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcalt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcalt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcaltd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaltd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcaltd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcaltd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcalts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcalts_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcalts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcalts_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceq_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceq_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqq_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqq_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 == __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceq_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceq_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceq_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceq_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceq_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceq_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 == __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vceqd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vceqd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vceqd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vceqd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vceqd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vceqd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vceqd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vceqd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vceqs_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vceqs_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vceqs_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vceqs_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceqz_p8(poly8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceqz_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceqz_p64(poly64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceqz_p64(poly64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceqz_p16(poly16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceqz_p16(poly16x4_t __p0) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqzq_p8(poly8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqzq_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqzq_p64(poly64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqzq_p64(poly64x2_t __p0) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqzq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqzq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqzq_u32(uint32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqzq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqzq_u64(uint64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqzq_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqzq_u16(uint16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqzq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vceqzq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vceqzq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqzq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqzq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqzq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqzq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vceqzq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vceqzq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vceqzq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vceqzq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vceqzq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vceqzq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceqz_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceqz_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceqz_u32(uint32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceqz_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceqz_u64(uint64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceqz_u64(uint64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceqz_u16(uint16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceqz_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vceqz_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vceqz_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceqz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceqz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceqz_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceqz_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vceqz_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vceqz_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vceqz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vceqz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vceqz_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vceqz_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vceqzd_u64(uint64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqzd_u64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vceqzd_u64(uint64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqzd_u64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vceqzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vceqzd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vceqzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vceqzd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vceqzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqzd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vceqzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vceqzd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vceqzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vceqzs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vceqzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vceqzs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgeq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgeq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgeq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgeq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgeq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgeq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 >= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcge_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcge_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcge_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcge_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcge_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcge_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 >= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcged_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcged_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vcged_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcged_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcged_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcged_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcged_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcged_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcged_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcged_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcged_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcged_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcges_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcges_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcges_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcges_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgezq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgezq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgezq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgezq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgezq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgezq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgezq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgezq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgezq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgezq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgezq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgezq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcgez_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcgez_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgez_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgez_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgez_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgez_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgez_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgez_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgez_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgez_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgez_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgez_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcgezd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgezd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcgezd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgezd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcgezd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgezd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcgezd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgezd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcgezs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgezs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcgezs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgezs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 > __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgt_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgt_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgt_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgt_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 > __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcgtd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgtd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vcgtd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgtd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcgtd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcgtd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcgtd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcgtd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcgts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgts_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcgts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgts_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcgtzq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcgtzq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtzq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtzq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtzq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtzq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcgtzq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcgtzq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcgtzq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcgtzq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcgtzq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcgtzq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcgtz_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcgtz_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgtz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgtz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgtz_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgtz_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcgtz_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcgtz_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcgtz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcgtz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcgtz_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcgtz_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcgtzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgtzd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcgtzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcgtzd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcgtzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtzd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcgtzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcgtzd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcgtzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgtzs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcgtzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcgtzs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcleq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcleq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcleq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcleq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcleq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcleq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 <= __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcle_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcle_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcle_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcle_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcle_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcle_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 <= __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcled_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcled_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcled_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcled_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcled_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcled_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vcled_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcled_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcled_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcled_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcled_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcled_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcles_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcles_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vcles_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcles_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vclezq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vclezq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vclezq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vclezq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vclezq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vclezq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vclezq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vclezq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vclezq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vclezq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vclezq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vclezq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vclez_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vclez_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vclez_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclez_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclez_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclez_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclez_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vclez_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vclez_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclez_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclez_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vclez_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vclez_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vclez_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vclezd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vclezd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vclezd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vclezd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vclezd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vclezd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vclezd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vclezd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vclezs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vclezs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vclezs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vclezs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltq_f64(float64x2_t __p0, float64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__rev0 < __rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclt_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclt_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclt_f64(float64x1_t __p0, float64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vclt_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#else
+__ai uint64x1_t vclt_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t)(__p0 < __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcltd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcltd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcltd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcltd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vcltd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcltd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcltd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vcltd_f64(float64_t __p0, float64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vclts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vclts_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vclts_f32(float32_t __p0, float32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vclts_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vcltzq_s8(int8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vcltzq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltzq_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltzq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltzq_f32(float32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltzq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vcltzq_s32(int32x4_t __p0) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vcltzq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcltzq_s64(int64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcltzq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vcltzq_s16(int16x8_t __p0) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vcltzq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vcltz_s8(int8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vcltz_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcltz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcltz_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcltz_f32(float32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcltz_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vcltz_s32(int32x2_t __p0) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vcltz_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcltz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcltz_s64(int64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vcltz_s16(int16x4_t __p0) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vcltz_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcltzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcltzd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcltzd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcltzd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcltzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltzd_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcltzd_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcltzd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcltzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcltzs_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcltzs_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcltzs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vcombine_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  return __ret;
+}
+#else
+__ai poly64x2_t vcombine_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcombine_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  return __ret;
+}
+#else
+__ai float64x2_t vcombine_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_p8(__p0_0, __p1_0, __p2_0, __p3_0) __extension__ ({ \
+  poly8x16_t __s0_0 = __p0_0; \
+  poly8x8_t __s2_0 = __p2_0; \
+  poly8x16_t __ret_0; \
+  __ret_0 = vsetq_lane_p8(vget_lane_p8(__s2_0, __p3_0), __s0_0, __p1_0); \
+  __ret_0; \
+})
+#else
+#define vcopyq_lane_p8(__p0_1, __p1_1, __p2_1, __p3_1) __extension__ ({ \
+  poly8x16_t __s0_1 = __p0_1; \
+  poly8x8_t __s2_1 = __p2_1; \
+  poly8x16_t __rev0_1;  __rev0_1 = __builtin_shufflevector(__s0_1, __s0_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev2_1;  __rev2_1 = __builtin_shufflevector(__s2_1, __s2_1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret_1; \
+  __ret_1 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_1, __p3_1), __rev0_1, __p1_1); \
+  __ret_1 = __builtin_shufflevector(__ret_1, __ret_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_1; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_p16(__p0_2, __p1_2, __p2_2, __p3_2) __extension__ ({ \
+  poly16x8_t __s0_2 = __p0_2; \
+  poly16x4_t __s2_2 = __p2_2; \
+  poly16x8_t __ret_2; \
+  __ret_2 = vsetq_lane_p16(vget_lane_p16(__s2_2, __p3_2), __s0_2, __p1_2); \
+  __ret_2; \
+})
+#else
+#define vcopyq_lane_p16(__p0_3, __p1_3, __p2_3, __p3_3) __extension__ ({ \
+  poly16x8_t __s0_3 = __p0_3; \
+  poly16x4_t __s2_3 = __p2_3; \
+  poly16x8_t __rev0_3;  __rev0_3 = __builtin_shufflevector(__s0_3, __s0_3, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x4_t __rev2_3;  __rev2_3 = __builtin_shufflevector(__s2_3, __s2_3, 3, 2, 1, 0); \
+  poly16x8_t __ret_3; \
+  __ret_3 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_3, __p3_3), __rev0_3, __p1_3); \
+  __ret_3 = __builtin_shufflevector(__ret_3, __ret_3, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_3; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_u8(__p0_4, __p1_4, __p2_4, __p3_4) __extension__ ({ \
+  uint8x16_t __s0_4 = __p0_4; \
+  uint8x8_t __s2_4 = __p2_4; \
+  uint8x16_t __ret_4; \
+  __ret_4 = vsetq_lane_u8(vget_lane_u8(__s2_4, __p3_4), __s0_4, __p1_4); \
+  __ret_4; \
+})
+#else
+#define vcopyq_lane_u8(__p0_5, __p1_5, __p2_5, __p3_5) __extension__ ({ \
+  uint8x16_t __s0_5 = __p0_5; \
+  uint8x8_t __s2_5 = __p2_5; \
+  uint8x16_t __rev0_5;  __rev0_5 = __builtin_shufflevector(__s0_5, __s0_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev2_5;  __rev2_5 = __builtin_shufflevector(__s2_5, __s2_5, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_5; \
+  __ret_5 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_5, __p3_5), __rev0_5, __p1_5); \
+  __ret_5 = __builtin_shufflevector(__ret_5, __ret_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_5; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_u32(__p0_6, __p1_6, __p2_6, __p3_6) __extension__ ({ \
+  uint32x4_t __s0_6 = __p0_6; \
+  uint32x2_t __s2_6 = __p2_6; \
+  uint32x4_t __ret_6; \
+  __ret_6 = vsetq_lane_u32(vget_lane_u32(__s2_6, __p3_6), __s0_6, __p1_6); \
+  __ret_6; \
+})
+#else
+#define vcopyq_lane_u32(__p0_7, __p1_7, __p2_7, __p3_7) __extension__ ({ \
+  uint32x4_t __s0_7 = __p0_7; \
+  uint32x2_t __s2_7 = __p2_7; \
+  uint32x4_t __rev0_7;  __rev0_7 = __builtin_shufflevector(__s0_7, __s0_7, 3, 2, 1, 0); \
+  uint32x2_t __rev2_7;  __rev2_7 = __builtin_shufflevector(__s2_7, __s2_7, 1, 0); \
+  uint32x4_t __ret_7; \
+  __ret_7 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_7, __p3_7), __rev0_7, __p1_7); \
+  __ret_7 = __builtin_shufflevector(__ret_7, __ret_7, 3, 2, 1, 0); \
+  __ret_7; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_u64(__p0_8, __p1_8, __p2_8, __p3_8) __extension__ ({ \
+  uint64x2_t __s0_8 = __p0_8; \
+  uint64x1_t __s2_8 = __p2_8; \
+  uint64x2_t __ret_8; \
+  __ret_8 = vsetq_lane_u64(vget_lane_u64(__s2_8, __p3_8), __s0_8, __p1_8); \
+  __ret_8; \
+})
+#else
+#define vcopyq_lane_u64(__p0_9, __p1_9, __p2_9, __p3_9) __extension__ ({ \
+  uint64x2_t __s0_9 = __p0_9; \
+  uint64x1_t __s2_9 = __p2_9; \
+  uint64x2_t __rev0_9;  __rev0_9 = __builtin_shufflevector(__s0_9, __s0_9, 1, 0); \
+  uint64x2_t __ret_9; \
+  __ret_9 = __noswap_vsetq_lane_u64(__noswap_vget_lane_u64(__s2_9, __p3_9), __rev0_9, __p1_9); \
+  __ret_9 = __builtin_shufflevector(__ret_9, __ret_9, 1, 0); \
+  __ret_9; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_u16(__p0_10, __p1_10, __p2_10, __p3_10) __extension__ ({ \
+  uint16x8_t __s0_10 = __p0_10; \
+  uint16x4_t __s2_10 = __p2_10; \
+  uint16x8_t __ret_10; \
+  __ret_10 = vsetq_lane_u16(vget_lane_u16(__s2_10, __p3_10), __s0_10, __p1_10); \
+  __ret_10; \
+})
+#else
+#define vcopyq_lane_u16(__p0_11, __p1_11, __p2_11, __p3_11) __extension__ ({ \
+  uint16x8_t __s0_11 = __p0_11; \
+  uint16x4_t __s2_11 = __p2_11; \
+  uint16x8_t __rev0_11;  __rev0_11 = __builtin_shufflevector(__s0_11, __s0_11, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2_11;  __rev2_11 = __builtin_shufflevector(__s2_11, __s2_11, 3, 2, 1, 0); \
+  uint16x8_t __ret_11; \
+  __ret_11 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_11, __p3_11), __rev0_11, __p1_11); \
+  __ret_11 = __builtin_shufflevector(__ret_11, __ret_11, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_11; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_s8(__p0_12, __p1_12, __p2_12, __p3_12) __extension__ ({ \
+  int8x16_t __s0_12 = __p0_12; \
+  int8x8_t __s2_12 = __p2_12; \
+  int8x16_t __ret_12; \
+  __ret_12 = vsetq_lane_s8(vget_lane_s8(__s2_12, __p3_12), __s0_12, __p1_12); \
+  __ret_12; \
+})
+#else
+#define vcopyq_lane_s8(__p0_13, __p1_13, __p2_13, __p3_13) __extension__ ({ \
+  int8x16_t __s0_13 = __p0_13; \
+  int8x8_t __s2_13 = __p2_13; \
+  int8x16_t __rev0_13;  __rev0_13 = __builtin_shufflevector(__s0_13, __s0_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev2_13;  __rev2_13 = __builtin_shufflevector(__s2_13, __s2_13, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_13; \
+  __ret_13 = __noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_13, __p3_13), __rev0_13, __p1_13); \
+  __ret_13 = __builtin_shufflevector(__ret_13, __ret_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_13; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_f32(__p0_14, __p1_14, __p2_14, __p3_14) __extension__ ({ \
+  float32x4_t __s0_14 = __p0_14; \
+  float32x2_t __s2_14 = __p2_14; \
+  float32x4_t __ret_14; \
+  __ret_14 = vsetq_lane_f32(vget_lane_f32(__s2_14, __p3_14), __s0_14, __p1_14); \
+  __ret_14; \
+})
+#else
+#define vcopyq_lane_f32(__p0_15, __p1_15, __p2_15, __p3_15) __extension__ ({ \
+  float32x4_t __s0_15 = __p0_15; \
+  float32x2_t __s2_15 = __p2_15; \
+  float32x4_t __rev0_15;  __rev0_15 = __builtin_shufflevector(__s0_15, __s0_15, 3, 2, 1, 0); \
+  float32x2_t __rev2_15;  __rev2_15 = __builtin_shufflevector(__s2_15, __s2_15, 1, 0); \
+  float32x4_t __ret_15; \
+  __ret_15 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_15, __p3_15), __rev0_15, __p1_15); \
+  __ret_15 = __builtin_shufflevector(__ret_15, __ret_15, 3, 2, 1, 0); \
+  __ret_15; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_s32(__p0_16, __p1_16, __p2_16, __p3_16) __extension__ ({ \
+  int32x4_t __s0_16 = __p0_16; \
+  int32x2_t __s2_16 = __p2_16; \
+  int32x4_t __ret_16; \
+  __ret_16 = vsetq_lane_s32(vget_lane_s32(__s2_16, __p3_16), __s0_16, __p1_16); \
+  __ret_16; \
+})
+#else
+#define vcopyq_lane_s32(__p0_17, __p1_17, __p2_17, __p3_17) __extension__ ({ \
+  int32x4_t __s0_17 = __p0_17; \
+  int32x2_t __s2_17 = __p2_17; \
+  int32x4_t __rev0_17;  __rev0_17 = __builtin_shufflevector(__s0_17, __s0_17, 3, 2, 1, 0); \
+  int32x2_t __rev2_17;  __rev2_17 = __builtin_shufflevector(__s2_17, __s2_17, 1, 0); \
+  int32x4_t __ret_17; \
+  __ret_17 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_17, __p3_17), __rev0_17, __p1_17); \
+  __ret_17 = __builtin_shufflevector(__ret_17, __ret_17, 3, 2, 1, 0); \
+  __ret_17; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_s64(__p0_18, __p1_18, __p2_18, __p3_18) __extension__ ({ \
+  int64x2_t __s0_18 = __p0_18; \
+  int64x1_t __s2_18 = __p2_18; \
+  int64x2_t __ret_18; \
+  __ret_18 = vsetq_lane_s64(vget_lane_s64(__s2_18, __p3_18), __s0_18, __p1_18); \
+  __ret_18; \
+})
+#else
+#define vcopyq_lane_s64(__p0_19, __p1_19, __p2_19, __p3_19) __extension__ ({ \
+  int64x2_t __s0_19 = __p0_19; \
+  int64x1_t __s2_19 = __p2_19; \
+  int64x2_t __rev0_19;  __rev0_19 = __builtin_shufflevector(__s0_19, __s0_19, 1, 0); \
+  int64x2_t __ret_19; \
+  __ret_19 = __noswap_vsetq_lane_s64(__noswap_vget_lane_s64(__s2_19, __p3_19), __rev0_19, __p1_19); \
+  __ret_19 = __builtin_shufflevector(__ret_19, __ret_19, 1, 0); \
+  __ret_19; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_s16(__p0_20, __p1_20, __p2_20, __p3_20) __extension__ ({ \
+  int16x8_t __s0_20 = __p0_20; \
+  int16x4_t __s2_20 = __p2_20; \
+  int16x8_t __ret_20; \
+  __ret_20 = vsetq_lane_s16(vget_lane_s16(__s2_20, __p3_20), __s0_20, __p1_20); \
+  __ret_20; \
+})
+#else
+#define vcopyq_lane_s16(__p0_21, __p1_21, __p2_21, __p3_21) __extension__ ({ \
+  int16x8_t __s0_21 = __p0_21; \
+  int16x4_t __s2_21 = __p2_21; \
+  int16x8_t __rev0_21;  __rev0_21 = __builtin_shufflevector(__s0_21, __s0_21, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2_21;  __rev2_21 = __builtin_shufflevector(__s2_21, __s2_21, 3, 2, 1, 0); \
+  int16x8_t __ret_21; \
+  __ret_21 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_21, __p3_21), __rev0_21, __p1_21); \
+  __ret_21 = __builtin_shufflevector(__ret_21, __ret_21, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_21; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_p8(__p0_22, __p1_22, __p2_22, __p3_22) __extension__ ({ \
+  poly8x8_t __s0_22 = __p0_22; \
+  poly8x8_t __s2_22 = __p2_22; \
+  poly8x8_t __ret_22; \
+  __ret_22 = vset_lane_p8(vget_lane_p8(__s2_22, __p3_22), __s0_22, __p1_22); \
+  __ret_22; \
+})
+#else
+#define vcopy_lane_p8(__p0_23, __p1_23, __p2_23, __p3_23) __extension__ ({ \
+  poly8x8_t __s0_23 = __p0_23; \
+  poly8x8_t __s2_23 = __p2_23; \
+  poly8x8_t __rev0_23;  __rev0_23 = __builtin_shufflevector(__s0_23, __s0_23, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __rev2_23;  __rev2_23 = __builtin_shufflevector(__s2_23, __s2_23, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret_23; \
+  __ret_23 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_23, __p3_23), __rev0_23, __p1_23); \
+  __ret_23 = __builtin_shufflevector(__ret_23, __ret_23, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_23; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_p16(__p0_24, __p1_24, __p2_24, __p3_24) __extension__ ({ \
+  poly16x4_t __s0_24 = __p0_24; \
+  poly16x4_t __s2_24 = __p2_24; \
+  poly16x4_t __ret_24; \
+  __ret_24 = vset_lane_p16(vget_lane_p16(__s2_24, __p3_24), __s0_24, __p1_24); \
+  __ret_24; \
+})
+#else
+#define vcopy_lane_p16(__p0_25, __p1_25, __p2_25, __p3_25) __extension__ ({ \
+  poly16x4_t __s0_25 = __p0_25; \
+  poly16x4_t __s2_25 = __p2_25; \
+  poly16x4_t __rev0_25;  __rev0_25 = __builtin_shufflevector(__s0_25, __s0_25, 3, 2, 1, 0); \
+  poly16x4_t __rev2_25;  __rev2_25 = __builtin_shufflevector(__s2_25, __s2_25, 3, 2, 1, 0); \
+  poly16x4_t __ret_25; \
+  __ret_25 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_25, __p3_25), __rev0_25, __p1_25); \
+  __ret_25 = __builtin_shufflevector(__ret_25, __ret_25, 3, 2, 1, 0); \
+  __ret_25; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_u8(__p0_26, __p1_26, __p2_26, __p3_26) __extension__ ({ \
+  uint8x8_t __s0_26 = __p0_26; \
+  uint8x8_t __s2_26 = __p2_26; \
+  uint8x8_t __ret_26; \
+  __ret_26 = vset_lane_u8(vget_lane_u8(__s2_26, __p3_26), __s0_26, __p1_26); \
+  __ret_26; \
+})
+#else
+#define vcopy_lane_u8(__p0_27, __p1_27, __p2_27, __p3_27) __extension__ ({ \
+  uint8x8_t __s0_27 = __p0_27; \
+  uint8x8_t __s2_27 = __p2_27; \
+  uint8x8_t __rev0_27;  __rev0_27 = __builtin_shufflevector(__s0_27, __s0_27, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __rev2_27;  __rev2_27 = __builtin_shufflevector(__s2_27, __s2_27, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret_27; \
+  __ret_27 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_27, __p3_27), __rev0_27, __p1_27); \
+  __ret_27 = __builtin_shufflevector(__ret_27, __ret_27, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_27; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_u32(__p0_28, __p1_28, __p2_28, __p3_28) __extension__ ({ \
+  uint32x2_t __s0_28 = __p0_28; \
+  uint32x2_t __s2_28 = __p2_28; \
+  uint32x2_t __ret_28; \
+  __ret_28 = vset_lane_u32(vget_lane_u32(__s2_28, __p3_28), __s0_28, __p1_28); \
+  __ret_28; \
+})
+#else
+#define vcopy_lane_u32(__p0_29, __p1_29, __p2_29, __p3_29) __extension__ ({ \
+  uint32x2_t __s0_29 = __p0_29; \
+  uint32x2_t __s2_29 = __p2_29; \
+  uint32x2_t __rev0_29;  __rev0_29 = __builtin_shufflevector(__s0_29, __s0_29, 1, 0); \
+  uint32x2_t __rev2_29;  __rev2_29 = __builtin_shufflevector(__s2_29, __s2_29, 1, 0); \
+  uint32x2_t __ret_29; \
+  __ret_29 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_29, __p3_29), __rev0_29, __p1_29); \
+  __ret_29 = __builtin_shufflevector(__ret_29, __ret_29, 1, 0); \
+  __ret_29; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_u64(__p0_30, __p1_30, __p2_30, __p3_30) __extension__ ({ \
+  uint64x1_t __s0_30 = __p0_30; \
+  uint64x1_t __s2_30 = __p2_30; \
+  uint64x1_t __ret_30; \
+  __ret_30 = vset_lane_u64(vget_lane_u64(__s2_30, __p3_30), __s0_30, __p1_30); \
+  __ret_30; \
+})
+#else
+#define vcopy_lane_u64(__p0_31, __p1_31, __p2_31, __p3_31) __extension__ ({ \
+  uint64x1_t __s0_31 = __p0_31; \
+  uint64x1_t __s2_31 = __p2_31; \
+  uint64x1_t __ret_31; \
+  __ret_31 = __noswap_vset_lane_u64(__noswap_vget_lane_u64(__s2_31, __p3_31), __s0_31, __p1_31); \
+  __ret_31; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_u16(__p0_32, __p1_32, __p2_32, __p3_32) __extension__ ({ \
+  uint16x4_t __s0_32 = __p0_32; \
+  uint16x4_t __s2_32 = __p2_32; \
+  uint16x4_t __ret_32; \
+  __ret_32 = vset_lane_u16(vget_lane_u16(__s2_32, __p3_32), __s0_32, __p1_32); \
+  __ret_32; \
+})
+#else
+#define vcopy_lane_u16(__p0_33, __p1_33, __p2_33, __p3_33) __extension__ ({ \
+  uint16x4_t __s0_33 = __p0_33; \
+  uint16x4_t __s2_33 = __p2_33; \
+  uint16x4_t __rev0_33;  __rev0_33 = __builtin_shufflevector(__s0_33, __s0_33, 3, 2, 1, 0); \
+  uint16x4_t __rev2_33;  __rev2_33 = __builtin_shufflevector(__s2_33, __s2_33, 3, 2, 1, 0); \
+  uint16x4_t __ret_33; \
+  __ret_33 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_33, __p3_33), __rev0_33, __p1_33); \
+  __ret_33 = __builtin_shufflevector(__ret_33, __ret_33, 3, 2, 1, 0); \
+  __ret_33; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_s8(__p0_34, __p1_34, __p2_34, __p3_34) __extension__ ({ \
+  int8x8_t __s0_34 = __p0_34; \
+  int8x8_t __s2_34 = __p2_34; \
+  int8x8_t __ret_34; \
+  __ret_34 = vset_lane_s8(vget_lane_s8(__s2_34, __p3_34), __s0_34, __p1_34); \
+  __ret_34; \
+})
+#else
+#define vcopy_lane_s8(__p0_35, __p1_35, __p2_35, __p3_35) __extension__ ({ \
+  int8x8_t __s0_35 = __p0_35; \
+  int8x8_t __s2_35 = __p2_35; \
+  int8x8_t __rev0_35;  __rev0_35 = __builtin_shufflevector(__s0_35, __s0_35, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __rev2_35;  __rev2_35 = __builtin_shufflevector(__s2_35, __s2_35, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret_35; \
+  __ret_35 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_35, __p3_35), __rev0_35, __p1_35); \
+  __ret_35 = __builtin_shufflevector(__ret_35, __ret_35, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_35; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_f32(__p0_36, __p1_36, __p2_36, __p3_36) __extension__ ({ \
+  float32x2_t __s0_36 = __p0_36; \
+  float32x2_t __s2_36 = __p2_36; \
+  float32x2_t __ret_36; \
+  __ret_36 = vset_lane_f32(vget_lane_f32(__s2_36, __p3_36), __s0_36, __p1_36); \
+  __ret_36; \
+})
+#else
+#define vcopy_lane_f32(__p0_37, __p1_37, __p2_37, __p3_37) __extension__ ({ \
+  float32x2_t __s0_37 = __p0_37; \
+  float32x2_t __s2_37 = __p2_37; \
+  float32x2_t __rev0_37;  __rev0_37 = __builtin_shufflevector(__s0_37, __s0_37, 1, 0); \
+  float32x2_t __rev2_37;  __rev2_37 = __builtin_shufflevector(__s2_37, __s2_37, 1, 0); \
+  float32x2_t __ret_37; \
+  __ret_37 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_37, __p3_37), __rev0_37, __p1_37); \
+  __ret_37 = __builtin_shufflevector(__ret_37, __ret_37, 1, 0); \
+  __ret_37; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_s32(__p0_38, __p1_38, __p2_38, __p3_38) __extension__ ({ \
+  int32x2_t __s0_38 = __p0_38; \
+  int32x2_t __s2_38 = __p2_38; \
+  int32x2_t __ret_38; \
+  __ret_38 = vset_lane_s32(vget_lane_s32(__s2_38, __p3_38), __s0_38, __p1_38); \
+  __ret_38; \
+})
+#else
+#define vcopy_lane_s32(__p0_39, __p1_39, __p2_39, __p3_39) __extension__ ({ \
+  int32x2_t __s0_39 = __p0_39; \
+  int32x2_t __s2_39 = __p2_39; \
+  int32x2_t __rev0_39;  __rev0_39 = __builtin_shufflevector(__s0_39, __s0_39, 1, 0); \
+  int32x2_t __rev2_39;  __rev2_39 = __builtin_shufflevector(__s2_39, __s2_39, 1, 0); \
+  int32x2_t __ret_39; \
+  __ret_39 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_39, __p3_39), __rev0_39, __p1_39); \
+  __ret_39 = __builtin_shufflevector(__ret_39, __ret_39, 1, 0); \
+  __ret_39; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_s64(__p0_40, __p1_40, __p2_40, __p3_40) __extension__ ({ \
+  int64x1_t __s0_40 = __p0_40; \
+  int64x1_t __s2_40 = __p2_40; \
+  int64x1_t __ret_40; \
+  __ret_40 = vset_lane_s64(vget_lane_s64(__s2_40, __p3_40), __s0_40, __p1_40); \
+  __ret_40; \
+})
+#else
+#define vcopy_lane_s64(__p0_41, __p1_41, __p2_41, __p3_41) __extension__ ({ \
+  int64x1_t __s0_41 = __p0_41; \
+  int64x1_t __s2_41 = __p2_41; \
+  int64x1_t __ret_41; \
+  __ret_41 = __noswap_vset_lane_s64(__noswap_vget_lane_s64(__s2_41, __p3_41), __s0_41, __p1_41); \
+  __ret_41; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_s16(__p0_42, __p1_42, __p2_42, __p3_42) __extension__ ({ \
+  int16x4_t __s0_42 = __p0_42; \
+  int16x4_t __s2_42 = __p2_42; \
+  int16x4_t __ret_42; \
+  __ret_42 = vset_lane_s16(vget_lane_s16(__s2_42, __p3_42), __s0_42, __p1_42); \
+  __ret_42; \
+})
+#else
+#define vcopy_lane_s16(__p0_43, __p1_43, __p2_43, __p3_43) __extension__ ({ \
+  int16x4_t __s0_43 = __p0_43; \
+  int16x4_t __s2_43 = __p2_43; \
+  int16x4_t __rev0_43;  __rev0_43 = __builtin_shufflevector(__s0_43, __s0_43, 3, 2, 1, 0); \
+  int16x4_t __rev2_43;  __rev2_43 = __builtin_shufflevector(__s2_43, __s2_43, 3, 2, 1, 0); \
+  int16x4_t __ret_43; \
+  __ret_43 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_43, __p3_43), __rev0_43, __p1_43); \
+  __ret_43 = __builtin_shufflevector(__ret_43, __ret_43, 3, 2, 1, 0); \
+  __ret_43; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_p8(__p0_44, __p1_44, __p2_44, __p3_44) __extension__ ({ \
+  poly8x16_t __s0_44 = __p0_44; \
+  poly8x16_t __s2_44 = __p2_44; \
+  poly8x16_t __ret_44; \
+  __ret_44 = vsetq_lane_p8(vgetq_lane_p8(__s2_44, __p3_44), __s0_44, __p1_44); \
+  __ret_44; \
+})
+#else
+#define vcopyq_laneq_p8(__p0_45, __p1_45, __p2_45, __p3_45) __extension__ ({ \
+  poly8x16_t __s0_45 = __p0_45; \
+  poly8x16_t __s2_45 = __p2_45; \
+  poly8x16_t __rev0_45;  __rev0_45 = __builtin_shufflevector(__s0_45, __s0_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev2_45;  __rev2_45 = __builtin_shufflevector(__s2_45, __s2_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret_45; \
+  __ret_45 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_45, __p3_45), __rev0_45, __p1_45); \
+  __ret_45 = __builtin_shufflevector(__ret_45, __ret_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_45; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_p16(__p0_46, __p1_46, __p2_46, __p3_46) __extension__ ({ \
+  poly16x8_t __s0_46 = __p0_46; \
+  poly16x8_t __s2_46 = __p2_46; \
+  poly16x8_t __ret_46; \
+  __ret_46 = vsetq_lane_p16(vgetq_lane_p16(__s2_46, __p3_46), __s0_46, __p1_46); \
+  __ret_46; \
+})
+#else
+#define vcopyq_laneq_p16(__p0_47, __p1_47, __p2_47, __p3_47) __extension__ ({ \
+  poly16x8_t __s0_47 = __p0_47; \
+  poly16x8_t __s2_47 = __p2_47; \
+  poly16x8_t __rev0_47;  __rev0_47 = __builtin_shufflevector(__s0_47, __s0_47, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __rev2_47;  __rev2_47 = __builtin_shufflevector(__s2_47, __s2_47, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret_47; \
+  __ret_47 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_47, __p3_47), __rev0_47, __p1_47); \
+  __ret_47 = __builtin_shufflevector(__ret_47, __ret_47, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_47; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_u8(__p0_48, __p1_48, __p2_48, __p3_48) __extension__ ({ \
+  uint8x16_t __s0_48 = __p0_48; \
+  uint8x16_t __s2_48 = __p2_48; \
+  uint8x16_t __ret_48; \
+  __ret_48 = vsetq_lane_u8(vgetq_lane_u8(__s2_48, __p3_48), __s0_48, __p1_48); \
+  __ret_48; \
+})
+#else
+#define vcopyq_laneq_u8(__p0_49, __p1_49, __p2_49, __p3_49) __extension__ ({ \
+  uint8x16_t __s0_49 = __p0_49; \
+  uint8x16_t __s2_49 = __p2_49; \
+  uint8x16_t __rev0_49;  __rev0_49 = __builtin_shufflevector(__s0_49, __s0_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev2_49;  __rev2_49 = __builtin_shufflevector(__s2_49, __s2_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_49; \
+  __ret_49 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_49, __p3_49), __rev0_49, __p1_49); \
+  __ret_49 = __builtin_shufflevector(__ret_49, __ret_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_49; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_u32(__p0_50, __p1_50, __p2_50, __p3_50) __extension__ ({ \
+  uint32x4_t __s0_50 = __p0_50; \
+  uint32x4_t __s2_50 = __p2_50; \
+  uint32x4_t __ret_50; \
+  __ret_50 = vsetq_lane_u32(vgetq_lane_u32(__s2_50, __p3_50), __s0_50, __p1_50); \
+  __ret_50; \
+})
+#else
+#define vcopyq_laneq_u32(__p0_51, __p1_51, __p2_51, __p3_51) __extension__ ({ \
+  uint32x4_t __s0_51 = __p0_51; \
+  uint32x4_t __s2_51 = __p2_51; \
+  uint32x4_t __rev0_51;  __rev0_51 = __builtin_shufflevector(__s0_51, __s0_51, 3, 2, 1, 0); \
+  uint32x4_t __rev2_51;  __rev2_51 = __builtin_shufflevector(__s2_51, __s2_51, 3, 2, 1, 0); \
+  uint32x4_t __ret_51; \
+  __ret_51 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_51, __p3_51), __rev0_51, __p1_51); \
+  __ret_51 = __builtin_shufflevector(__ret_51, __ret_51, 3, 2, 1, 0); \
+  __ret_51; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_u64(__p0_52, __p1_52, __p2_52, __p3_52) __extension__ ({ \
+  uint64x2_t __s0_52 = __p0_52; \
+  uint64x2_t __s2_52 = __p2_52; \
+  uint64x2_t __ret_52; \
+  __ret_52 = vsetq_lane_u64(vgetq_lane_u64(__s2_52, __p3_52), __s0_52, __p1_52); \
+  __ret_52; \
+})
+#else
+#define vcopyq_laneq_u64(__p0_53, __p1_53, __p2_53, __p3_53) __extension__ ({ \
+  uint64x2_t __s0_53 = __p0_53; \
+  uint64x2_t __s2_53 = __p2_53; \
+  uint64x2_t __rev0_53;  __rev0_53 = __builtin_shufflevector(__s0_53, __s0_53, 1, 0); \
+  uint64x2_t __rev2_53;  __rev2_53 = __builtin_shufflevector(__s2_53, __s2_53, 1, 0); \
+  uint64x2_t __ret_53; \
+  __ret_53 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_53, __p3_53), __rev0_53, __p1_53); \
+  __ret_53 = __builtin_shufflevector(__ret_53, __ret_53, 1, 0); \
+  __ret_53; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_u16(__p0_54, __p1_54, __p2_54, __p3_54) __extension__ ({ \
+  uint16x8_t __s0_54 = __p0_54; \
+  uint16x8_t __s2_54 = __p2_54; \
+  uint16x8_t __ret_54; \
+  __ret_54 = vsetq_lane_u16(vgetq_lane_u16(__s2_54, __p3_54), __s0_54, __p1_54); \
+  __ret_54; \
+})
+#else
+#define vcopyq_laneq_u16(__p0_55, __p1_55, __p2_55, __p3_55) __extension__ ({ \
+  uint16x8_t __s0_55 = __p0_55; \
+  uint16x8_t __s2_55 = __p2_55; \
+  uint16x8_t __rev0_55;  __rev0_55 = __builtin_shufflevector(__s0_55, __s0_55, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2_55;  __rev2_55 = __builtin_shufflevector(__s2_55, __s2_55, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret_55; \
+  __ret_55 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_55, __p3_55), __rev0_55, __p1_55); \
+  __ret_55 = __builtin_shufflevector(__ret_55, __ret_55, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_55; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_s8(__p0_56, __p1_56, __p2_56, __p3_56) __extension__ ({ \
+  int8x16_t __s0_56 = __p0_56; \
+  int8x16_t __s2_56 = __p2_56; \
+  int8x16_t __ret_56; \
+  __ret_56 = vsetq_lane_s8(vgetq_lane_s8(__s2_56, __p3_56), __s0_56, __p1_56); \
+  __ret_56; \
+})
+#else
+#define vcopyq_laneq_s8(__p0_57, __p1_57, __p2_57, __p3_57) __extension__ ({ \
+  int8x16_t __s0_57 = __p0_57; \
+  int8x16_t __s2_57 = __p2_57; \
+  int8x16_t __rev0_57;  __rev0_57 = __builtin_shufflevector(__s0_57, __s0_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev2_57;  __rev2_57 = __builtin_shufflevector(__s2_57, __s2_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_57; \
+  __ret_57 = __noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_57, __p3_57), __rev0_57, __p1_57); \
+  __ret_57 = __builtin_shufflevector(__ret_57, __ret_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_57; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_f32(__p0_58, __p1_58, __p2_58, __p3_58) __extension__ ({ \
+  float32x4_t __s0_58 = __p0_58; \
+  float32x4_t __s2_58 = __p2_58; \
+  float32x4_t __ret_58; \
+  __ret_58 = vsetq_lane_f32(vgetq_lane_f32(__s2_58, __p3_58), __s0_58, __p1_58); \
+  __ret_58; \
+})
+#else
+#define vcopyq_laneq_f32(__p0_59, __p1_59, __p2_59, __p3_59) __extension__ ({ \
+  float32x4_t __s0_59 = __p0_59; \
+  float32x4_t __s2_59 = __p2_59; \
+  float32x4_t __rev0_59;  __rev0_59 = __builtin_shufflevector(__s0_59, __s0_59, 3, 2, 1, 0); \
+  float32x4_t __rev2_59;  __rev2_59 = __builtin_shufflevector(__s2_59, __s2_59, 3, 2, 1, 0); \
+  float32x4_t __ret_59; \
+  __ret_59 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_59, __p3_59), __rev0_59, __p1_59); \
+  __ret_59 = __builtin_shufflevector(__ret_59, __ret_59, 3, 2, 1, 0); \
+  __ret_59; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_s32(__p0_60, __p1_60, __p2_60, __p3_60) __extension__ ({ \
+  int32x4_t __s0_60 = __p0_60; \
+  int32x4_t __s2_60 = __p2_60; \
+  int32x4_t __ret_60; \
+  __ret_60 = vsetq_lane_s32(vgetq_lane_s32(__s2_60, __p3_60), __s0_60, __p1_60); \
+  __ret_60; \
+})
+#else
+#define vcopyq_laneq_s32(__p0_61, __p1_61, __p2_61, __p3_61) __extension__ ({ \
+  int32x4_t __s0_61 = __p0_61; \
+  int32x4_t __s2_61 = __p2_61; \
+  int32x4_t __rev0_61;  __rev0_61 = __builtin_shufflevector(__s0_61, __s0_61, 3, 2, 1, 0); \
+  int32x4_t __rev2_61;  __rev2_61 = __builtin_shufflevector(__s2_61, __s2_61, 3, 2, 1, 0); \
+  int32x4_t __ret_61; \
+  __ret_61 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_61, __p3_61), __rev0_61, __p1_61); \
+  __ret_61 = __builtin_shufflevector(__ret_61, __ret_61, 3, 2, 1, 0); \
+  __ret_61; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_s64(__p0_62, __p1_62, __p2_62, __p3_62) __extension__ ({ \
+  int64x2_t __s0_62 = __p0_62; \
+  int64x2_t __s2_62 = __p2_62; \
+  int64x2_t __ret_62; \
+  __ret_62 = vsetq_lane_s64(vgetq_lane_s64(__s2_62, __p3_62), __s0_62, __p1_62); \
+  __ret_62; \
+})
+#else
+#define vcopyq_laneq_s64(__p0_63, __p1_63, __p2_63, __p3_63) __extension__ ({ \
+  int64x2_t __s0_63 = __p0_63; \
+  int64x2_t __s2_63 = __p2_63; \
+  int64x2_t __rev0_63;  __rev0_63 = __builtin_shufflevector(__s0_63, __s0_63, 1, 0); \
+  int64x2_t __rev2_63;  __rev2_63 = __builtin_shufflevector(__s2_63, __s2_63, 1, 0); \
+  int64x2_t __ret_63; \
+  __ret_63 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_63, __p3_63), __rev0_63, __p1_63); \
+  __ret_63 = __builtin_shufflevector(__ret_63, __ret_63, 1, 0); \
+  __ret_63; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_s16(__p0_64, __p1_64, __p2_64, __p3_64) __extension__ ({ \
+  int16x8_t __s0_64 = __p0_64; \
+  int16x8_t __s2_64 = __p2_64; \
+  int16x8_t __ret_64; \
+  __ret_64 = vsetq_lane_s16(vgetq_lane_s16(__s2_64, __p3_64), __s0_64, __p1_64); \
+  __ret_64; \
+})
+#else
+#define vcopyq_laneq_s16(__p0_65, __p1_65, __p2_65, __p3_65) __extension__ ({ \
+  int16x8_t __s0_65 = __p0_65; \
+  int16x8_t __s2_65 = __p2_65; \
+  int16x8_t __rev0_65;  __rev0_65 = __builtin_shufflevector(__s0_65, __s0_65, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2_65;  __rev2_65 = __builtin_shufflevector(__s2_65, __s2_65, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret_65; \
+  __ret_65 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_65, __p3_65), __rev0_65, __p1_65); \
+  __ret_65 = __builtin_shufflevector(__ret_65, __ret_65, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_65; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_p8(__p0_66, __p1_66, __p2_66, __p3_66) __extension__ ({ \
+  poly8x8_t __s0_66 = __p0_66; \
+  poly8x16_t __s2_66 = __p2_66; \
+  poly8x8_t __ret_66; \
+  __ret_66 = vset_lane_p8(vgetq_lane_p8(__s2_66, __p3_66), __s0_66, __p1_66); \
+  __ret_66; \
+})
+#else
+#define vcopy_laneq_p8(__p0_67, __p1_67, __p2_67, __p3_67) __extension__ ({ \
+  poly8x8_t __s0_67 = __p0_67; \
+  poly8x16_t __s2_67 = __p2_67; \
+  poly8x8_t __rev0_67;  __rev0_67 = __builtin_shufflevector(__s0_67, __s0_67, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __rev2_67;  __rev2_67 = __builtin_shufflevector(__s2_67, __s2_67, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret_67; \
+  __ret_67 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_67, __p3_67), __rev0_67, __p1_67); \
+  __ret_67 = __builtin_shufflevector(__ret_67, __ret_67, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_67; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_p16(__p0_68, __p1_68, __p2_68, __p3_68) __extension__ ({ \
+  poly16x4_t __s0_68 = __p0_68; \
+  poly16x8_t __s2_68 = __p2_68; \
+  poly16x4_t __ret_68; \
+  __ret_68 = vset_lane_p16(vgetq_lane_p16(__s2_68, __p3_68), __s0_68, __p1_68); \
+  __ret_68; \
+})
+#else
+#define vcopy_laneq_p16(__p0_69, __p1_69, __p2_69, __p3_69) __extension__ ({ \
+  poly16x4_t __s0_69 = __p0_69; \
+  poly16x8_t __s2_69 = __p2_69; \
+  poly16x4_t __rev0_69;  __rev0_69 = __builtin_shufflevector(__s0_69, __s0_69, 3, 2, 1, 0); \
+  poly16x8_t __rev2_69;  __rev2_69 = __builtin_shufflevector(__s2_69, __s2_69, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x4_t __ret_69; \
+  __ret_69 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_69, __p3_69), __rev0_69, __p1_69); \
+  __ret_69 = __builtin_shufflevector(__ret_69, __ret_69, 3, 2, 1, 0); \
+  __ret_69; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_u8(__p0_70, __p1_70, __p2_70, __p3_70) __extension__ ({ \
+  uint8x8_t __s0_70 = __p0_70; \
+  uint8x16_t __s2_70 = __p2_70; \
+  uint8x8_t __ret_70; \
+  __ret_70 = vset_lane_u8(vgetq_lane_u8(__s2_70, __p3_70), __s0_70, __p1_70); \
+  __ret_70; \
+})
+#else
+#define vcopy_laneq_u8(__p0_71, __p1_71, __p2_71, __p3_71) __extension__ ({ \
+  uint8x8_t __s0_71 = __p0_71; \
+  uint8x16_t __s2_71 = __p2_71; \
+  uint8x8_t __rev0_71;  __rev0_71 = __builtin_shufflevector(__s0_71, __s0_71, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __rev2_71;  __rev2_71 = __builtin_shufflevector(__s2_71, __s2_71, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret_71; \
+  __ret_71 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_71, __p3_71), __rev0_71, __p1_71); \
+  __ret_71 = __builtin_shufflevector(__ret_71, __ret_71, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_71; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_u32(__p0_72, __p1_72, __p2_72, __p3_72) __extension__ ({ \
+  uint32x2_t __s0_72 = __p0_72; \
+  uint32x4_t __s2_72 = __p2_72; \
+  uint32x2_t __ret_72; \
+  __ret_72 = vset_lane_u32(vgetq_lane_u32(__s2_72, __p3_72), __s0_72, __p1_72); \
+  __ret_72; \
+})
+#else
+#define vcopy_laneq_u32(__p0_73, __p1_73, __p2_73, __p3_73) __extension__ ({ \
+  uint32x2_t __s0_73 = __p0_73; \
+  uint32x4_t __s2_73 = __p2_73; \
+  uint32x2_t __rev0_73;  __rev0_73 = __builtin_shufflevector(__s0_73, __s0_73, 1, 0); \
+  uint32x4_t __rev2_73;  __rev2_73 = __builtin_shufflevector(__s2_73, __s2_73, 3, 2, 1, 0); \
+  uint32x2_t __ret_73; \
+  __ret_73 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_73, __p3_73), __rev0_73, __p1_73); \
+  __ret_73 = __builtin_shufflevector(__ret_73, __ret_73, 1, 0); \
+  __ret_73; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_u64(__p0_74, __p1_74, __p2_74, __p3_74) __extension__ ({ \
+  uint64x1_t __s0_74 = __p0_74; \
+  uint64x2_t __s2_74 = __p2_74; \
+  uint64x1_t __ret_74; \
+  __ret_74 = vset_lane_u64(vgetq_lane_u64(__s2_74, __p3_74), __s0_74, __p1_74); \
+  __ret_74; \
+})
+#else
+#define vcopy_laneq_u64(__p0_75, __p1_75, __p2_75, __p3_75) __extension__ ({ \
+  uint64x1_t __s0_75 = __p0_75; \
+  uint64x2_t __s2_75 = __p2_75; \
+  uint64x2_t __rev2_75;  __rev2_75 = __builtin_shufflevector(__s2_75, __s2_75, 1, 0); \
+  uint64x1_t __ret_75; \
+  __ret_75 = __noswap_vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_75, __p3_75), __s0_75, __p1_75); \
+  __ret_75; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_u16(__p0_76, __p1_76, __p2_76, __p3_76) __extension__ ({ \
+  uint16x4_t __s0_76 = __p0_76; \
+  uint16x8_t __s2_76 = __p2_76; \
+  uint16x4_t __ret_76; \
+  __ret_76 = vset_lane_u16(vgetq_lane_u16(__s2_76, __p3_76), __s0_76, __p1_76); \
+  __ret_76; \
+})
+#else
+#define vcopy_laneq_u16(__p0_77, __p1_77, __p2_77, __p3_77) __extension__ ({ \
+  uint16x4_t __s0_77 = __p0_77; \
+  uint16x8_t __s2_77 = __p2_77; \
+  uint16x4_t __rev0_77;  __rev0_77 = __builtin_shufflevector(__s0_77, __s0_77, 3, 2, 1, 0); \
+  uint16x8_t __rev2_77;  __rev2_77 = __builtin_shufflevector(__s2_77, __s2_77, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret_77; \
+  __ret_77 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_77, __p3_77), __rev0_77, __p1_77); \
+  __ret_77 = __builtin_shufflevector(__ret_77, __ret_77, 3, 2, 1, 0); \
+  __ret_77; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s8(__p0_78, __p1_78, __p2_78, __p3_78) __extension__ ({ \
+  int8x8_t __s0_78 = __p0_78; \
+  int8x16_t __s2_78 = __p2_78; \
+  int8x8_t __ret_78; \
+  __ret_78 = vset_lane_s8(vgetq_lane_s8(__s2_78, __p3_78), __s0_78, __p1_78); \
+  __ret_78; \
+})
+#else
+#define vcopy_laneq_s8(__p0_79, __p1_79, __p2_79, __p3_79) __extension__ ({ \
+  int8x8_t __s0_79 = __p0_79; \
+  int8x16_t __s2_79 = __p2_79; \
+  int8x8_t __rev0_79;  __rev0_79 = __builtin_shufflevector(__s0_79, __s0_79, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __rev2_79;  __rev2_79 = __builtin_shufflevector(__s2_79, __s2_79, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret_79; \
+  __ret_79 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_79, __p3_79), __rev0_79, __p1_79); \
+  __ret_79 = __builtin_shufflevector(__ret_79, __ret_79, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_79; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_f32(__p0_80, __p1_80, __p2_80, __p3_80) __extension__ ({ \
+  float32x2_t __s0_80 = __p0_80; \
+  float32x4_t __s2_80 = __p2_80; \
+  float32x2_t __ret_80; \
+  __ret_80 = vset_lane_f32(vgetq_lane_f32(__s2_80, __p3_80), __s0_80, __p1_80); \
+  __ret_80; \
+})
+#else
+#define vcopy_laneq_f32(__p0_81, __p1_81, __p2_81, __p3_81) __extension__ ({ \
+  float32x2_t __s0_81 = __p0_81; \
+  float32x4_t __s2_81 = __p2_81; \
+  float32x2_t __rev0_81;  __rev0_81 = __builtin_shufflevector(__s0_81, __s0_81, 1, 0); \
+  float32x4_t __rev2_81;  __rev2_81 = __builtin_shufflevector(__s2_81, __s2_81, 3, 2, 1, 0); \
+  float32x2_t __ret_81; \
+  __ret_81 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_81, __p3_81), __rev0_81, __p1_81); \
+  __ret_81 = __builtin_shufflevector(__ret_81, __ret_81, 1, 0); \
+  __ret_81; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s32(__p0_82, __p1_82, __p2_82, __p3_82) __extension__ ({ \
+  int32x2_t __s0_82 = __p0_82; \
+  int32x4_t __s2_82 = __p2_82; \
+  int32x2_t __ret_82; \
+  __ret_82 = vset_lane_s32(vgetq_lane_s32(__s2_82, __p3_82), __s0_82, __p1_82); \
+  __ret_82; \
+})
+#else
+#define vcopy_laneq_s32(__p0_83, __p1_83, __p2_83, __p3_83) __extension__ ({ \
+  int32x2_t __s0_83 = __p0_83; \
+  int32x4_t __s2_83 = __p2_83; \
+  int32x2_t __rev0_83;  __rev0_83 = __builtin_shufflevector(__s0_83, __s0_83, 1, 0); \
+  int32x4_t __rev2_83;  __rev2_83 = __builtin_shufflevector(__s2_83, __s2_83, 3, 2, 1, 0); \
+  int32x2_t __ret_83; \
+  __ret_83 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_83, __p3_83), __rev0_83, __p1_83); \
+  __ret_83 = __builtin_shufflevector(__ret_83, __ret_83, 1, 0); \
+  __ret_83; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s64(__p0_84, __p1_84, __p2_84, __p3_84) __extension__ ({ \
+  int64x1_t __s0_84 = __p0_84; \
+  int64x2_t __s2_84 = __p2_84; \
+  int64x1_t __ret_84; \
+  __ret_84 = vset_lane_s64(vgetq_lane_s64(__s2_84, __p3_84), __s0_84, __p1_84); \
+  __ret_84; \
+})
+#else
+#define vcopy_laneq_s64(__p0_85, __p1_85, __p2_85, __p3_85) __extension__ ({ \
+  int64x1_t __s0_85 = __p0_85; \
+  int64x2_t __s2_85 = __p2_85; \
+  int64x2_t __rev2_85;  __rev2_85 = __builtin_shufflevector(__s2_85, __s2_85, 1, 0); \
+  int64x1_t __ret_85; \
+  __ret_85 = __noswap_vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_85, __p3_85), __s0_85, __p1_85); \
+  __ret_85; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_s16(__p0_86, __p1_86, __p2_86, __p3_86) __extension__ ({ \
+  int16x4_t __s0_86 = __p0_86; \
+  int16x8_t __s2_86 = __p2_86; \
+  int16x4_t __ret_86; \
+  __ret_86 = vset_lane_s16(vgetq_lane_s16(__s2_86, __p3_86), __s0_86, __p1_86); \
+  __ret_86; \
+})
+#else
+#define vcopy_laneq_s16(__p0_87, __p1_87, __p2_87, __p3_87) __extension__ ({ \
+  int16x4_t __s0_87 = __p0_87; \
+  int16x8_t __s2_87 = __p2_87; \
+  int16x4_t __rev0_87;  __rev0_87 = __builtin_shufflevector(__s0_87, __s0_87, 3, 2, 1, 0); \
+  int16x8_t __rev2_87;  __rev2_87 = __builtin_shufflevector(__s2_87, __s2_87, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret_87; \
+  __ret_87 = __noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_87, __p3_87), __rev0_87, __p1_87); \
+  __ret_87 = __builtin_shufflevector(__ret_87, __ret_87, 3, 2, 1, 0); \
+  __ret_87; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vcreate_p64(uint64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vcreate_p64(uint64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vcreate_f64(uint64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#else
+__ai float64x1_t vcreate_f64(uint64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t)(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vcvts_f32_s32(int32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvts_f32_s32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vcvts_f32_s32(int32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvts_f32_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vcvts_f32_u32(uint32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvts_f32_u32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vcvts_f32_u32(uint32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvts_f32_u32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcvt_f32_f64(float64x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_f64((int8x16_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vcvt_f32_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_f64((int8x16_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vcvt_f32_f64(float64x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvt_f32_f64((int8x16_t)__p0, 9);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vcvtd_f64_s64(int64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vcvtd_f64_s64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vcvtd_f64_s64(int64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vcvtd_f64_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vcvtd_f64_u64(uint64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vcvtd_f64_u64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vcvtd_f64_u64(uint64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vcvtd_f64_u64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcvtq_f64_u64(uint64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai float64x2_t vcvtq_f64_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcvtq_f64_s64(int64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai float64x2_t vcvtq_f64_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvtq_f64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vcvt_f64_u64(uint64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai float64x1_t vcvt_f64_u64(uint64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vcvt_f64_s64(int64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai float64x1_t vcvt_f64_s64(int64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vcvt_f64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcvt_f64_f32(float32x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvt_f64_f32((int8x8_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vcvt_f64_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvt_f64_f32((int8x8_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float64x2_t __noswap_vcvt_f64_f32(float32x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vcvt_f64_f32((int8x8_t)__p0, 42);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float16x8_t vcvt_high_f16_f32(float16x4_t __p0, float32x4_t __p1) {
+  float16x8_t __ret;
+  __ret = vcombine_f16(__p0, vcvt_f16_f32(__p1));
+  return __ret;
+}
+#else
+__ai float16x8_t vcvt_high_f16_f32(float16x4_t __p0, float32x4_t __p1) {
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float16x8_t __ret;
+  __ret = __noswap_vcombine_f16(__rev0, __noswap_vcvt_f16_f32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvt_high_f32_f16(float16x8_t __p0) {
+  float32x4_t __ret;
+  __ret = vcvt_f32_f16(vget_high_f16(__p0));
+  return __ret;
+}
+#else
+__ai float32x4_t vcvt_high_f32_f16(float16x8_t __p0) {
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vcvt_f32_f16(__noswap_vget_high_f16(__rev0));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvt_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
+  float32x4_t __ret;
+  __ret = vcombine_f32(__p0, vcvt_f32_f64(__p1));
+  return __ret;
+}
+#else
+__ai float32x4_t vcvt_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vcombine_f32(__rev0, __noswap_vcvt_f32_f64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vcvt_high_f64_f32(float32x4_t __p0) {
+  float64x2_t __ret;
+  __ret = vcvt_f64_f32(vget_high_f32(__p0));
+  return __ret;
+}
+#else
+__ai float64x2_t vcvt_high_f64_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float64x2_t __ret;
+  __ret = __noswap_vcvt_f64_f32(__noswap_vget_high_f32(__rev0));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvts_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vcvts_n_f32_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvts_n_f32_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vcvts_n_f32_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvts_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vcvts_n_f32_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvts_n_f32_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vcvts_n_f32_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vcvtq_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vcvtq_n_f64_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vcvt_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vcvt_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vcvt_n_f64_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtd_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvtd_n_f64_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtd_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvtd_n_f64_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vcvtd_n_f64_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvts_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvts_n_s32_f32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvts_n_s32_f32(__p0, __p1) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vcvts_n_s32_f32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vcvtq_n_s64_v((int8x16_t)__s0, __p1, 35); \
+  __ret; \
+})
+#else
+#define vcvtq_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = (int64x2_t) __builtin_neon_vcvtq_n_s64_v((int8x16_t)__rev0, __p1, 35); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vcvt_n_s64_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#else
+#define vcvt_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = (int64x1_t) __builtin_neon_vcvt_n_s64_v((int8x8_t)__s0, __p1, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtd_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtd_n_s64_f64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvtd_n_s64_f64(__p0, __p1) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vcvtd_n_s64_f64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvts_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvts_n_u32_f32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvts_n_u32_f32(__p0, __p1) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vcvts_n_u32_f32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtq_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vcvtq_n_u64_v((int8x16_t)__s0, __p1, 51); \
+  __ret; \
+})
+#else
+#define vcvtq_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = (uint64x2_t) __builtin_neon_vcvtq_n_u64_v((int8x16_t)__rev0, __p1, 51); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvt_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vcvt_n_u64_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#else
+#define vcvt_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = (uint64x1_t) __builtin_neon_vcvt_n_u64_v((int8x8_t)__s0, __p1, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcvtd_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtd_n_u64_f64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vcvtd_n_u64_f64(__p0, __p1) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vcvtd_n_u64_f64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvts_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvts_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvts_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvts_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtd_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtd_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vcvtq_s64_f64(float64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtq_s64_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vcvtq_s64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vcvtq_s64_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vcvt_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvt_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vcvt_s64_f64(float64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vcvt_s64_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvts_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvts_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvts_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvts_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtd_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtd_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vcvtq_u64_f64(float64x2_t __p0) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtq_u64_v((int8x16_t)__p0, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vcvtq_u64_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vcvtq_u64_v((int8x16_t)__rev0, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vcvt_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvt_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vcvt_u64_f64(float64x1_t __p0) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vcvt_u64_v((int8x8_t)__p0, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvtas_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtas_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvtas_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtas_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtad_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtad_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtad_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtad_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvtas_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtas_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvtas_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtas_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtad_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtad_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtad_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtad_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvtms_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtms_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvtms_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtms_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtmd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtmd_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtmd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtmd_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvtms_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtms_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvtms_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtms_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtmd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtmd_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtmd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtmd_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvtns_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtns_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvtns_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtns_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtnd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtnd_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtnd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtnd_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvtns_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtns_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvtns_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtns_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtnd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtnd_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtnd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtnd_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vcvtps_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtps_s32_f32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vcvtps_s32_f32(float32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vcvtps_s32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vcvtpd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtpd_s64_f64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vcvtpd_s64_f64(float64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vcvtpd_s64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vcvtps_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtps_u32_f32(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vcvtps_u32_f32(float32_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vcvtps_u32_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vcvtpd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtpd_u64_f64(__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vcvtpd_u64_f64(float64_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vcvtpd_u64_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vcvtxd_f32_f64(float64_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvtxd_f32_f64(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vcvtxd_f32_f64(float64_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vcvtxd_f32_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vcvtx_f32_f64(float64x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvtx_f32_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float32x2_t vcvtx_f32_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvtx_f32_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vcvtx_f32_f64(float64x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vcvtx_f32_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vcvtx_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
+  float32x4_t __ret;
+  __ret = vcombine_f32(__p0, vcvtx_f32_f64(__p1));
+  return __ret;
+}
+#else
+__ai float32x4_t vcvtx_high_f32_f64(float32x2_t __p0, float64x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vcombine_f32(__rev0, __noswap_vcvtx_f32_f64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vdivq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float64x2_t vdivq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 / __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vdivq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float32x4_t vdivq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __rev0 / __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vdiv_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float64x1_t vdiv_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vdiv_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __p0 / __p1;
+  return __ret;
+}
+#else
+__ai float32x2_t vdiv_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __rev0 / __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_lane_p8(__p0, __p1) __extension__ ({ \
+  poly8x8_t __s0 = __p0; \
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_lane_p16(__p0, __p1) __extension__ ({ \
+  poly16x4_t __s0 = __p0; \
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_lane_u8(__p0, __p1) __extension__ ({ \
+  uint8x8_t __s0 = __p0; \
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vdups_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_lane_u32(__p0, __p1) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vdups_lane_i32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vdupd_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_lane_u64(__p0, __p1) __extension__ ({ \
+  uint64x1_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vdupd_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_lane_u16(__p0, __p1) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_lane_s8(__p0, __p1) __extension__ ({ \
+  int8x8_t __s0 = __p0; \
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vdupb_lane_i8((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vdupd_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vdupd_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vdups_lane_f32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_lane_f32(__p0, __p1) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vdups_lane_f32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vdups_lane_i32((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_lane_s32(__p0, __p1) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vdups_lane_i32((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vdupd_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_lane_s64(__p0, __p1) __extension__ ({ \
+  int64x1_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vdupd_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_lane_s16(__p0, __p1) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vduph_lane_i16((int8x8_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_lane_f16(__p0, __p1) __extension__ ({ \
+  float16x4_t __s0 = __p0; \
+  float16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8_t __ret; \
+  __ret = (poly8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16_t __ret; \
+  __ret = (poly16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vdups_laneq_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vdups_laneq_i32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vdupd_laneq_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vdupd_laneq_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupb_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupb_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vdupb_laneq_i8((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vdupd_laneq_f64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vdupd_laneq_f64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vdups_laneq_f32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vdups_laneq_f32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdups_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vdups_laneq_i32((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdups_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vdups_laneq_i32((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupd_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vdupd_laneq_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vdupd_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vdupd_laneq_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vduph_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vduph_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vduph_laneq_i16((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x1_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_p8(__p0, __p1) __extension__ ({ \
+  poly8x16_t __s0 = __p0; \
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_p16(__p0, __p1) __extension__ ({ \
+  poly16x8_t __s0 = __p0; \
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdupq_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdupq_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_u8(__p0, __p1) __extension__ ({ \
+  uint8x16_t __s0 = __p0; \
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_u32(__p0, __p1) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_u64(__p0, __p1) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint64x1_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_u16(__p0, __p1) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x8_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_s8(__p0, __p1) __extension__ ({ \
+  int8x16_t __s0 = __p0; \
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x8_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x1_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_f32(__p0, __p1) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_f16(__p0, __p1) __extension__ ({ \
+  float16x8_t __s0 = __p0; \
+  float16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_s32(__p0, __p1) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x1_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_s64(__p0, __p1) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int64x1_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vdup_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __ret; \
+  __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \
+  __ret; \
+})
+#else
+#define vdup_laneq_s16(__p0, __p1) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vdup_n_p64(poly64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai poly64x1_t vdup_n_p64(poly64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vdupq_n_p64(poly64_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai poly64x2_t vdupq_n_p64(poly64_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vdupq_n_f64(float64_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai float64x2_t vdupq_n_f64(float64_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vdup_n_f64(float64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai float64x1_t vdup_n_f64(float64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#else
+#define vext_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 38); \
+  __ret; \
+})
+#else
+#define vextq_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vextq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 42); \
+  __ret; \
+})
+#else
+#define vextq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vext_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#else
+#define vext_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vfmaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vfmaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float64x2_t __noswap_vfmaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vfma_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vfma_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+__ai float64x1_t __noswap_vfma_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmad_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_lane_f64(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmad_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_lane_f64(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmad_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_lane_f64(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmas_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_lane_f32(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmas_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_lane_f32(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmas_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_lane_f32(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 42); \
+  __ret; \
+})
+#else
+#define vfmaq_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__s2, __p3, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 42); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 41); \
+  __ret; \
+})
+#else
+#define vfmaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, __p3, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 41); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 10); \
+  __ret; \
+})
+#else
+#define vfma_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 10); \
+  __ret; \
+})
+#define __noswap_vfma_lane_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 9); \
+  __ret; \
+})
+#else
+#define vfma_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, __p3, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfma_lane_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 9); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmad_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_laneq_f64(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmad_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_laneq_f64(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmad_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vfmad_laneq_f64(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmas_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_laneq_f32(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vfmas_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_laneq_f32(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#define __noswap_vfmas_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32_t __s0 = __p0; \
+  float32_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32_t __ret; \
+  __ret = (float32_t) __builtin_neon_vfmas_laneq_f32(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 42); \
+  __ret; \
+})
+#else
+#define vfmaq_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 42); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 41); \
+  __ret; \
+})
+#else
+#define vfmaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 41); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfmaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = (float32x4_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 41); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 10); \
+  __ret; \
+})
+#else
+#define vfma_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__rev2, __p3, 10); \
+  __ret; \
+})
+#define __noswap_vfma_laneq_f64(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __s2 = __p2; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfma_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 9); \
+  __ret; \
+})
+#else
+#define vfma_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_laneq_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x16_t)__rev2, __p3, 9); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vfma_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = (float32x2_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 9); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vfmaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __ret;
+  __ret = vfmaq_f64(__p0, __p1, (float64x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai float64x2_t vfmaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __noswap_vfmaq_f64(__rev0, __rev1, (float64x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __ret;
+  __ret = vfmaq_f32(__p0, __p1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vfmaq_f32(__rev0, __rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __ret;
+  __ret = vfma_f32(__p0, __p1, (float32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __noswap_vfma_f32(__rev0, __rev1, (float32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vfmsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = vfmaq_f64(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float64x2_t vfmsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = __noswap_vfmaq_f64(__rev0, -__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = vfma_f64(__p0, -__p1, __p2);
+  return __ret;
+}
+#else
+__ai float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __noswap_vfma_f64(__p0, -__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsd_lane_f64(__p0_88, __p1_88, __p2_88, __p3_88) __extension__ ({ \
+  float64_t __s0_88 = __p0_88; \
+  float64_t __s1_88 = __p1_88; \
+  float64x1_t __s2_88 = __p2_88; \
+  float64_t __ret_88; \
+  __ret_88 = vfmad_lane_f64(__s0_88, -__s1_88, __s2_88, __p3_88); \
+  __ret_88; \
+})
+#else
+#define vfmsd_lane_f64(__p0_89, __p1_89, __p2_89, __p3_89) __extension__ ({ \
+  float64_t __s0_89 = __p0_89; \
+  float64_t __s1_89 = __p1_89; \
+  float64x1_t __s2_89 = __p2_89; \
+  float64_t __ret_89; \
+  __ret_89 = __noswap_vfmad_lane_f64(__s0_89, -__s1_89, __s2_89, __p3_89); \
+  __ret_89; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmss_lane_f32(__p0_90, __p1_90, __p2_90, __p3_90) __extension__ ({ \
+  float32_t __s0_90 = __p0_90; \
+  float32_t __s1_90 = __p1_90; \
+  float32x2_t __s2_90 = __p2_90; \
+  float32_t __ret_90; \
+  __ret_90 = vfmas_lane_f32(__s0_90, -__s1_90, __s2_90, __p3_90); \
+  __ret_90; \
+})
+#else
+#define vfmss_lane_f32(__p0_91, __p1_91, __p2_91, __p3_91) __extension__ ({ \
+  float32_t __s0_91 = __p0_91; \
+  float32_t __s1_91 = __p1_91; \
+  float32x2_t __s2_91 = __p2_91; \
+  float32x2_t __rev2_91;  __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 1, 0); \
+  float32_t __ret_91; \
+  __ret_91 = __noswap_vfmas_lane_f32(__s0_91, -__s1_91, __rev2_91, __p3_91); \
+  __ret_91; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_lane_f64(__p0_92, __p1_92, __p2_92, __p3_92) __extension__ ({ \
+  float64x2_t __s0_92 = __p0_92; \
+  float64x2_t __s1_92 = __p1_92; \
+  float64x1_t __s2_92 = __p2_92; \
+  float64x2_t __ret_92; \
+  __ret_92 = vfmaq_lane_f64(__s0_92, -__s1_92, __s2_92, __p3_92); \
+  __ret_92; \
+})
+#else
+#define vfmsq_lane_f64(__p0_93, __p1_93, __p2_93, __p3_93) __extension__ ({ \
+  float64x2_t __s0_93 = __p0_93; \
+  float64x2_t __s1_93 = __p1_93; \
+  float64x1_t __s2_93 = __p2_93; \
+  float64x2_t __rev0_93;  __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 1, 0); \
+  float64x2_t __rev1_93;  __rev1_93 = __builtin_shufflevector(__s1_93, __s1_93, 1, 0); \
+  float64x2_t __ret_93; \
+  __ret_93 = __noswap_vfmaq_lane_f64(__rev0_93, -__rev1_93, __s2_93, __p3_93); \
+  __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 1, 0); \
+  __ret_93; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_lane_f32(__p0_94, __p1_94, __p2_94, __p3_94) __extension__ ({ \
+  float32x4_t __s0_94 = __p0_94; \
+  float32x4_t __s1_94 = __p1_94; \
+  float32x2_t __s2_94 = __p2_94; \
+  float32x4_t __ret_94; \
+  __ret_94 = vfmaq_lane_f32(__s0_94, -__s1_94, __s2_94, __p3_94); \
+  __ret_94; \
+})
+#else
+#define vfmsq_lane_f32(__p0_95, __p1_95, __p2_95, __p3_95) __extension__ ({ \
+  float32x4_t __s0_95 = __p0_95; \
+  float32x4_t __s1_95 = __p1_95; \
+  float32x2_t __s2_95 = __p2_95; \
+  float32x4_t __rev0_95;  __rev0_95 = __builtin_shufflevector(__s0_95, __s0_95, 3, 2, 1, 0); \
+  float32x4_t __rev1_95;  __rev1_95 = __builtin_shufflevector(__s1_95, __s1_95, 3, 2, 1, 0); \
+  float32x2_t __rev2_95;  __rev2_95 = __builtin_shufflevector(__s2_95, __s2_95, 1, 0); \
+  float32x4_t __ret_95; \
+  __ret_95 = __noswap_vfmaq_lane_f32(__rev0_95, -__rev1_95, __rev2_95, __p3_95); \
+  __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 3, 2, 1, 0); \
+  __ret_95; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_lane_f64(__p0_96, __p1_96, __p2_96, __p3_96) __extension__ ({ \
+  float64x1_t __s0_96 = __p0_96; \
+  float64x1_t __s1_96 = __p1_96; \
+  float64x1_t __s2_96 = __p2_96; \
+  float64x1_t __ret_96; \
+  __ret_96 = vfma_lane_f64(__s0_96, -__s1_96, __s2_96, __p3_96); \
+  __ret_96; \
+})
+#else
+#define vfms_lane_f64(__p0_97, __p1_97, __p2_97, __p3_97) __extension__ ({ \
+  float64x1_t __s0_97 = __p0_97; \
+  float64x1_t __s1_97 = __p1_97; \
+  float64x1_t __s2_97 = __p2_97; \
+  float64x1_t __ret_97; \
+  __ret_97 = __noswap_vfma_lane_f64(__s0_97, -__s1_97, __s2_97, __p3_97); \
+  __ret_97; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_lane_f32(__p0_98, __p1_98, __p2_98, __p3_98) __extension__ ({ \
+  float32x2_t __s0_98 = __p0_98; \
+  float32x2_t __s1_98 = __p1_98; \
+  float32x2_t __s2_98 = __p2_98; \
+  float32x2_t __ret_98; \
+  __ret_98 = vfma_lane_f32(__s0_98, -__s1_98, __s2_98, __p3_98); \
+  __ret_98; \
+})
+#else
+#define vfms_lane_f32(__p0_99, __p1_99, __p2_99, __p3_99) __extension__ ({ \
+  float32x2_t __s0_99 = __p0_99; \
+  float32x2_t __s1_99 = __p1_99; \
+  float32x2_t __s2_99 = __p2_99; \
+  float32x2_t __rev0_99;  __rev0_99 = __builtin_shufflevector(__s0_99, __s0_99, 1, 0); \
+  float32x2_t __rev1_99;  __rev1_99 = __builtin_shufflevector(__s1_99, __s1_99, 1, 0); \
+  float32x2_t __rev2_99;  __rev2_99 = __builtin_shufflevector(__s2_99, __s2_99, 1, 0); \
+  float32x2_t __ret_99; \
+  __ret_99 = __noswap_vfma_lane_f32(__rev0_99, -__rev1_99, __rev2_99, __p3_99); \
+  __ret_99 = __builtin_shufflevector(__ret_99, __ret_99, 1, 0); \
+  __ret_99; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsd_laneq_f64(__p0_100, __p1_100, __p2_100, __p3_100) __extension__ ({ \
+  float64_t __s0_100 = __p0_100; \
+  float64_t __s1_100 = __p1_100; \
+  float64x2_t __s2_100 = __p2_100; \
+  float64_t __ret_100; \
+  __ret_100 = vfmad_laneq_f64(__s0_100, -__s1_100, __s2_100, __p3_100); \
+  __ret_100; \
+})
+#else
+#define vfmsd_laneq_f64(__p0_101, __p1_101, __p2_101, __p3_101) __extension__ ({ \
+  float64_t __s0_101 = __p0_101; \
+  float64_t __s1_101 = __p1_101; \
+  float64x2_t __s2_101 = __p2_101; \
+  float64x2_t __rev2_101;  __rev2_101 = __builtin_shufflevector(__s2_101, __s2_101, 1, 0); \
+  float64_t __ret_101; \
+  __ret_101 = __noswap_vfmad_laneq_f64(__s0_101, -__s1_101, __rev2_101, __p3_101); \
+  __ret_101; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmss_laneq_f32(__p0_102, __p1_102, __p2_102, __p3_102) __extension__ ({ \
+  float32_t __s0_102 = __p0_102; \
+  float32_t __s1_102 = __p1_102; \
+  float32x4_t __s2_102 = __p2_102; \
+  float32_t __ret_102; \
+  __ret_102 = vfmas_laneq_f32(__s0_102, -__s1_102, __s2_102, __p3_102); \
+  __ret_102; \
+})
+#else
+#define vfmss_laneq_f32(__p0_103, __p1_103, __p2_103, __p3_103) __extension__ ({ \
+  float32_t __s0_103 = __p0_103; \
+  float32_t __s1_103 = __p1_103; \
+  float32x4_t __s2_103 = __p2_103; \
+  float32x4_t __rev2_103;  __rev2_103 = __builtin_shufflevector(__s2_103, __s2_103, 3, 2, 1, 0); \
+  float32_t __ret_103; \
+  __ret_103 = __noswap_vfmas_laneq_f32(__s0_103, -__s1_103, __rev2_103, __p3_103); \
+  __ret_103; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_laneq_f64(__p0_104, __p1_104, __p2_104, __p3_104) __extension__ ({ \
+  float64x2_t __s0_104 = __p0_104; \
+  float64x2_t __s1_104 = __p1_104; \
+  float64x2_t __s2_104 = __p2_104; \
+  float64x2_t __ret_104; \
+  __ret_104 = vfmaq_laneq_f64(__s0_104, -__s1_104, __s2_104, __p3_104); \
+  __ret_104; \
+})
+#else
+#define vfmsq_laneq_f64(__p0_105, __p1_105, __p2_105, __p3_105) __extension__ ({ \
+  float64x2_t __s0_105 = __p0_105; \
+  float64x2_t __s1_105 = __p1_105; \
+  float64x2_t __s2_105 = __p2_105; \
+  float64x2_t __rev0_105;  __rev0_105 = __builtin_shufflevector(__s0_105, __s0_105, 1, 0); \
+  float64x2_t __rev1_105;  __rev1_105 = __builtin_shufflevector(__s1_105, __s1_105, 1, 0); \
+  float64x2_t __rev2_105;  __rev2_105 = __builtin_shufflevector(__s2_105, __s2_105, 1, 0); \
+  float64x2_t __ret_105; \
+  __ret_105 = __noswap_vfmaq_laneq_f64(__rev0_105, -__rev1_105, __rev2_105, __p3_105); \
+  __ret_105 = __builtin_shufflevector(__ret_105, __ret_105, 1, 0); \
+  __ret_105; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfmsq_laneq_f32(__p0_106, __p1_106, __p2_106, __p3_106) __extension__ ({ \
+  float32x4_t __s0_106 = __p0_106; \
+  float32x4_t __s1_106 = __p1_106; \
+  float32x4_t __s2_106 = __p2_106; \
+  float32x4_t __ret_106; \
+  __ret_106 = vfmaq_laneq_f32(__s0_106, -__s1_106, __s2_106, __p3_106); \
+  __ret_106; \
+})
+#else
+#define vfmsq_laneq_f32(__p0_107, __p1_107, __p2_107, __p3_107) __extension__ ({ \
+  float32x4_t __s0_107 = __p0_107; \
+  float32x4_t __s1_107 = __p1_107; \
+  float32x4_t __s2_107 = __p2_107; \
+  float32x4_t __rev0_107;  __rev0_107 = __builtin_shufflevector(__s0_107, __s0_107, 3, 2, 1, 0); \
+  float32x4_t __rev1_107;  __rev1_107 = __builtin_shufflevector(__s1_107, __s1_107, 3, 2, 1, 0); \
+  float32x4_t __rev2_107;  __rev2_107 = __builtin_shufflevector(__s2_107, __s2_107, 3, 2, 1, 0); \
+  float32x4_t __ret_107; \
+  __ret_107 = __noswap_vfmaq_laneq_f32(__rev0_107, -__rev1_107, __rev2_107, __p3_107); \
+  __ret_107 = __builtin_shufflevector(__ret_107, __ret_107, 3, 2, 1, 0); \
+  __ret_107; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_laneq_f64(__p0_108, __p1_108, __p2_108, __p3_108) __extension__ ({ \
+  float64x1_t __s0_108 = __p0_108; \
+  float64x1_t __s1_108 = __p1_108; \
+  float64x2_t __s2_108 = __p2_108; \
+  float64x1_t __ret_108; \
+  __ret_108 = vfma_laneq_f64(__s0_108, -__s1_108, __s2_108, __p3_108); \
+  __ret_108; \
+})
+#else
+#define vfms_laneq_f64(__p0_109, __p1_109, __p2_109, __p3_109) __extension__ ({ \
+  float64x1_t __s0_109 = __p0_109; \
+  float64x1_t __s1_109 = __p1_109; \
+  float64x2_t __s2_109 = __p2_109; \
+  float64x2_t __rev2_109;  __rev2_109 = __builtin_shufflevector(__s2_109, __s2_109, 1, 0); \
+  float64x1_t __ret_109; \
+  __ret_109 = __noswap_vfma_laneq_f64(__s0_109, -__s1_109, __rev2_109, __p3_109); \
+  __ret_109; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vfms_laneq_f32(__p0_110, __p1_110, __p2_110, __p3_110) __extension__ ({ \
+  float32x2_t __s0_110 = __p0_110; \
+  float32x2_t __s1_110 = __p1_110; \
+  float32x4_t __s2_110 = __p2_110; \
+  float32x2_t __ret_110; \
+  __ret_110 = vfma_laneq_f32(__s0_110, -__s1_110, __s2_110, __p3_110); \
+  __ret_110; \
+})
+#else
+#define vfms_laneq_f32(__p0_111, __p1_111, __p2_111, __p3_111) __extension__ ({ \
+  float32x2_t __s0_111 = __p0_111; \
+  float32x2_t __s1_111 = __p1_111; \
+  float32x4_t __s2_111 = __p2_111; \
+  float32x2_t __rev0_111;  __rev0_111 = __builtin_shufflevector(__s0_111, __s0_111, 1, 0); \
+  float32x2_t __rev1_111;  __rev1_111 = __builtin_shufflevector(__s1_111, __s1_111, 1, 0); \
+  float32x4_t __rev2_111;  __rev2_111 = __builtin_shufflevector(__s2_111, __s2_111, 3, 2, 1, 0); \
+  float32x2_t __ret_111; \
+  __ret_111 = __noswap_vfma_laneq_f32(__rev0_111, -__rev1_111, __rev2_111, __p3_111); \
+  __ret_111 = __builtin_shufflevector(__ret_111, __ret_111, 1, 0); \
+  __ret_111; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __ret;
+  __ret = vfmaq_f64(__p0, -__p1, (float64x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __noswap_vfmaq_f64(__rev0, -__rev1, (float64x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vfmsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __ret;
+  __ret = vfmaq_f32(__p0, -__p1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x4_t vfmsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __noswap_vfmaq_f32(__rev0, -__rev1, (float32x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vfms_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __ret;
+  __ret = vfma_f32(__p0, -__p1, (float32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai float32x2_t vfms_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __noswap_vfma_f32(__rev0, -__rev1, (float32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vget_high_p64(poly64x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#else
+__ai poly64x1_t vget_high_p64(poly64x2_t __p0) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
+  return __ret;
+}
+__ai poly64x1_t __noswap_vget_high_p64(poly64x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vget_high_f64(float64x2_t __p0) {
+  float64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 1);
+  return __ret;
+}
+#else
+__ai float64x1_t vget_high_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vget_lane_i64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64_t __ret; \
+  __ret = (poly64_t) __builtin_neon_vgetq_lane_i64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vgetq_lane_f64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vgetq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vgetq_lane_f64((int8x16_t)__rev0, __p1); \
+  __ret; \
+})
+#define __noswap_vgetq_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vgetq_lane_f64((int8x16_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vget_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#else
+#define vget_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vget_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#define __noswap_vget_lane_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64_t __ret; \
+  __ret = (float64_t) __builtin_neon_vget_lane_f64((int8x8_t)__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vget_low_p64(poly64x2_t __p0) {
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0);
+  return __ret;
+}
+#else
+__ai poly64x1_t vget_low_p64(poly64x2_t __p0) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vget_low_f64(float64x2_t __p0) {
+  float64x1_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p0, 0);
+  return __ret;
+}
+#else
+__ai float64x1_t vget_low_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x1_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev0, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p64(__p0) __extension__ ({ \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_v(__p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_p64(__p0) __extension__ ({ \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_v(__p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p64(__p0) __extension__ ({ \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_v(__p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_p64(__p0) __extension__ ({ \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_v(__p0, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f64(__p0) __extension__ ({ \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_v(__p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_f64(__p0) __extension__ ({ \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_v(__p0, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f64(__p0) __extension__ ({ \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_v(__p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_f64(__p0) __extension__ ({ \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_v(__p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_p64(__p0) __extension__ ({ \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_dup_v(__p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_dup_p64(__p0) __extension__ ({ \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_dup_v(__p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_p64(__p0) __extension__ ({ \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_dup_v(__p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_dup_p64(__p0) __extension__ ({ \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_dup_v(__p0, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_dup_f64(__p0) __extension__ ({ \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_dup_v(__p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_dup_f64(__p0) __extension__ ({ \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_dup_v(__p0, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_dup_f64(__p0) __extension__ ({ \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_dup_v(__p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_dup_f64(__p0) __extension__ ({ \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_dup_v(__p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#else
+#define vld1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 38); \
+  __ret; \
+})
+#else
+#define vld1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__s1, __p2, 42); \
+  __ret; \
+})
+#else
+#define vld1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vld1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 42); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#else
+#define vld1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vld1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8_x2(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8_x2(__p0) __extension__ ({ \
+  poly8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p64_x2(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_p64_x2(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16_x2(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16_x2(__p0) __extension__ ({ \
+  poly16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8_x2(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8_x2(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p64_x2(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_p64_x2(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16_x2(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16_x2(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8_x2(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8_x2(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32_x2(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32_x2(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64_x2(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64_x2(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16_x2(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16_x2(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8_x2(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8_x2(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f64_x2(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_f64_x2(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32_x2(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32_x2(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16_x2(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16_x2(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32_x2(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32_x2(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64_x2(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64_x2(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16_x2(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16_x2(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8_x2(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8_x2(__p0) __extension__ ({ \
+  uint8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32_x2(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32_x2(__p0) __extension__ ({ \
+  uint32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64_x2(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64_x2(__p0) __extension__ ({ \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16_x2(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16_x2(__p0) __extension__ ({ \
+  uint16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8_x2(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8_x2(__p0) __extension__ ({ \
+  int8x8x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f64_x2(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_f64_x2(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32_x2(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32_x2(__p0) __extension__ ({ \
+  float32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16_x2(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16_x2(__p0) __extension__ ({ \
+  float16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32_x2(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32_x2(__p0) __extension__ ({ \
+  int32x2x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64_x2(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64_x2(__p0) __extension__ ({ \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16_x2(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16_x2(__p0) __extension__ ({ \
+  int16x4x2_t __ret; \
+  __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8_x3(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8_x3(__p0) __extension__ ({ \
+  poly8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p64_x3(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_p64_x3(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16_x3(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16_x3(__p0) __extension__ ({ \
+  poly16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8_x3(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8_x3(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p64_x3(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_p64_x3(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16_x3(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16_x3(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8_x3(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8_x3(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32_x3(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32_x3(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64_x3(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64_x3(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16_x3(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16_x3(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8_x3(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8_x3(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f64_x3(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_f64_x3(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32_x3(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32_x3(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16_x3(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16_x3(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32_x3(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32_x3(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64_x3(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64_x3(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16_x3(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16_x3(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8_x3(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8_x3(__p0) __extension__ ({ \
+  uint8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32_x3(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32_x3(__p0) __extension__ ({ \
+  uint32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64_x3(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64_x3(__p0) __extension__ ({ \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16_x3(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16_x3(__p0) __extension__ ({ \
+  uint16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8_x3(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8_x3(__p0) __extension__ ({ \
+  int8x8x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f64_x3(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_f64_x3(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32_x3(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32_x3(__p0) __extension__ ({ \
+  float32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16_x3(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16_x3(__p0) __extension__ ({ \
+  float16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32_x3(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32_x3(__p0) __extension__ ({ \
+  int32x2x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64_x3(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64_x3(__p0) __extension__ ({ \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16_x3(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16_x3(__p0) __extension__ ({ \
+  int16x4x3_t __ret; \
+  __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p8_x4(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
+  __ret; \
+})
+#else
+#define vld1_p8_x4(__p0) __extension__ ({ \
+  poly8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p64_x4(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld1_p64_x4(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_p16_x4(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
+  __ret; \
+})
+#else
+#define vld1_p16_x4(__p0) __extension__ ({ \
+  poly16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p8_x4(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld1q_p8_x4(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p64_x4(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld1q_p64_x4(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_p16_x4(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld1q_p16_x4(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u8_x4(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld1q_u8_x4(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u32_x4(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld1q_u32_x4(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u64_x4(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld1q_u64_x4(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_u16_x4(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld1q_u16_x4(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s8_x4(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld1q_s8_x4(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f64_x4(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld1q_f64_x4(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f32_x4(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld1q_f32_x4(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_f16_x4(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld1q_f16_x4(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s32_x4(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld1q_s32_x4(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s64_x4(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld1q_s64_x4(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1q_s16_x4(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld1q_s16_x4(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u8_x4(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
+  __ret; \
+})
+#else
+#define vld1_u8_x4(__p0) __extension__ ({ \
+  uint8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u32_x4(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
+  __ret; \
+})
+#else
+#define vld1_u32_x4(__p0) __extension__ ({ \
+  uint32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u64_x4(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#else
+#define vld1_u64_x4(__p0) __extension__ ({ \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_u16_x4(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
+  __ret; \
+})
+#else
+#define vld1_u16_x4(__p0) __extension__ ({ \
+  uint16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s8_x4(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
+  __ret; \
+})
+#else
+#define vld1_s8_x4(__p0) __extension__ ({ \
+  int8x8x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f64_x4(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld1_f64_x4(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f32_x4(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
+  __ret; \
+})
+#else
+#define vld1_f32_x4(__p0) __extension__ ({ \
+  float32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_f16_x4(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
+  __ret; \
+})
+#else
+#define vld1_f16_x4(__p0) __extension__ ({ \
+  float16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s32_x4(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
+  __ret; \
+})
+#else
+#define vld1_s32_x4(__p0) __extension__ ({ \
+  int32x2x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s64_x4(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#else
+#define vld1_s64_x4(__p0) __extension__ ({ \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld1_s16_x4(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
+  __ret; \
+})
+#else
+#define vld1_s16_x4(__p0) __extension__ ({ \
+  int16x4x4_t __ret; \
+  __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_p64(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld2_p64(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_p64(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld2q_p64(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld2q_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_f64(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld2q_f64(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld2q_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_f64(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld2_f64(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_p64(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld2_dup_p64(__p0) __extension__ ({ \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld2q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld2q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld2q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld2q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s8(__p0) __extension__ ({ \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_f64(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld2q_dup_f64(__p0) __extension__ ({ \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld2q_dup_f32(__p0) __extension__ ({ \
+  float32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld2q_dup_f16(__p0) __extension__ ({ \
+  float16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s32(__p0) __extension__ ({ \
+  int32x4x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s64(__p0) __extension__ ({ \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_dup_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld2q_dup_s16(__p0) __extension__ ({ \
+  int16x8x2_t __ret; \
+  __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_dup_f64(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld2_dup_f64(__p0) __extension__ ({ \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
+  __ret; \
+})
+#else
+#define vld2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  poly64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 36); \
+  __ret; \
+})
+#else
+#define vld2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 38); \
+  __ret; \
+})
+#else
+#define vld2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  poly64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 48); \
+  __ret; \
+})
+#else
+#define vld2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 51); \
+  __ret; \
+})
+#else
+#define vld2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  uint64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 32); \
+  __ret; \
+})
+#else
+#define vld2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 42); \
+  __ret; \
+})
+#else
+#define vld2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  float64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 35); \
+  __ret; \
+})
+#else
+#define vld2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  int64x2x2_t __ret; \
+  __builtin_neon_vld2q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __p2, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
+  __ret; \
+})
+#else
+#define vld2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  uint64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 10); \
+  __ret; \
+})
+#else
+#define vld2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  float64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 3); \
+  __ret; \
+})
+#else
+#define vld2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  int64x1x2_t __ret; \
+  __builtin_neon_vld2_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_p64(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld3_p64(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_p64(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld3q_p64(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld3q_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_f64(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld3q_f64(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld3q_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_f64(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld3_f64(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_p64(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld3_dup_p64(__p0) __extension__ ({ \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld3q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld3q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld3q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld3q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s8(__p0) __extension__ ({ \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_f64(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld3q_dup_f64(__p0) __extension__ ({ \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld3q_dup_f32(__p0) __extension__ ({ \
+  float32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld3q_dup_f16(__p0) __extension__ ({ \
+  float16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s32(__p0) __extension__ ({ \
+  int32x4x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s64(__p0) __extension__ ({ \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_dup_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld3q_dup_s16(__p0) __extension__ ({ \
+  int16x8x3_t __ret; \
+  __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_dup_f64(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld3_dup_f64(__p0) __extension__ ({ \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
+  __ret; \
+})
+#else
+#define vld3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  poly64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 36); \
+  __ret; \
+})
+#else
+#define vld3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 38); \
+  __ret; \
+})
+#else
+#define vld3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  poly64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 48); \
+  __ret; \
+})
+#else
+#define vld3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 51); \
+  __ret; \
+})
+#else
+#define vld3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  uint64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 32); \
+  __ret; \
+})
+#else
+#define vld3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 42); \
+  __ret; \
+})
+#else
+#define vld3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  float64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 35); \
+  __ret; \
+})
+#else
+#define vld3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  int64x2x3_t __ret; \
+  __builtin_neon_vld3q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
+  __ret; \
+})
+#else
+#define vld3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  uint64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 10); \
+  __ret; \
+})
+#else
+#define vld3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  float64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 3); \
+  __ret; \
+})
+#else
+#define vld3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  int64x1x3_t __ret; \
+  __builtin_neon_vld3_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_p64(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld4_p64(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_p64(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld4q_p64(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld4q_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_f64(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld4q_f64(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld4q_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_f64(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld4_f64(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_p64(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#else
+#define vld4_dup_p64(__p0) __extension__ ({ \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
+  __ret; \
+})
+#else
+#define vld4q_dup_p8(__p0) __extension__ ({ \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 38); \
+  __ret; \
+})
+#else
+#define vld4q_dup_p64(__p0) __extension__ ({ \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
+  __ret; \
+})
+#else
+#define vld4q_dup_p16(__p0) __extension__ ({ \
+  poly16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u8(__p0) __extension__ ({ \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u32(__p0) __extension__ ({ \
+  uint32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u64(__p0) __extension__ ({ \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
+  __ret; \
+})
+#else
+#define vld4q_dup_u16(__p0) __extension__ ({ \
+  uint16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s8(__p0) __extension__ ({ \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_f64(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 42); \
+  __ret; \
+})
+#else
+#define vld4q_dup_f64(__p0) __extension__ ({ \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
+  __ret; \
+})
+#else
+#define vld4q_dup_f32(__p0) __extension__ ({ \
+  float32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
+  __ret; \
+})
+#else
+#define vld4q_dup_f16(__p0) __extension__ ({ \
+  float16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s32(__p0) __extension__ ({ \
+  int32x4x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s64(__p0) __extension__ ({ \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_dup_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
+  __ret; \
+})
+#else
+#define vld4q_dup_s16(__p0) __extension__ ({ \
+  int16x8x4_t __ret; \
+  __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_dup_f64(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#else
+#define vld4_dup_f64(__p0) __extension__ ({ \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_dup_v(&__ret, __p0, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
+  __ret; \
+})
+#else
+#define vld4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  poly64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 36); \
+  __ret; \
+})
+#else
+#define vld4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  poly8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 36); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 38); \
+  __ret; \
+})
+#else
+#define vld4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  poly64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 38); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 48); \
+  __ret; \
+})
+#else
+#define vld4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 48); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 51); \
+  __ret; \
+})
+#else
+#define vld4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  uint64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 51); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 32); \
+  __ret; \
+})
+#else
+#define vld4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 32); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 42); \
+  __ret; \
+})
+#else
+#define vld4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  float64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 42); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 35); \
+  __ret; \
+})
+#else
+#define vld4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  int64x2x4_t __ret; \
+  __builtin_neon_vld4q_lane_v(&__ret, __p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 35); \
+ \
+  __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \
+  __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \
+  __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \
+  __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
+  __ret; \
+})
+#else
+#define vld4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  uint64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 10); \
+  __ret; \
+})
+#else
+#define vld4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  float64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vld4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 3); \
+  __ret; \
+})
+#else
+#define vld4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  int64x1x4_t __ret; \
+  __builtin_neon_vld4_lane_v(&__ret, __p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vldrq_p128(__p0) __extension__ ({ \
+  poly128_t __ret; \
+  __ret = (poly128_t) __builtin_neon_vldrq_p128(__p0); \
+  __ret; \
+})
+#else
+#define vldrq_p128(__p0) __extension__ ({ \
+  poly128_t __ret; \
+  __ret = (poly128_t) __builtin_neon_vldrq_p128(__p0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmax_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vmax_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vmaxnmvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmaxnmvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vmaxnmvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmaxnmvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmaxnmvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxnmvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vmaxnmvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxnmvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmaxnmv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxnmv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vmaxnmv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxnmv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vmaxvq_u8(uint8x16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vmaxvq_u8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vmaxvq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vmaxvq_u8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vmaxvq_u32(uint32x4_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vmaxvq_u32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vmaxvq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vmaxvq_u32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vmaxvq_u16(uint16x8_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vmaxvq_u16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vmaxvq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vmaxvq_u16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vmaxvq_s8(int8x16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vmaxvq_s8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vmaxvq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vmaxvq_s8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vmaxvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmaxvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vmaxvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmaxvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmaxvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vmaxvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vmaxvq_s32(int32x4_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vmaxvq_s32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vmaxvq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vmaxvq_s32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vmaxvq_s16(int16x8_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vmaxvq_s16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vmaxvq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vmaxvq_s16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vmaxv_u8(uint8x8_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vmaxv_u8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vmaxv_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vmaxv_u8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vmaxv_u32(uint32x2_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vmaxv_u32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vmaxv_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vmaxv_u32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vmaxv_u16(uint16x4_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vmaxv_u16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vmaxv_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vmaxv_u16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vmaxv_s8(int8x8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vmaxv_s8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vmaxv_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vmaxv_s8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmaxv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vmaxv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmaxv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vmaxv_s32(int32x2_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vmaxv_s32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vmaxv_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vmaxv_s32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vmaxv_s16(int16x4_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vmaxv_s16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vmaxv_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vmaxv_s16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vminq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vminq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmin_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vmin_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vminnmvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vminnmvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vminnmvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vminnmvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vminnmvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminnmvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vminnmvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminnmvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vminnmv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminnmv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vminnmv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminnmv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vminvq_u8(uint8x16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vminvq_u8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vminvq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vminvq_u8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vminvq_u32(uint32x4_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vminvq_u32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vminvq_u32(uint32x4_t __p0) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vminvq_u32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vminvq_u16(uint16x8_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vminvq_u16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vminvq_u16(uint16x8_t __p0) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vminvq_u16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vminvq_s8(int8x16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vminvq_s8((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vminvq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vminvq_s8((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vminvq_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vminvq_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vminvq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vminvq_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vminvq_f32(float32x4_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminvq_f32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vminvq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminvq_f32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vminvq_s32(int32x4_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vminvq_s32((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vminvq_s32(int32x4_t __p0) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vminvq_s32((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vminvq_s16(int16x8_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vminvq_s16((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vminvq_s16(int16x8_t __p0) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vminvq_s16((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vminv_u8(uint8x8_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vminv_u8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vminv_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vminv_u8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vminv_u32(uint32x2_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vminv_u32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vminv_u32(uint32x2_t __p0) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vminv_u32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vminv_u16(uint16x4_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vminv_u16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vminv_u16(uint16x4_t __p0) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vminv_u16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vminv_s8(int8x8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vminv_s8((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int8_t vminv_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vminv_s8((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vminv_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminv_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vminv_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vminv_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vminv_s32(int32x2_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vminv_s32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int32_t vminv_s32(int32x2_t __p0) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vminv_s32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vminv_s16(int16x4_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vminv_s16((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai int16_t vminv_s16(int16x4_t __p0) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vminv_s16((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float64x2_t vmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 + __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float64x1_t vmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __p0 + __p1 * __p2;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x8_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlaq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlaq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmla_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = __s0 + __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmla_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 + __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __ret;
+  __ret = __p0 + __p1 * (float64x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 + __rev1 * (float64x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 + vmull_u32(vget_high_u32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + vmull_u16(vget_high_u16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 + vmull_s32(vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + vmull_s16(vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 + vmull_u32(vget_high_u32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u32(__noswap_vget_high_u32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + vmull_u16(vget_high_u16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u16(__noswap_vget_high_u16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 + vmull_s32(vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s32(__noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + vmull_s16(vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s16(__noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 + vmull_u32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + vmull_u16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 + vmull_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + vmull_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmlsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float64x2_t vmlsq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 - __rev1 * __rev2;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmls_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#else
+__ai float64x1_t vmls_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) {
+  float64x1_t __ret;
+  __ret = __p0 - __p1 * __p2;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x8_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmlsq_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_f32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __s2 = __p2; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmls_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __ret; \
+  __ret = __s0 - __s1 * __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3); \
+  __ret; \
+})
+#else
+#define vmls_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 - __rev1 * __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __ret;
+  __ret = __p0 - __p1 * (float64x2_t) {__p2, __p2};
+  return __ret;
+}
+#else
+__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 - __rev1 * (float64x2_t) {__p2, __p2};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 - vmull_u32(vget_high_u32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - vmull_u16(vget_high_u16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 - vmull_s32(vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - vmull_s16(vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 - vmull_u32(vget_high_u32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u32(__noswap_vget_high_u32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - vmull_u16(vget_high_u16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u16(__noswap_vget_high_u16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 - vmull_s32(vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s32(__noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - vmull_s16(vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s16(__noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 - vmull_u32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - vmull_u16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_laneq_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 - vmull_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - vmull_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x1_t vmov_n_p64(poly64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai poly64x1_t vmov_n_p64(poly64_t __p0) {
+  poly64x1_t __ret;
+  __ret = (poly64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vmovq_n_p64(poly64_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai poly64x2_t vmovq_n_p64(poly64_t __p0) {
+  poly64x2_t __ret;
+  __ret = (poly64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmovq_n_f64(float64_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) {__p0, __p0};
+  return __ret;
+}
+#else
+__ai float64x2_t vmovq_n_f64(float64_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) {__p0, __p0};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmov_n_f64(float64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) {__p0};
+  return __ret;
+}
+#else
+__ai float64x1_t vmov_n_f64(float64_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) {__p0};
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_112) {
+  uint16x8_t __ret_112;
+  uint8x8_t __a1_112 = vget_high_u8(__p0_112);
+  __ret_112 = (uint16x8_t)(vshll_n_u8(__a1_112, 0));
+  return __ret_112;
+}
+#else
+__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_113) {
+  uint8x16_t __rev0_113;  __rev0_113 = __builtin_shufflevector(__p0_113, __p0_113, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret_113;
+  uint8x8_t __a1_113 = __noswap_vget_high_u8(__rev0_113);
+  __ret_113 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_113, 0));
+  __ret_113 = __builtin_shufflevector(__ret_113, __ret_113, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret_113;
+}
+__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_114) {
+  uint16x8_t __ret_114;
+  uint8x8_t __a1_114 = __noswap_vget_high_u8(__p0_114);
+  __ret_114 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_114, 0));
+  return __ret_114;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_115) {
+  uint64x2_t __ret_115;
+  uint32x2_t __a1_115 = vget_high_u32(__p0_115);
+  __ret_115 = (uint64x2_t)(vshll_n_u32(__a1_115, 0));
+  return __ret_115;
+}
+#else
+__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_116) {
+  uint32x4_t __rev0_116;  __rev0_116 = __builtin_shufflevector(__p0_116, __p0_116, 3, 2, 1, 0);
+  uint64x2_t __ret_116;
+  uint32x2_t __a1_116 = __noswap_vget_high_u32(__rev0_116);
+  __ret_116 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_116, 0));
+  __ret_116 = __builtin_shufflevector(__ret_116, __ret_116, 1, 0);
+  return __ret_116;
+}
+__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_117) {
+  uint64x2_t __ret_117;
+  uint32x2_t __a1_117 = __noswap_vget_high_u32(__p0_117);
+  __ret_117 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_117, 0));
+  return __ret_117;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_118) {
+  uint32x4_t __ret_118;
+  uint16x4_t __a1_118 = vget_high_u16(__p0_118);
+  __ret_118 = (uint32x4_t)(vshll_n_u16(__a1_118, 0));
+  return __ret_118;
+}
+#else
+__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_119) {
+  uint16x8_t __rev0_119;  __rev0_119 = __builtin_shufflevector(__p0_119, __p0_119, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret_119;
+  uint16x4_t __a1_119 = __noswap_vget_high_u16(__rev0_119);
+  __ret_119 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_119, 0));
+  __ret_119 = __builtin_shufflevector(__ret_119, __ret_119, 3, 2, 1, 0);
+  return __ret_119;
+}
+__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_120) {
+  uint32x4_t __ret_120;
+  uint16x4_t __a1_120 = __noswap_vget_high_u16(__p0_120);
+  __ret_120 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_120, 0));
+  return __ret_120;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmovl_high_s8(int8x16_t __p0_121) {
+  int16x8_t __ret_121;
+  int8x8_t __a1_121 = vget_high_s8(__p0_121);
+  __ret_121 = (int16x8_t)(vshll_n_s8(__a1_121, 0));
+  return __ret_121;
+}
+#else
+__ai int16x8_t vmovl_high_s8(int8x16_t __p0_122) {
+  int8x16_t __rev0_122;  __rev0_122 = __builtin_shufflevector(__p0_122, __p0_122, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret_122;
+  int8x8_t __a1_122 = __noswap_vget_high_s8(__rev0_122);
+  __ret_122 = (int16x8_t)(__noswap_vshll_n_s8(__a1_122, 0));
+  __ret_122 = __builtin_shufflevector(__ret_122, __ret_122, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret_122;
+}
+__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_123) {
+  int16x8_t __ret_123;
+  int8x8_t __a1_123 = __noswap_vget_high_s8(__p0_123);
+  __ret_123 = (int16x8_t)(__noswap_vshll_n_s8(__a1_123, 0));
+  return __ret_123;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmovl_high_s32(int32x4_t __p0_124) {
+  int64x2_t __ret_124;
+  int32x2_t __a1_124 = vget_high_s32(__p0_124);
+  __ret_124 = (int64x2_t)(vshll_n_s32(__a1_124, 0));
+  return __ret_124;
+}
+#else
+__ai int64x2_t vmovl_high_s32(int32x4_t __p0_125) {
+  int32x4_t __rev0_125;  __rev0_125 = __builtin_shufflevector(__p0_125, __p0_125, 3, 2, 1, 0);
+  int64x2_t __ret_125;
+  int32x2_t __a1_125 = __noswap_vget_high_s32(__rev0_125);
+  __ret_125 = (int64x2_t)(__noswap_vshll_n_s32(__a1_125, 0));
+  __ret_125 = __builtin_shufflevector(__ret_125, __ret_125, 1, 0);
+  return __ret_125;
+}
+__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_126) {
+  int64x2_t __ret_126;
+  int32x2_t __a1_126 = __noswap_vget_high_s32(__p0_126);
+  __ret_126 = (int64x2_t)(__noswap_vshll_n_s32(__a1_126, 0));
+  return __ret_126;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmovl_high_s16(int16x8_t __p0_127) {
+  int32x4_t __ret_127;
+  int16x4_t __a1_127 = vget_high_s16(__p0_127);
+  __ret_127 = (int32x4_t)(vshll_n_s16(__a1_127, 0));
+  return __ret_127;
+}
+#else
+__ai int32x4_t vmovl_high_s16(int16x8_t __p0_128) {
+  int16x8_t __rev0_128;  __rev0_128 = __builtin_shufflevector(__p0_128, __p0_128, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret_128;
+  int16x4_t __a1_128 = __noswap_vget_high_s16(__rev0_128);
+  __ret_128 = (int32x4_t)(__noswap_vshll_n_s16(__a1_128, 0));
+  __ret_128 = __builtin_shufflevector(__ret_128, __ret_128, 3, 2, 1, 0);
+  return __ret_128;
+}
+__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_129) {
+  int32x4_t __ret_129;
+  int16x4_t __a1_129 = __noswap_vget_high_s16(__p0_129);
+  __ret_129 = (int32x4_t)(__noswap_vshll_n_s16(__a1_129, 0));
+  return __ret_129;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vmovn_u32(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vmovn_u32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vmovn_u64(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vmovn_u64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vmovn_u16(__p1));
+  return __ret;
+}
+#else
+__ai uint8x16_t vmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vmovn_u16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vmovn_s32(__p1));
+  return __ret;
+}
+#else
+__ai int16x8_t vmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vmovn_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vmovn_s64(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vmovn_s64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vmovn_s16(__p1));
+  return __ret;
+}
+#else
+__ai int8x16_t vmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vmovn_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmulq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float64x2_t vmulq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 * __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#else
+__ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 * __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmuld_lane_f64(__p0_130, __p1_130, __p2_130) __extension__ ({ \
+  float64_t __s0_130 = __p0_130; \
+  float64x1_t __s1_130 = __p1_130; \
+  float64_t __ret_130; \
+  __ret_130 = __s0_130 * vget_lane_f64(__s1_130, __p2_130); \
+  __ret_130; \
+})
+#else
+#define vmuld_lane_f64(__p0_131, __p1_131, __p2_131) __extension__ ({ \
+  float64_t __s0_131 = __p0_131; \
+  float64x1_t __s1_131 = __p1_131; \
+  float64_t __ret_131; \
+  __ret_131 = __s0_131 * __noswap_vget_lane_f64(__s1_131, __p2_131); \
+  __ret_131; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmuls_lane_f32(__p0_132, __p1_132, __p2_132) __extension__ ({ \
+  float32_t __s0_132 = __p0_132; \
+  float32x2_t __s1_132 = __p1_132; \
+  float32_t __ret_132; \
+  __ret_132 = __s0_132 * vget_lane_f32(__s1_132, __p2_132); \
+  __ret_132; \
+})
+#else
+#define vmuls_lane_f32(__p0_133, __p1_133, __p2_133) __extension__ ({ \
+  float32_t __s0_133 = __p0_133; \
+  float32x2_t __s1_133 = __p1_133; \
+  float32x2_t __rev1_133;  __rev1_133 = __builtin_shufflevector(__s1_133, __s1_133, 1, 0); \
+  float32_t __ret_133; \
+  __ret_133 = __s0_133 * __noswap_vget_lane_f32(__rev1_133, __p2_133); \
+  __ret_133; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vmul_lane_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#else
+#define vmul_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vmul_lane_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmuld_laneq_f64(__p0_134, __p1_134, __p2_134) __extension__ ({ \
+  float64_t __s0_134 = __p0_134; \
+  float64x2_t __s1_134 = __p1_134; \
+  float64_t __ret_134; \
+  __ret_134 = __s0_134 * vgetq_lane_f64(__s1_134, __p2_134); \
+  __ret_134; \
+})
+#else
+#define vmuld_laneq_f64(__p0_135, __p1_135, __p2_135) __extension__ ({ \
+  float64_t __s0_135 = __p0_135; \
+  float64x2_t __s1_135 = __p1_135; \
+  float64x2_t __rev1_135;  __rev1_135 = __builtin_shufflevector(__s1_135, __s1_135, 1, 0); \
+  float64_t __ret_135; \
+  __ret_135 = __s0_135 * __noswap_vgetq_lane_f64(__rev1_135, __p2_135); \
+  __ret_135; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmuls_laneq_f32(__p0_136, __p1_136, __p2_136) __extension__ ({ \
+  float32_t __s0_136 = __p0_136; \
+  float32x4_t __s1_136 = __p1_136; \
+  float32_t __ret_136; \
+  __ret_136 = __s0_136 * vgetq_lane_f32(__s1_136, __p2_136); \
+  __ret_136; \
+})
+#else
+#define vmuls_laneq_f32(__p0_137, __p1_137, __p2_137) __extension__ ({ \
+  float32_t __s0_137 = __p0_137; \
+  float32x4_t __s1_137 = __p1_137; \
+  float32x4_t __rev1_137;  __rev1_137 = __builtin_shufflevector(__s1_137, __s1_137, 3, 2, 1, 0); \
+  float32_t __ret_137; \
+  __ret_137 = __s0_137 * __noswap_vgetq_lane_f32(__rev1_137, __p2_137); \
+  __ret_137; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vmul_laneq_v((int8x8_t)__s0, (int8x16_t)__s1, __p2, 10); \
+  __ret; \
+})
+#else
+#define vmul_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vmul_laneq_v((int8x8_t)__s0, (int8x16_t)__rev1, __p2, 10); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmulq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmul_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \
+  __ret; \
+})
+#else
+#define vmul_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmul_n_f64(float64x1_t __p0, float64_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmul_n_f64((int8x8_t)__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64x1_t vmul_n_f64(float64x1_t __p0, float64_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmul_n_f64((int8x8_t)__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmulq_n_f64(float64x2_t __p0, float64_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 * (float64x2_t) {__p1, __p1};
+  return __ret;
+}
+#else
+__ai float64x2_t vmulq_n_f64(float64x2_t __p0, float64_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 * (float64x2_t) {__p1, __p1};
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vmull_p64(poly64_t __p0, poly64_t __p1) {
+  poly128_t __ret;
+  __ret = (poly128_t) __builtin_neon_vmull_p64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai poly128_t vmull_p64(poly64_t __p0, poly64_t __p1) {
+  poly128_t __ret;
+  __ret = (poly128_t) __builtin_neon_vmull_p64(__p0, __p1);
+  return __ret;
+}
+__ai poly128_t __noswap_vmull_p64(poly64_t __p0, poly64_t __p1) {
+  poly128_t __ret;
+  __ret = (poly128_t) __builtin_neon_vmull_p64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vmull_high_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly16x8_t __ret;
+  __ret = vmull_p8(vget_high_p8(__p0), vget_high_p8(__p1));
+  return __ret;
+}
+#else
+__ai poly16x8_t vmull_high_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __noswap_vmull_p8(__noswap_vget_high_p8(__rev0), __noswap_vget_high_p8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmull_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmull_u8(vget_high_u8(__p0), vget_high_u8(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vmull_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmull_u8(__noswap_vget_high_u8(__rev0), __noswap_vget_high_u8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmull_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmull_u32(vget_high_u32(__p0), vget_high_u32(__p1));
+  return __ret;
+}
+#else
+__ai uint64x2_t vmull_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0), __noswap_vget_high_u32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmull_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmull_u16(vget_high_u16(__p0), vget_high_u16(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vmull_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0), __noswap_vget_high_u16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmull_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = vmull_s8(vget_high_s8(__p0), vget_high_s8(__p1));
+  return __ret;
+}
+#else
+__ai int16x8_t vmull_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmull_s8(__noswap_vget_high_s8(__rev0), __noswap_vget_high_s8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vmull_s32(vget_high_s32(__p0), vget_high_s32(__p1));
+  return __ret;
+}
+#else
+__ai int64x2_t vmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0), __noswap_vget_high_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vmull_s16(vget_high_s16(__p0), vget_high_s16(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0), __noswap_vget_high_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly128_t vmull_high_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly128_t __ret;
+  __ret = vmull_p64((poly64_t)(vget_high_p64(__p0)), (poly64_t)(vget_high_p64(__p1)));
+  return __ret;
+}
+#else
+__ai poly128_t vmull_high_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly128_t __ret;
+  __ret = __noswap_vmull_p64((poly64_t)(__noswap_vget_high_p64(__rev0)), (poly64_t)(__noswap_vget_high_p64(__rev1)));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = vmull_u32(vget_high_u32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_lane_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = vmull_u16(vget_high_u16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_lane_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vmull_s32(vget_high_s32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vmull_s16(vget_high_s16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = vmull_u32(vget_high_u32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __noswap_vmull_u32(__noswap_vget_high_u32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = vmull_u16(vget_high_u16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x8_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __noswap_vmull_u16(__noswap_vget_high_u16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vmull_s32(vget_high_s32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vmull_s32(__noswap_vget_high_s32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_high_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vmull_s16(vget_high_s16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_high_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vmull_s16(__noswap_vget_high_s16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmull_high_n_u32(uint32x4_t __p0, uint32_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmull_n_u32(vget_high_u32(__p0), __p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmull_high_n_u32(uint32x4_t __p0, uint32_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmull_n_u32(__noswap_vget_high_u32(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmull_high_n_u16(uint16x8_t __p0, uint16_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmull_n_u16(vget_high_u16(__p0), __p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmull_high_n_u16(uint16x8_t __p0, uint16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmull_n_u16(__noswap_vget_high_u16(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = vmull_n_s32(vget_high_s32(__p0), __p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmull_n_s32(__noswap_vget_high_s32(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = vmull_n_s16(vget_high_s16(__p0), __p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmull_n_s16(__noswap_vget_high_s16(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint64x2_t __ret; \
+  __ret = vmull_u32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_laneq_u32(__p0, __p1, __p2) __extension__ ({ \
+  uint32x2_t __s0 = __p0; \
+  uint32x4_t __s1 = __p1; \
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __noswap_vmull_u32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint32x4_t __ret; \
+  __ret = vmull_u16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_laneq_u16(__p0, __p1, __p2) __extension__ ({ \
+  uint16x4_t __s0 = __p0; \
+  uint16x8_t __s1 = __p1; \
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __noswap_vmull_u16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vmull_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vmull_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmull_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vmull_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmull_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vmull_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vmulxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vmulxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmulxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float64x2_t __noswap_vmulxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vmulxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vmulxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmulxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai float32x4_t __noswap_vmulxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vmulx_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vmulx_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vmulx_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vmulx_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmulx_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai float32x2_t __noswap_vmulx_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vmulxd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmulxd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64_t vmulxd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmulxd_f64(__p0, __p1);
+  return __ret;
+}
+__ai float64_t __noswap_vmulxd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vmulxd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vmulxs_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmulxs_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float32_t vmulxs_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmulxs_f32(__p0, __p1);
+  return __ret;
+}
+__ai float32_t __noswap_vmulxs_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vmulxs_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxd_lane_f64(__p0_138, __p1_138, __p2_138) __extension__ ({ \
+  float64_t __s0_138 = __p0_138; \
+  float64x1_t __s1_138 = __p1_138; \
+  float64_t __ret_138; \
+  __ret_138 = vmulxd_f64(__s0_138, vget_lane_f64(__s1_138, __p2_138)); \
+  __ret_138; \
+})
+#else
+#define vmulxd_lane_f64(__p0_139, __p1_139, __p2_139) __extension__ ({ \
+  float64_t __s0_139 = __p0_139; \
+  float64x1_t __s1_139 = __p1_139; \
+  float64_t __ret_139; \
+  __ret_139 = __noswap_vmulxd_f64(__s0_139, __noswap_vget_lane_f64(__s1_139, __p2_139)); \
+  __ret_139; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxs_lane_f32(__p0_140, __p1_140, __p2_140) __extension__ ({ \
+  float32_t __s0_140 = __p0_140; \
+  float32x2_t __s1_140 = __p1_140; \
+  float32_t __ret_140; \
+  __ret_140 = vmulxs_f32(__s0_140, vget_lane_f32(__s1_140, __p2_140)); \
+  __ret_140; \
+})
+#else
+#define vmulxs_lane_f32(__p0_141, __p1_141, __p2_141) __extension__ ({ \
+  float32_t __s0_141 = __p0_141; \
+  float32x2_t __s1_141 = __p1_141; \
+  float32x2_t __rev1_141;  __rev1_141 = __builtin_shufflevector(__s1_141, __s1_141, 1, 0); \
+  float32_t __ret_141; \
+  __ret_141 = __noswap_vmulxs_f32(__s0_141, __noswap_vget_lane_f32(__rev1_141, __p2_141)); \
+  __ret_141; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = vmulxq_f64(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __noswap_vmulxq_f64(__rev0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = vmulxq_f32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __noswap_vmulxq_f32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = vmulx_f32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulx_lane_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x2_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __noswap_vmulx_f32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxd_laneq_f64(__p0_142, __p1_142, __p2_142) __extension__ ({ \
+  float64_t __s0_142 = __p0_142; \
+  float64x2_t __s1_142 = __p1_142; \
+  float64_t __ret_142; \
+  __ret_142 = vmulxd_f64(__s0_142, vgetq_lane_f64(__s1_142, __p2_142)); \
+  __ret_142; \
+})
+#else
+#define vmulxd_laneq_f64(__p0_143, __p1_143, __p2_143) __extension__ ({ \
+  float64_t __s0_143 = __p0_143; \
+  float64x2_t __s1_143 = __p1_143; \
+  float64x2_t __rev1_143;  __rev1_143 = __builtin_shufflevector(__s1_143, __s1_143, 1, 0); \
+  float64_t __ret_143; \
+  __ret_143 = __noswap_vmulxd_f64(__s0_143, __noswap_vgetq_lane_f64(__rev1_143, __p2_143)); \
+  __ret_143; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxs_laneq_f32(__p0_144, __p1_144, __p2_144) __extension__ ({ \
+  float32_t __s0_144 = __p0_144; \
+  float32x4_t __s1_144 = __p1_144; \
+  float32_t __ret_144; \
+  __ret_144 = vmulxs_f32(__s0_144, vgetq_lane_f32(__s1_144, __p2_144)); \
+  __ret_144; \
+})
+#else
+#define vmulxs_laneq_f32(__p0_145, __p1_145, __p2_145) __extension__ ({ \
+  float32_t __s0_145 = __p0_145; \
+  float32x4_t __s1_145 = __p1_145; \
+  float32x4_t __rev1_145;  __rev1_145 = __builtin_shufflevector(__s1_145, __s1_145, 3, 2, 1, 0); \
+  float32_t __ret_145; \
+  __ret_145 = __noswap_vmulxs_f32(__s0_145, __noswap_vgetq_lane_f32(__rev1_145, __p2_145)); \
+  __ret_145; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = vmulxq_f64(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_laneq_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = __noswap_vmulxq_f64(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulxq_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __ret; \
+  __ret = vmulxq_f32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulxq_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x4_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x4_t __ret; \
+  __ret = __noswap_vmulxq_f32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __ret; \
+  __ret = vmulx_f32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vmulx_laneq_f32(__p0, __p1, __p2) __extension__ ({ \
+  float32x2_t __s0 = __p0; \
+  float32x4_t __s1 = __p1; \
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  float32x2_t __ret; \
+  __ret = __noswap_vmulx_f32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vnegq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float64x2_t vnegq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vnegq_s64(int64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int64x2_t vnegq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = -__rev0;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vneg_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai float64x1_t vneg_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vneg_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#else
+__ai int64x1_t vneg_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = -__p0;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vnegd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vnegd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vnegd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vnegd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vpaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vpaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vpaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vpaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vpaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vpaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpaddq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpaddq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpaddq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpaddq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vpaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vpaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vpaddd_u64(uint64x2_t __p0) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vpaddd_u64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai uint64_t vpaddd_u64(uint64x2_t __p0) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vpaddd_u64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpaddd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpaddd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpaddd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpaddd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vpaddd_s64(int64x2_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vpaddd_s64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai int64_t vpaddd_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vpaddd_s64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpadds_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpadds_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpadds_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpadds_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vpmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vpmaxq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpmaxq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpmaxq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vpmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vpmaxq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpmaxq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpmaxq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpmaxq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpmaxq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpmaxqd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpmaxqd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpmaxqd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpmaxqd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpmaxs_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmaxs_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpmaxs_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmaxs_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpmaxnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpmaxnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpmaxnmqd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpmaxnmqd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpmaxnmqd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpmaxnmqd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpmaxnms_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmaxnms_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpmaxnms_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmaxnms_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vpminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vpminq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vpminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vpminq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vpminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vpminq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vpminq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vpminq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpminq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpminq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpminq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpminq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vpminq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vpminq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vpminq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vpminq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpminqd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpminqd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpminqd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpminqd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpmins_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmins_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpmins_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpmins_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vpminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vpminnmq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vpminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vpminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vpminnmq_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vpminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vpminnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vpminnm_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vpminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vpminnmqd_f64(float64x2_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpminnmqd_f64((int8x16_t)__p0);
+  return __ret;
+}
+#else
+__ai float64_t vpminnmqd_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vpminnmqd_f64((int8x16_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vpminnms_f32(float32x2_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpminnms_f32((int8x8_t)__p0);
+  return __ret;
+}
+#else
+__ai float32_t vpminnms_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vpminnms_f32((int8x8_t)__rev0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqabsq_s64(int64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqabsq_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqabsq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqabsq_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqabs_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqabs_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqabs_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqabsb_s8(int8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqabsb_s8(__p0);
+  return __ret;
+}
+#else
+__ai int8_t vqabsb_s8(int8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqabsb_s8(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqabss_s32(int32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqabss_s32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vqabss_s32(int32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqabss_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqabsd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqabsd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vqabsd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqabsd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqabsh_s16(int16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqabsh_s16(__p0);
+  return __ret;
+}
+#else
+__ai int16_t vqabsh_s16(int16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqabsh_s16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqaddb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqaddb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vqaddb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqaddb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqadds_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqadds_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vqadds_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqadds_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vqaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqaddd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vqaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqaddd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqaddh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqaddh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vqaddh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqaddh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqaddb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqaddb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vqaddb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqaddb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqadds_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqadds_s32(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqadds_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqaddd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqaddd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqaddh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqaddh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int16_t __noswap_vqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqaddh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqdmlals_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmlals_s32(__p0, __p1, __p2);
+  return __ret;
+}
+#else
+__ai int64_t vqdmlals_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmlals_s32(__p0, __p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqdmlalh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmlalh_s16(__p0, __p1, __p2);
+  return __ret;
+}
+#else
+__ai int32_t vqdmlalh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmlalh_s16(__p0, __p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vqdmlal_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vqdmlal_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlal_s32(__s0, vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlal_s16(__s0, vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlal_s32(__s0, vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlal_s16(__s0, vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = vqdmlal_n_s32(__p0, vget_high_s32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmlal_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = vqdmlal_n_s16(__p0, vget_high_s16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmlal_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlals_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlals_lane_s32(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlals_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlals_lane_s32(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlalh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlalh_lane_s16(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlalh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlalh_lane_s16(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlals_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlals_laneq_s32(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlals_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlals_laneq_s32(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlalh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlalh_laneq_s16(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlalh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlalh_laneq_s16(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlal_s32(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlal_s32(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlal_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlal_s16(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlal_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlal_s16(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqdmlsls_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmlsls_s32(__p0, __p1, __p2);
+  return __ret;
+}
+#else
+__ai int64_t vqdmlsls_s32(int64_t __p0, int32_t __p1, int32_t __p2) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmlsls_s32(__p0, __p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqdmlslh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmlslh_s16(__p0, __p1, __p2);
+  return __ret;
+}
+#else
+__ai int32_t vqdmlslh_s16(int32_t __p0, int16_t __p1, int16_t __p2) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmlslh_s16(__p0, __p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vqdmlsl_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vqdmlsl_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlsl_s32(__s0, vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_high_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlsl_s16(__s0, vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_high_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlsl_s32(__s0, vget_high_s32(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_high_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlsl_s16(__s0, vget_high_s16(__s1), __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_high_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = vqdmlsl_n_s32(__p0, vget_high_s32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmlsl_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = vqdmlsl_n_s16(__p0, vget_high_s16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmlsl_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlsls_lane_s32(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlsls_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlsls_lane_s32(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlslh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlslh_lane_s16(__s0, __s1, (int8x8_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlslh_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlslh_lane_s16(__s0, __s1, (int8x8_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlsls_laneq_s32(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlsls_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqdmlsls_laneq_s32(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlslh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlslh_laneq_s16(__s0, __s1, (int8x16_t)__s2, __p3); \
+  __ret; \
+})
+#else
+#define vqdmlslh_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqdmlslh_laneq_s16(__s0, __s1, (int8x16_t)__rev2, __p3); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = vqdmlsl_s32(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmlsl_s32(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmlsl_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = vqdmlsl_s16(__s0, __s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vqdmlsl_laneq_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmlsl_s16(__rev0, __rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int16_t __noswap_vqdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhs_lane_s32(__p0_146, __p1_146, __p2_146) __extension__ ({ \
+  int32_t __s0_146 = __p0_146; \
+  int32x2_t __s1_146 = __p1_146; \
+  int32_t __ret_146; \
+  __ret_146 = vqdmulhs_s32(__s0_146, vget_lane_s32(__s1_146, __p2_146)); \
+  __ret_146; \
+})
+#else
+#define vqdmulhs_lane_s32(__p0_147, __p1_147, __p2_147) __extension__ ({ \
+  int32_t __s0_147 = __p0_147; \
+  int32x2_t __s1_147 = __p1_147; \
+  int32x2_t __rev1_147;  __rev1_147 = __builtin_shufflevector(__s1_147, __s1_147, 1, 0); \
+  int32_t __ret_147; \
+  __ret_147 = __noswap_vqdmulhs_s32(__s0_147, __noswap_vget_lane_s32(__rev1_147, __p2_147)); \
+  __ret_147; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhh_lane_s16(__p0_148, __p1_148, __p2_148) __extension__ ({ \
+  int16_t __s0_148 = __p0_148; \
+  int16x4_t __s1_148 = __p1_148; \
+  int16_t __ret_148; \
+  __ret_148 = vqdmulhh_s16(__s0_148, vget_lane_s16(__s1_148, __p2_148)); \
+  __ret_148; \
+})
+#else
+#define vqdmulhh_lane_s16(__p0_149, __p1_149, __p2_149) __extension__ ({ \
+  int16_t __s0_149 = __p0_149; \
+  int16x4_t __s1_149 = __p1_149; \
+  int16x4_t __rev1_149;  __rev1_149 = __builtin_shufflevector(__s1_149, __s1_149, 3, 2, 1, 0); \
+  int16_t __ret_149; \
+  __ret_149 = __noswap_vqdmulhh_s16(__s0_149, __noswap_vget_lane_s16(__rev1_149, __p2_149)); \
+  __ret_149; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhs_laneq_s32(__p0_150, __p1_150, __p2_150) __extension__ ({ \
+  int32_t __s0_150 = __p0_150; \
+  int32x4_t __s1_150 = __p1_150; \
+  int32_t __ret_150; \
+  __ret_150 = vqdmulhs_s32(__s0_150, vgetq_lane_s32(__s1_150, __p2_150)); \
+  __ret_150; \
+})
+#else
+#define vqdmulhs_laneq_s32(__p0_151, __p1_151, __p2_151) __extension__ ({ \
+  int32_t __s0_151 = __p0_151; \
+  int32x4_t __s1_151 = __p1_151; \
+  int32x4_t __rev1_151;  __rev1_151 = __builtin_shufflevector(__s1_151, __s1_151, 3, 2, 1, 0); \
+  int32_t __ret_151; \
+  __ret_151 = __noswap_vqdmulhs_s32(__s0_151, __noswap_vgetq_lane_s32(__rev1_151, __p2_151)); \
+  __ret_151; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhh_laneq_s16(__p0_152, __p1_152, __p2_152) __extension__ ({ \
+  int16_t __s0_152 = __p0_152; \
+  int16x8_t __s1_152 = __p1_152; \
+  int16_t __ret_152; \
+  __ret_152 = vqdmulhh_s16(__s0_152, vgetq_lane_s16(__s1_152, __p2_152)); \
+  __ret_152; \
+})
+#else
+#define vqdmulhh_laneq_s16(__p0_153, __p1_153, __p2_153) __extension__ ({ \
+  int16_t __s0_153 = __p0_153; \
+  int16x8_t __s1_153 = __p1_153; \
+  int16x8_t __rev1_153;  __rev1_153 = __builtin_shufflevector(__s1_153, __s1_153, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_153; \
+  __ret_153 = __noswap_vqdmulhh_s16(__s0_153, __noswap_vgetq_lane_s16(__rev1_153, __p2_153)); \
+  __ret_153; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmulhq_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmulhq_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = vqdmulhq_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqdmulhq_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = vqdmulh_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqdmulh_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = vqdmulh_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqdmulh_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqdmulls_s32(int32_t __p0, int32_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmulls_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqdmulls_s32(int32_t __p0, int32_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmulls_s32(__p0, __p1);
+  return __ret;
+}
+__ai int64_t __noswap_vqdmulls_s32(int32_t __p0, int32_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqdmulls_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqdmullh_s16(int16_t __p0, int16_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmullh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqdmullh_s16(int16_t __p0, int16_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmullh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqdmullh_s16(int16_t __p0, int16_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqdmullh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vqdmull_s32(vget_high_s32(__p0), vget_high_s32(__p1));
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmull_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0), __noswap_vget_high_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vqdmull_s16(vget_high_s16(__p0), vget_high_s16(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmull_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0), __noswap_vget_high_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_high_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vqdmull_s32(vget_high_s32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_high_lane_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_high_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmull_s16(vget_high_s16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_high_lane_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_high_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vqdmull_s32(vget_high_s32(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_high_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmull_s32(__noswap_vget_high_s32(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_high_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmull_s16(vget_high_s16(__s0), __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_high_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmull_s16(__noswap_vget_high_s16(__rev0), __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqdmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
+  int64x2_t __ret;
+  __ret = vqdmull_n_s32(vget_high_s32(__p0), __p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vqdmull_high_n_s32(int32x4_t __p0, int32_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vqdmull_n_s32(__noswap_vget_high_s32(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqdmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
+  int32x4_t __ret;
+  __ret = vqdmull_n_s16(vget_high_s16(__p0), __p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vqdmull_high_n_s16(int16x8_t __p0, int16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vqdmull_n_s16(__noswap_vget_high_s16(__rev0), __p1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulls_lane_s32(__p0_154, __p1_154, __p2_154) __extension__ ({ \
+  int32_t __s0_154 = __p0_154; \
+  int32x2_t __s1_154 = __p1_154; \
+  int64_t __ret_154; \
+  __ret_154 = vqdmulls_s32(__s0_154, vget_lane_s32(__s1_154, __p2_154)); \
+  __ret_154; \
+})
+#else
+#define vqdmulls_lane_s32(__p0_155, __p1_155, __p2_155) __extension__ ({ \
+  int32_t __s0_155 = __p0_155; \
+  int32x2_t __s1_155 = __p1_155; \
+  int32x2_t __rev1_155;  __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 1, 0); \
+  int64_t __ret_155; \
+  __ret_155 = __noswap_vqdmulls_s32(__s0_155, __noswap_vget_lane_s32(__rev1_155, __p2_155)); \
+  __ret_155; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmullh_lane_s16(__p0_156, __p1_156, __p2_156) __extension__ ({ \
+  int16_t __s0_156 = __p0_156; \
+  int16x4_t __s1_156 = __p1_156; \
+  int32_t __ret_156; \
+  __ret_156 = vqdmullh_s16(__s0_156, vget_lane_s16(__s1_156, __p2_156)); \
+  __ret_156; \
+})
+#else
+#define vqdmullh_lane_s16(__p0_157, __p1_157, __p2_157) __extension__ ({ \
+  int16_t __s0_157 = __p0_157; \
+  int16x4_t __s1_157 = __p1_157; \
+  int16x4_t __rev1_157;  __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 3, 2, 1, 0); \
+  int32_t __ret_157; \
+  __ret_157 = __noswap_vqdmullh_s16(__s0_157, __noswap_vget_lane_s16(__rev1_157, __p2_157)); \
+  __ret_157; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmulls_laneq_s32(__p0_158, __p1_158, __p2_158) __extension__ ({ \
+  int32_t __s0_158 = __p0_158; \
+  int32x4_t __s1_158 = __p1_158; \
+  int64_t __ret_158; \
+  __ret_158 = vqdmulls_s32(__s0_158, vgetq_lane_s32(__s1_158, __p2_158)); \
+  __ret_158; \
+})
+#else
+#define vqdmulls_laneq_s32(__p0_159, __p1_159, __p2_159) __extension__ ({ \
+  int32_t __s0_159 = __p0_159; \
+  int32x4_t __s1_159 = __p1_159; \
+  int32x4_t __rev1_159;  __rev1_159 = __builtin_shufflevector(__s1_159, __s1_159, 3, 2, 1, 0); \
+  int64_t __ret_159; \
+  __ret_159 = __noswap_vqdmulls_s32(__s0_159, __noswap_vgetq_lane_s32(__rev1_159, __p2_159)); \
+  __ret_159; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmullh_laneq_s16(__p0_160, __p1_160, __p2_160) __extension__ ({ \
+  int16_t __s0_160 = __p0_160; \
+  int16x8_t __s1_160 = __p1_160; \
+  int32_t __ret_160; \
+  __ret_160 = vqdmullh_s16(__s0_160, vgetq_lane_s16(__s1_160, __p2_160)); \
+  __ret_160; \
+})
+#else
+#define vqdmullh_laneq_s16(__p0_161, __p1_161, __p2_161) __extension__ ({ \
+  int16_t __s0_161 = __p0_161; \
+  int16x8_t __s1_161 = __p1_161; \
+  int16x8_t __rev1_161;  __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32_t __ret_161; \
+  __ret_161 = __noswap_vqdmullh_s16(__s0_161, __noswap_vgetq_lane_s16(__rev1_161, __p2_161)); \
+  __ret_161; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int64x2_t __ret; \
+  __ret = vqdmull_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __noswap_vqdmull_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqdmull_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqdmull_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqdmull_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqdmull_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqmovns_s32(int32_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqmovns_s32(__p0);
+  return __ret;
+}
+#else
+__ai int16_t vqmovns_s32(int32_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqmovns_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqmovnd_s64(int64_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqmovnd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vqmovnd_s64(int64_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqmovnd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqmovnh_s16(int16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqmovnh_s16(__p0);
+  return __ret;
+}
+#else
+__ai int8_t vqmovnh_s16(int16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqmovnh_s16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqmovns_u32(uint32_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqmovns_u32(__p0);
+  return __ret;
+}
+#else
+__ai uint16_t vqmovns_u32(uint32_t __p0) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqmovns_u32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqmovnd_u64(uint64_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqmovnd_u64(__p0);
+  return __ret;
+}
+#else
+__ai uint32_t vqmovnd_u64(uint64_t __p0) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqmovnd_u64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqmovnh_u16(uint16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqmovnh_u16(__p0);
+  return __ret;
+}
+#else
+__ai uint8_t vqmovnh_u16(uint16_t __p0) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqmovnh_u16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vqmovn_u32(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vqmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vqmovn_u32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vqmovn_u64(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vqmovn_high_u64(uint32x2_t __p0, uint64x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vqmovn_u64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vqmovn_u16(__p1));
+  return __ret;
+}
+#else
+__ai uint8x16_t vqmovn_high_u16(uint8x8_t __p0, uint16x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vqmovn_u16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vqmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vqmovn_s32(__p1));
+  return __ret;
+}
+#else
+__ai int16x8_t vqmovn_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vqmovn_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vqmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vqmovn_s64(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vqmovn_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vqmovn_s64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vqmovn_s16(__p1));
+  return __ret;
+}
+#else
+__ai int8x16_t vqmovn_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vqmovn_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqmovuns_s32(int32_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqmovuns_s32(__p0);
+  return __ret;
+}
+#else
+__ai int16_t vqmovuns_s32(int32_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqmovuns_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqmovund_s64(int64_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqmovund_s64(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vqmovund_s64(int64_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqmovund_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqmovunh_s16(int16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqmovunh_s16(__p0);
+  return __ret;
+}
+#else
+__ai int8_t vqmovunh_s16(int16_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqmovunh_s16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16((uint16x4_t)(__p0), vqmovun_s32(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16((uint16x4_t)(__rev0), __noswap_vqmovun_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32((uint32x2_t)(__p0), vqmovun_s64(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32((uint32x2_t)(__rev0), __noswap_vqmovun_s64(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8((uint8x8_t)(__p0), vqmovun_s16(__p1));
+  return __ret;
+}
+#else
+__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8((uint8x8_t)(__rev0), __noswap_vqmovun_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vqnegq_s64(int64x2_t __p0) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqnegq_v((int8x16_t)__p0, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vqnegq_s64(int64x2_t __p0) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vqnegq_v((int8x16_t)__rev0, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vqneg_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vqneg_s64(int64x1_t __p0) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vqneg_v((int8x8_t)__p0, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqnegb_s8(int8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqnegb_s8(__p0);
+  return __ret;
+}
+#else
+__ai int8_t vqnegb_s8(int8_t __p0) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqnegb_s8(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqnegs_s32(int32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqnegs_s32(__p0);
+  return __ret;
+}
+#else
+__ai int32_t vqnegs_s32(int32_t __p0) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqnegs_s32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqnegd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqnegd_s64(__p0);
+  return __ret;
+}
+#else
+__ai int64_t vqnegd_s64(int64_t __p0) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqnegd_s64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqnegh_s16(int16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqnegh_s16(__p0);
+  return __ret;
+}
+#else
+__ai int16_t vqnegh_s16(int16_t __p0) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqnegh_s16(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqrdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqrdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqrdmulhs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrdmulhs_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqrdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqrdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int16_t __noswap_vqrdmulhh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrdmulhh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhs_lane_s32(__p0_162, __p1_162, __p2_162) __extension__ ({ \
+  int32_t __s0_162 = __p0_162; \
+  int32x2_t __s1_162 = __p1_162; \
+  int32_t __ret_162; \
+  __ret_162 = vqrdmulhs_s32(__s0_162, vget_lane_s32(__s1_162, __p2_162)); \
+  __ret_162; \
+})
+#else
+#define vqrdmulhs_lane_s32(__p0_163, __p1_163, __p2_163) __extension__ ({ \
+  int32_t __s0_163 = __p0_163; \
+  int32x2_t __s1_163 = __p1_163; \
+  int32x2_t __rev1_163;  __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 1, 0); \
+  int32_t __ret_163; \
+  __ret_163 = __noswap_vqrdmulhs_s32(__s0_163, __noswap_vget_lane_s32(__rev1_163, __p2_163)); \
+  __ret_163; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhh_lane_s16(__p0_164, __p1_164, __p2_164) __extension__ ({ \
+  int16_t __s0_164 = __p0_164; \
+  int16x4_t __s1_164 = __p1_164; \
+  int16_t __ret_164; \
+  __ret_164 = vqrdmulhh_s16(__s0_164, vget_lane_s16(__s1_164, __p2_164)); \
+  __ret_164; \
+})
+#else
+#define vqrdmulhh_lane_s16(__p0_165, __p1_165, __p2_165) __extension__ ({ \
+  int16_t __s0_165 = __p0_165; \
+  int16x4_t __s1_165 = __p1_165; \
+  int16x4_t __rev1_165;  __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 3, 2, 1, 0); \
+  int16_t __ret_165; \
+  __ret_165 = __noswap_vqrdmulhh_s16(__s0_165, __noswap_vget_lane_s16(__rev1_165, __p2_165)); \
+  __ret_165; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhs_laneq_s32(__p0_166, __p1_166, __p2_166) __extension__ ({ \
+  int32_t __s0_166 = __p0_166; \
+  int32x4_t __s1_166 = __p1_166; \
+  int32_t __ret_166; \
+  __ret_166 = vqrdmulhs_s32(__s0_166, vgetq_lane_s32(__s1_166, __p2_166)); \
+  __ret_166; \
+})
+#else
+#define vqrdmulhs_laneq_s32(__p0_167, __p1_167, __p2_167) __extension__ ({ \
+  int32_t __s0_167 = __p0_167; \
+  int32x4_t __s1_167 = __p1_167; \
+  int32x4_t __rev1_167;  __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 3, 2, 1, 0); \
+  int32_t __ret_167; \
+  __ret_167 = __noswap_vqrdmulhs_s32(__s0_167, __noswap_vgetq_lane_s32(__rev1_167, __p2_167)); \
+  __ret_167; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhh_laneq_s16(__p0_168, __p1_168, __p2_168) __extension__ ({ \
+  int16_t __s0_168 = __p0_168; \
+  int16x8_t __s1_168 = __p1_168; \
+  int16_t __ret_168; \
+  __ret_168 = vqrdmulhh_s16(__s0_168, vgetq_lane_s16(__s1_168, __p2_168)); \
+  __ret_168; \
+})
+#else
+#define vqrdmulhh_laneq_s16(__p0_169, __p1_169, __p2_169) __extension__ ({ \
+  int16_t __s0_169 = __p0_169; \
+  int16x8_t __s1_169 = __p1_169; \
+  int16x8_t __rev1_169;  __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_169; \
+  __ret_169 = __noswap_vqrdmulhh_s16(__s0_169, __noswap_vgetq_lane_s16(__rev1_169, __p2_169)); \
+  __ret_169; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __ret; \
+  __ret = vqrdmulhq_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulhq_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __noswap_vqrdmulhq_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __ret; \
+  __ret = vqrdmulhq_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulhq_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x8_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret; \
+  __ret = __noswap_vqrdmulhq_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __ret; \
+  __ret = vqrdmulh_s32(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulh_laneq_s32(__p0, __p1, __p2) __extension__ ({ \
+  int32x2_t __s0 = __p0; \
+  int32x4_t __s1 = __p1; \
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int32x2_t __ret; \
+  __ret = __noswap_vqrdmulh_s32(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __ret; \
+  __ret = vqrdmulh_s16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \
+  __ret; \
+})
+#else
+#define vqrdmulh_laneq_s16(__p0, __p1, __p2) __extension__ ({ \
+  int16x4_t __s0 = __p0; \
+  int16x8_t __s1 = __p1; \
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x4_t __ret; \
+  __ret = __noswap_vqrdmulh_s16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqrshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqrshlb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vqrshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqrshlb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqrshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqrshls_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vqrshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqrshls_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vqrshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqrshld_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vqrshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqrshld_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqrshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqrshlh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vqrshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqrshlh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqrshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqrshlb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vqrshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqrshlb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqrshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrshls_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqrshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqrshls_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqrshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqrshld_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqrshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqrshld_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrshlh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqrshlh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_u32(__p0_170, __p1_170, __p2_170) __extension__ ({ \
+  uint16x4_t __s0_170 = __p0_170; \
+  uint32x4_t __s1_170 = __p1_170; \
+  uint16x8_t __ret_170; \
+  __ret_170 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_170), (uint16x4_t)(vqrshrn_n_u32(__s1_170, __p2_170)))); \
+  __ret_170; \
+})
+#else
+#define vqrshrn_high_n_u32(__p0_171, __p1_171, __p2_171) __extension__ ({ \
+  uint16x4_t __s0_171 = __p0_171; \
+  uint32x4_t __s1_171 = __p1_171; \
+  uint16x4_t __rev0_171;  __rev0_171 = __builtin_shufflevector(__s0_171, __s0_171, 3, 2, 1, 0); \
+  uint32x4_t __rev1_171;  __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \
+  uint16x8_t __ret_171; \
+  __ret_171 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_171), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_171, __p2_171)))); \
+  __ret_171 = __builtin_shufflevector(__ret_171, __ret_171, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_171; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_u64(__p0_172, __p1_172, __p2_172) __extension__ ({ \
+  uint32x2_t __s0_172 = __p0_172; \
+  uint64x2_t __s1_172 = __p1_172; \
+  uint32x4_t __ret_172; \
+  __ret_172 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_172), (uint32x2_t)(vqrshrn_n_u64(__s1_172, __p2_172)))); \
+  __ret_172; \
+})
+#else
+#define vqrshrn_high_n_u64(__p0_173, __p1_173, __p2_173) __extension__ ({ \
+  uint32x2_t __s0_173 = __p0_173; \
+  uint64x2_t __s1_173 = __p1_173; \
+  uint32x2_t __rev0_173;  __rev0_173 = __builtin_shufflevector(__s0_173, __s0_173, 1, 0); \
+  uint64x2_t __rev1_173;  __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 1, 0); \
+  uint32x4_t __ret_173; \
+  __ret_173 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_173), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_173, __p2_173)))); \
+  __ret_173 = __builtin_shufflevector(__ret_173, __ret_173, 3, 2, 1, 0); \
+  __ret_173; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_u16(__p0_174, __p1_174, __p2_174) __extension__ ({ \
+  uint8x8_t __s0_174 = __p0_174; \
+  uint16x8_t __s1_174 = __p1_174; \
+  uint8x16_t __ret_174; \
+  __ret_174 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_174), (uint8x8_t)(vqrshrn_n_u16(__s1_174, __p2_174)))); \
+  __ret_174; \
+})
+#else
+#define vqrshrn_high_n_u16(__p0_175, __p1_175, __p2_175) __extension__ ({ \
+  uint8x8_t __s0_175 = __p0_175; \
+  uint16x8_t __s1_175 = __p1_175; \
+  uint8x8_t __rev0_175;  __rev0_175 = __builtin_shufflevector(__s0_175, __s0_175, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_175;  __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_175; \
+  __ret_175 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_175), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_175, __p2_175)))); \
+  __ret_175 = __builtin_shufflevector(__ret_175, __ret_175, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_175; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_s32(__p0_176, __p1_176, __p2_176) __extension__ ({ \
+  int16x4_t __s0_176 = __p0_176; \
+  int32x4_t __s1_176 = __p1_176; \
+  int16x8_t __ret_176; \
+  __ret_176 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_176), (int16x4_t)(vqrshrn_n_s32(__s1_176, __p2_176)))); \
+  __ret_176; \
+})
+#else
+#define vqrshrn_high_n_s32(__p0_177, __p1_177, __p2_177) __extension__ ({ \
+  int16x4_t __s0_177 = __p0_177; \
+  int32x4_t __s1_177 = __p1_177; \
+  int16x4_t __rev0_177;  __rev0_177 = __builtin_shufflevector(__s0_177, __s0_177, 3, 2, 1, 0); \
+  int32x4_t __rev1_177;  __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 3, 2, 1, 0); \
+  int16x8_t __ret_177; \
+  __ret_177 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_177), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_177, __p2_177)))); \
+  __ret_177 = __builtin_shufflevector(__ret_177, __ret_177, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_177; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_s64(__p0_178, __p1_178, __p2_178) __extension__ ({ \
+  int32x2_t __s0_178 = __p0_178; \
+  int64x2_t __s1_178 = __p1_178; \
+  int32x4_t __ret_178; \
+  __ret_178 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_178), (int32x2_t)(vqrshrn_n_s64(__s1_178, __p2_178)))); \
+  __ret_178; \
+})
+#else
+#define vqrshrn_high_n_s64(__p0_179, __p1_179, __p2_179) __extension__ ({ \
+  int32x2_t __s0_179 = __p0_179; \
+  int64x2_t __s1_179 = __p1_179; \
+  int32x2_t __rev0_179;  __rev0_179 = __builtin_shufflevector(__s0_179, __s0_179, 1, 0); \
+  int64x2_t __rev1_179;  __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 1, 0); \
+  int32x4_t __ret_179; \
+  __ret_179 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_179), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_179, __p2_179)))); \
+  __ret_179 = __builtin_shufflevector(__ret_179, __ret_179, 3, 2, 1, 0); \
+  __ret_179; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrn_high_n_s16(__p0_180, __p1_180, __p2_180) __extension__ ({ \
+  int8x8_t __s0_180 = __p0_180; \
+  int16x8_t __s1_180 = __p1_180; \
+  int8x16_t __ret_180; \
+  __ret_180 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_180), (int8x8_t)(vqrshrn_n_s16(__s1_180, __p2_180)))); \
+  __ret_180; \
+})
+#else
+#define vqrshrn_high_n_s16(__p0_181, __p1_181, __p2_181) __extension__ ({ \
+  int8x8_t __s0_181 = __p0_181; \
+  int16x8_t __s1_181 = __p1_181; \
+  int8x8_t __rev0_181;  __rev0_181 = __builtin_shufflevector(__s0_181, __s0_181, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_181;  __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_181; \
+  __ret_181 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_181), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_181, __p2_181)))); \
+  __ret_181 = __builtin_shufflevector(__ret_181, __ret_181, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_181; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_high_n_s32(__p0_182, __p1_182, __p2_182) __extension__ ({ \
+  int16x4_t __s0_182 = __p0_182; \
+  int32x4_t __s1_182 = __p1_182; \
+  int16x8_t __ret_182; \
+  __ret_182 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_182), (int16x4_t)(vqrshrun_n_s32(__s1_182, __p2_182)))); \
+  __ret_182; \
+})
+#else
+#define vqrshrun_high_n_s32(__p0_183, __p1_183, __p2_183) __extension__ ({ \
+  int16x4_t __s0_183 = __p0_183; \
+  int32x4_t __s1_183 = __p1_183; \
+  int16x4_t __rev0_183;  __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 3, 2, 1, 0); \
+  int32x4_t __rev1_183;  __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 3, 2, 1, 0); \
+  int16x8_t __ret_183; \
+  __ret_183 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_183), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_183, __p2_183)))); \
+  __ret_183 = __builtin_shufflevector(__ret_183, __ret_183, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_183; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_high_n_s64(__p0_184, __p1_184, __p2_184) __extension__ ({ \
+  int32x2_t __s0_184 = __p0_184; \
+  int64x2_t __s1_184 = __p1_184; \
+  int32x4_t __ret_184; \
+  __ret_184 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_184), (int32x2_t)(vqrshrun_n_s64(__s1_184, __p2_184)))); \
+  __ret_184; \
+})
+#else
+#define vqrshrun_high_n_s64(__p0_185, __p1_185, __p2_185) __extension__ ({ \
+  int32x2_t __s0_185 = __p0_185; \
+  int64x2_t __s1_185 = __p1_185; \
+  int32x2_t __rev0_185;  __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 1, 0); \
+  int64x2_t __rev1_185;  __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 1, 0); \
+  int32x4_t __ret_185; \
+  __ret_185 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_185), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_185, __p2_185)))); \
+  __ret_185 = __builtin_shufflevector(__ret_185, __ret_185, 3, 2, 1, 0); \
+  __ret_185; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrun_high_n_s16(__p0_186, __p1_186, __p2_186) __extension__ ({ \
+  int8x8_t __s0_186 = __p0_186; \
+  int16x8_t __s1_186 = __p1_186; \
+  int8x16_t __ret_186; \
+  __ret_186 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_186), (int8x8_t)(vqrshrun_n_s16(__s1_186, __p2_186)))); \
+  __ret_186; \
+})
+#else
+#define vqrshrun_high_n_s16(__p0_187, __p1_187, __p2_187) __extension__ ({ \
+  int8x8_t __s0_187 = __p0_187; \
+  int16x8_t __s1_187 = __p1_187; \
+  int8x8_t __rev0_187;  __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_187;  __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_187; \
+  __ret_187 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_187), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_187, __p2_187)))); \
+  __ret_187 = __builtin_shufflevector(__ret_187, __ret_187, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_187; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \
+  uint8_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshls_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshls_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshls_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshls_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \
+  int8_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_u32(__p0_188, __p1_188, __p2_188) __extension__ ({ \
+  uint16x4_t __s0_188 = __p0_188; \
+  uint32x4_t __s1_188 = __p1_188; \
+  uint16x8_t __ret_188; \
+  __ret_188 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_188), (uint16x4_t)(vqshrn_n_u32(__s1_188, __p2_188)))); \
+  __ret_188; \
+})
+#else
+#define vqshrn_high_n_u32(__p0_189, __p1_189, __p2_189) __extension__ ({ \
+  uint16x4_t __s0_189 = __p0_189; \
+  uint32x4_t __s1_189 = __p1_189; \
+  uint16x4_t __rev0_189;  __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \
+  uint32x4_t __rev1_189;  __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \
+  uint16x8_t __ret_189; \
+  __ret_189 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_189), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_189, __p2_189)))); \
+  __ret_189 = __builtin_shufflevector(__ret_189, __ret_189, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_189; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_u64(__p0_190, __p1_190, __p2_190) __extension__ ({ \
+  uint32x2_t __s0_190 = __p0_190; \
+  uint64x2_t __s1_190 = __p1_190; \
+  uint32x4_t __ret_190; \
+  __ret_190 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_190), (uint32x2_t)(vqshrn_n_u64(__s1_190, __p2_190)))); \
+  __ret_190; \
+})
+#else
+#define vqshrn_high_n_u64(__p0_191, __p1_191, __p2_191) __extension__ ({ \
+  uint32x2_t __s0_191 = __p0_191; \
+  uint64x2_t __s1_191 = __p1_191; \
+  uint32x2_t __rev0_191;  __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \
+  uint64x2_t __rev1_191;  __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \
+  uint32x4_t __ret_191; \
+  __ret_191 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_191), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_191, __p2_191)))); \
+  __ret_191 = __builtin_shufflevector(__ret_191, __ret_191, 3, 2, 1, 0); \
+  __ret_191; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_u16(__p0_192, __p1_192, __p2_192) __extension__ ({ \
+  uint8x8_t __s0_192 = __p0_192; \
+  uint16x8_t __s1_192 = __p1_192; \
+  uint8x16_t __ret_192; \
+  __ret_192 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_192), (uint8x8_t)(vqshrn_n_u16(__s1_192, __p2_192)))); \
+  __ret_192; \
+})
+#else
+#define vqshrn_high_n_u16(__p0_193, __p1_193, __p2_193) __extension__ ({ \
+  uint8x8_t __s0_193 = __p0_193; \
+  uint16x8_t __s1_193 = __p1_193; \
+  uint8x8_t __rev0_193;  __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_193;  __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_193; \
+  __ret_193 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_193), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_193, __p2_193)))); \
+  __ret_193 = __builtin_shufflevector(__ret_193, __ret_193, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_193; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_s32(__p0_194, __p1_194, __p2_194) __extension__ ({ \
+  int16x4_t __s0_194 = __p0_194; \
+  int32x4_t __s1_194 = __p1_194; \
+  int16x8_t __ret_194; \
+  __ret_194 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_194), (int16x4_t)(vqshrn_n_s32(__s1_194, __p2_194)))); \
+  __ret_194; \
+})
+#else
+#define vqshrn_high_n_s32(__p0_195, __p1_195, __p2_195) __extension__ ({ \
+  int16x4_t __s0_195 = __p0_195; \
+  int32x4_t __s1_195 = __p1_195; \
+  int16x4_t __rev0_195;  __rev0_195 = __builtin_shufflevector(__s0_195, __s0_195, 3, 2, 1, 0); \
+  int32x4_t __rev1_195;  __rev1_195 = __builtin_shufflevector(__s1_195, __s1_195, 3, 2, 1, 0); \
+  int16x8_t __ret_195; \
+  __ret_195 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_195), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_195, __p2_195)))); \
+  __ret_195 = __builtin_shufflevector(__ret_195, __ret_195, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_195; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_s64(__p0_196, __p1_196, __p2_196) __extension__ ({ \
+  int32x2_t __s0_196 = __p0_196; \
+  int64x2_t __s1_196 = __p1_196; \
+  int32x4_t __ret_196; \
+  __ret_196 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_196), (int32x2_t)(vqshrn_n_s64(__s1_196, __p2_196)))); \
+  __ret_196; \
+})
+#else
+#define vqshrn_high_n_s64(__p0_197, __p1_197, __p2_197) __extension__ ({ \
+  int32x2_t __s0_197 = __p0_197; \
+  int64x2_t __s1_197 = __p1_197; \
+  int32x2_t __rev0_197;  __rev0_197 = __builtin_shufflevector(__s0_197, __s0_197, 1, 0); \
+  int64x2_t __rev1_197;  __rev1_197 = __builtin_shufflevector(__s1_197, __s1_197, 1, 0); \
+  int32x4_t __ret_197; \
+  __ret_197 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_197), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_197, __p2_197)))); \
+  __ret_197 = __builtin_shufflevector(__ret_197, __ret_197, 3, 2, 1, 0); \
+  __ret_197; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrn_high_n_s16(__p0_198, __p1_198, __p2_198) __extension__ ({ \
+  int8x8_t __s0_198 = __p0_198; \
+  int16x8_t __s1_198 = __p1_198; \
+  int8x16_t __ret_198; \
+  __ret_198 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_198), (int8x8_t)(vqshrn_n_s16(__s1_198, __p2_198)))); \
+  __ret_198; \
+})
+#else
+#define vqshrn_high_n_s16(__p0_199, __p1_199, __p2_199) __extension__ ({ \
+  int8x8_t __s0_199 = __p0_199; \
+  int16x8_t __s1_199 = __p1_199; \
+  int8x8_t __rev0_199;  __rev0_199 = __builtin_shufflevector(__s0_199, __s0_199, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_199;  __rev1_199 = __builtin_shufflevector(__s1_199, __s1_199, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_199; \
+  __ret_199 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_199), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_199, __p2_199)))); \
+  __ret_199 = __builtin_shufflevector(__ret_199, __ret_199, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_199; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \
+  uint32_t __s0 = __p0; \
+  uint16_t __ret; \
+  __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint32_t __ret; \
+  __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \
+  uint16_t __s0 = __p0; \
+  uint8_t __ret; \
+  __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_high_n_s32(__p0_200, __p1_200, __p2_200) __extension__ ({ \
+  int16x4_t __s0_200 = __p0_200; \
+  int32x4_t __s1_200 = __p1_200; \
+  int16x8_t __ret_200; \
+  __ret_200 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_200), (int16x4_t)(vqshrun_n_s32(__s1_200, __p2_200)))); \
+  __ret_200; \
+})
+#else
+#define vqshrun_high_n_s32(__p0_201, __p1_201, __p2_201) __extension__ ({ \
+  int16x4_t __s0_201 = __p0_201; \
+  int32x4_t __s1_201 = __p1_201; \
+  int16x4_t __rev0_201;  __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \
+  int32x4_t __rev1_201;  __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \
+  int16x8_t __ret_201; \
+  __ret_201 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_201), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_201, __p2_201)))); \
+  __ret_201 = __builtin_shufflevector(__ret_201, __ret_201, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_201; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_high_n_s64(__p0_202, __p1_202, __p2_202) __extension__ ({ \
+  int32x2_t __s0_202 = __p0_202; \
+  int64x2_t __s1_202 = __p1_202; \
+  int32x4_t __ret_202; \
+  __ret_202 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_202), (int32x2_t)(vqshrun_n_s64(__s1_202, __p2_202)))); \
+  __ret_202; \
+})
+#else
+#define vqshrun_high_n_s64(__p0_203, __p1_203, __p2_203) __extension__ ({ \
+  int32x2_t __s0_203 = __p0_203; \
+  int64x2_t __s1_203 = __p1_203; \
+  int32x2_t __rev0_203;  __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \
+  int64x2_t __rev1_203;  __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \
+  int32x4_t __ret_203; \
+  __ret_203 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_203), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_203, __p2_203)))); \
+  __ret_203 = __builtin_shufflevector(__ret_203, __ret_203, 3, 2, 1, 0); \
+  __ret_203; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrun_high_n_s16(__p0_204, __p1_204, __p2_204) __extension__ ({ \
+  int8x8_t __s0_204 = __p0_204; \
+  int16x8_t __s1_204 = __p1_204; \
+  int8x16_t __ret_204; \
+  __ret_204 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_204), (int8x8_t)(vqshrun_n_s16(__s1_204, __p2_204)))); \
+  __ret_204; \
+})
+#else
+#define vqshrun_high_n_s16(__p0_205, __p1_205, __p2_205) __extension__ ({ \
+  int8x8_t __s0_205 = __p0_205; \
+  int16x8_t __s1_205 = __p1_205; \
+  int8x8_t __rev0_205;  __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_205;  __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_205; \
+  __ret_205 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_205), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_205, __p2_205)))); \
+  __ret_205 = __builtin_shufflevector(__ret_205, __ret_205, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_205; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshruns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshruns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshruns_n_s32(__p0, __p1) __extension__ ({ \
+  int32_t __s0 = __p0; \
+  int16_t __ret; \
+  __ret = (int16_t) __builtin_neon_vqshruns_n_s32(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrund_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrund_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrund_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int32_t __ret; \
+  __ret = (int32_t) __builtin_neon_vqshrund_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqshrunh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrunh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vqshrunh_n_s16(__p0, __p1) __extension__ ({ \
+  int16_t __s0 = __p0; \
+  int8_t __ret; \
+  __ret = (int8_t) __builtin_neon_vqshrunh_n_s16(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vqsubb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqsubb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vqsubb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vqsubb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vqsubs_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqsubs_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vqsubs_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vqsubs_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vqsubd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqsubd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vqsubd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vqsubd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vqsubh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqsubh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vqsubh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vqsubh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vqsubb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqsubb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vqsubb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vqsubb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqsubs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqsubs_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vqsubs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqsubs_s32(__p0, __p1);
+  return __ret;
+}
+__ai int32_t __noswap_vqsubs_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vqsubs_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vqsubd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqsubd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vqsubd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vqsubd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqsubh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqsubh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vqsubh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqsubh_s16(__p0, __p1);
+  return __ret;
+}
+__ai int16_t __noswap_vqsubh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vqsubh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbl1_p8(poly8x16_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbl1_p8(poly8x16_t __p0, uint8x8_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbl1q_p8(poly8x16_t __p0, uint8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbl1q_p8(poly8x16_t __p0, uint8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbl1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbl1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbl1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbl1_u8(uint8x16_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbl1_u8(uint8x16_t __p0, uint8x8_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbl1_s8(int8x16_t __p0, int8x8_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl1_v((int8x16_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbl2_p8(poly8x16x2_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbl2_p8(poly8x16x2_t __p0, uint8x8_t __p1) {
+  poly8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbl2q_p8(poly8x16x2_t __p0, uint8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbl2q_p8(poly8x16x2_t __p0, uint8x16_t __p1) {
+  poly8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbl2q_u8(uint8x16x2_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbl2q_u8(uint8x16x2_t __p0, uint8x16_t __p1) {
+  uint8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbl2q_s8(int8x16x2_t __p0, int8x16_t __p1) {
+  int8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl2q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbl2_u8(uint8x16x2_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbl2_u8(uint8x16x2_t __p0, uint8x8_t __p1) {
+  uint8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbl2_s8(int8x16x2_t __p0, int8x8_t __p1) {
+  int8x16x2_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl2_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbl3_p8(poly8x16x3_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbl3_p8(poly8x16x3_t __p0, uint8x8_t __p1) {
+  poly8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbl3q_p8(poly8x16x3_t __p0, uint8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbl3q_p8(poly8x16x3_t __p0, uint8x16_t __p1) {
+  poly8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbl3q_u8(uint8x16x3_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbl3q_u8(uint8x16x3_t __p0, uint8x16_t __p1) {
+  uint8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbl3q_s8(int8x16x3_t __p0, int8x16_t __p1) {
+  int8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl3q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbl3_u8(uint8x16x3_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbl3_u8(uint8x16x3_t __p0, uint8x8_t __p1) {
+  uint8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbl3_s8(int8x16x3_t __p0, int8x8_t __p1) {
+  int8x16x3_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl3_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbl4_p8(poly8x16x4_t __p0, uint8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbl4_p8(poly8x16x4_t __p0, uint8x8_t __p1) {
+  poly8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbl4q_p8(poly8x16x4_t __p0, uint8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbl4q_p8(poly8x16x4_t __p0, uint8x16_t __p1) {
+  poly8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbl4q_u8(uint8x16x4_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbl4q_u8(uint8x16x4_t __p0, uint8x16_t __p1) {
+  uint8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbl4q_s8(int8x16x4_t __p0, int8x16_t __p1) {
+  int8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbl4q_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbl4_u8(uint8x16x4_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbl4_u8(uint8x16x4_t __p0, uint8x8_t __p1) {
+  uint8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__p0.val[0], (int8x16_t)__p0.val[1], (int8x16_t)__p0.val[2], (int8x16_t)__p0.val[3], (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbl4_s8(int8x16x4_t __p0, int8x8_t __p1) {
+  int8x16x4_t __rev0;
+  __rev0.val[0] = __builtin_shufflevector(__p0.val[0], __p0.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[1] = __builtin_shufflevector(__p0.val[1], __p0.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[2] = __builtin_shufflevector(__p0.val[2], __p0.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev0.val[3] = __builtin_shufflevector(__p0.val[3], __p0.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbl4_v((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev0.val[2], (int8x16_t)__rev0.val[3], (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbx1_p8(poly8x8_t __p0, poly8x16_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbx1_p8(poly8x8_t __p0, poly8x16_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbx1q_p8(poly8x16_t __p0, poly8x16_t __p1, uint8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbx1q_p8(poly8x16_t __p0, poly8x16_t __p1, uint8x16_t __p2) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbx1q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbx1q_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbx1q_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbx1_u8(uint8x8_t __p0, uint8x16_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbx1_u8(uint8x8_t __p0, uint8x16_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__p0, (int8x16_t)__p1, (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbx1_s8(int8x8_t __p0, int8x16_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx1_v((int8x8_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbx2_p8(poly8x8_t __p0, poly8x16x2_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbx2_p8(poly8x8_t __p0, poly8x16x2_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbx2q_p8(poly8x16_t __p0, poly8x16x2_t __p1, uint8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbx2q_p8(poly8x16_t __p0, poly8x16x2_t __p1, uint8x16_t __p2) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbx2q_u8(uint8x16_t __p0, uint8x16x2_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbx2q_u8(uint8x16_t __p0, uint8x16x2_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbx2q_s8(int8x16_t __p0, int8x16x2_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx2q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbx2_u8(uint8x8_t __p0, uint8x16x2_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbx2_u8(uint8x8_t __p0, uint8x16x2_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbx2_s8(int8x8_t __p0, int8x16x2_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x2_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx2_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbx3_p8(poly8x8_t __p0, poly8x16x3_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbx3_p8(poly8x8_t __p0, poly8x16x3_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbx3q_p8(poly8x16_t __p0, poly8x16x3_t __p1, uint8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbx3q_p8(poly8x16_t __p0, poly8x16x3_t __p1, uint8x16_t __p2) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbx3q_u8(uint8x16_t __p0, uint8x16x3_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbx3q_u8(uint8x16_t __p0, uint8x16x3_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbx3q_s8(int8x16_t __p0, int8x16x3_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx3q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbx3_u8(uint8x8_t __p0, uint8x16x3_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbx3_u8(uint8x8_t __p0, uint8x16x3_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbx3_s8(int8x8_t __p0, int8x16x3_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x3_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx3_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vqtbx4_p8(poly8x8_t __p0, poly8x16x4_t __p1, uint8x8_t __p2) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vqtbx4_p8(poly8x8_t __p0, poly8x16x4_t __p1, uint8x8_t __p2) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vqtbx4q_p8(poly8x16_t __p0, poly8x16x4_t __p1, uint8x16_t __p2) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vqtbx4q_p8(poly8x16_t __p0, poly8x16x4_t __p1, uint8x16_t __p2) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vqtbx4q_u8(uint8x16_t __p0, uint8x16x4_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vqtbx4q_u8(uint8x16_t __p0, uint8x16x4_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x16_t)__p2, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vqtbx4q_s8(int8x16_t __p0, int8x16x4_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vqtbx4q_v((int8x16_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x16_t)__rev2, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vqtbx4_u8(uint8x8_t __p0, uint8x16x4_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vqtbx4_u8(uint8x8_t __p0, uint8x16x4_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__p0, (int8x16_t)__p1.val[0], (int8x16_t)__p1.val[1], (int8x16_t)__p1.val[2], (int8x16_t)__p1.val[3], (int8x8_t)__p2, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vqtbx4_s8(int8x8_t __p0, int8x16x4_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16x4_t __rev1;
+  __rev1.val[0] = __builtin_shufflevector(__p1.val[0], __p1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[1] = __builtin_shufflevector(__p1.val[1], __p1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[2] = __builtin_shufflevector(__p1.val[2], __p1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __rev1.val[3] = __builtin_shufflevector(__p1.val[3], __p1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vqtbx4_v((int8x8_t)__rev0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], (int8x8_t)__rev2, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vraddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vraddhn_u32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vraddhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vraddhn_u32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vraddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vraddhn_u64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vraddhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vraddhn_u64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vraddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vraddhn_u16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint8x16_t vraddhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vraddhn_u16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vraddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vraddhn_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vraddhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vraddhn_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vraddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vraddhn_s64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vraddhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vraddhn_s64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vraddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vraddhn_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int8x16_t vraddhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vraddhn_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vrbit_p8(poly8x8_t __p0) {
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vrbit_v((int8x8_t)__p0, 4);
+  return __ret;
+}
+#else
+__ai poly8x8_t vrbit_p8(poly8x8_t __p0) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = (poly8x8_t) __builtin_neon_vrbit_v((int8x8_t)__rev0, 4);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vrbitq_p8(poly8x16_t __p0) {
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__p0, 36);
+  return __ret;
+}
+#else
+__ai poly8x16_t vrbitq_p8(poly8x16_t __p0) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = (poly8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__rev0, 36);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrbitq_u8(uint8x16_t __p0) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__p0, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vrbitq_u8(uint8x16_t __p0) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__rev0, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrbitq_s8(int8x16_t __p0) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__p0, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vrbitq_s8(int8x16_t __p0) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vrbitq_v((int8x16_t)__rev0, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vrbit_u8(uint8x8_t __p0) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrbit_v((int8x8_t)__p0, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vrbit_u8(uint8x8_t __p0) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vrbit_v((int8x8_t)__rev0, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vrbit_s8(int8x8_t __p0) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrbit_v((int8x8_t)__p0, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vrbit_s8(int8x8_t __p0) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vrbit_v((int8x8_t)__rev0, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrecpeq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrecpeq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrecpe_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrecpe_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrecped_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecped_f64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vrecped_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecped_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrecpes_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpes_f32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vrecpes_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpes_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrecpsq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrecpsq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrecps_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrecps_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrecpsd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecpsd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64_t vrecpsd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecpsd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrecpss_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpss_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float32_t vrecpss_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpss_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrecpxd_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecpxd_f64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vrecpxd_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrecpxd_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrecpxs_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpxs_f32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vrecpxs_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrecpxs_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vrshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vrshld_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vrshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vrshld_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vrshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vrshld_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vrshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vrshld_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vrshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vrshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vrshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vrshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vrshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vrshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_u32(__p0_206, __p1_206, __p2_206) __extension__ ({ \
+  uint16x4_t __s0_206 = __p0_206; \
+  uint32x4_t __s1_206 = __p1_206; \
+  uint16x8_t __ret_206; \
+  __ret_206 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_206), (uint16x4_t)(vrshrn_n_u32(__s1_206, __p2_206)))); \
+  __ret_206; \
+})
+#else
+#define vrshrn_high_n_u32(__p0_207, __p1_207, __p2_207) __extension__ ({ \
+  uint16x4_t __s0_207 = __p0_207; \
+  uint32x4_t __s1_207 = __p1_207; \
+  uint16x4_t __rev0_207;  __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 3, 2, 1, 0); \
+  uint32x4_t __rev1_207;  __rev1_207 = __builtin_shufflevector(__s1_207, __s1_207, 3, 2, 1, 0); \
+  uint16x8_t __ret_207; \
+  __ret_207 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_207), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_207, __p2_207)))); \
+  __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_207; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_u64(__p0_208, __p1_208, __p2_208) __extension__ ({ \
+  uint32x2_t __s0_208 = __p0_208; \
+  uint64x2_t __s1_208 = __p1_208; \
+  uint32x4_t __ret_208; \
+  __ret_208 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_208), (uint32x2_t)(vrshrn_n_u64(__s1_208, __p2_208)))); \
+  __ret_208; \
+})
+#else
+#define vrshrn_high_n_u64(__p0_209, __p1_209, __p2_209) __extension__ ({ \
+  uint32x2_t __s0_209 = __p0_209; \
+  uint64x2_t __s1_209 = __p1_209; \
+  uint32x2_t __rev0_209;  __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 1, 0); \
+  uint64x2_t __rev1_209;  __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 1, 0); \
+  uint32x4_t __ret_209; \
+  __ret_209 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_209), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_209, __p2_209)))); \
+  __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \
+  __ret_209; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_u16(__p0_210, __p1_210, __p2_210) __extension__ ({ \
+  uint8x8_t __s0_210 = __p0_210; \
+  uint16x8_t __s1_210 = __p1_210; \
+  uint8x16_t __ret_210; \
+  __ret_210 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_210), (uint8x8_t)(vrshrn_n_u16(__s1_210, __p2_210)))); \
+  __ret_210; \
+})
+#else
+#define vrshrn_high_n_u16(__p0_211, __p1_211, __p2_211) __extension__ ({ \
+  uint8x8_t __s0_211 = __p0_211; \
+  uint16x8_t __s1_211 = __p1_211; \
+  uint8x8_t __rev0_211;  __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_211;  __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_211; \
+  __ret_211 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_211), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_211, __p2_211)))); \
+  __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_211; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_s32(__p0_212, __p1_212, __p2_212) __extension__ ({ \
+  int16x4_t __s0_212 = __p0_212; \
+  int32x4_t __s1_212 = __p1_212; \
+  int16x8_t __ret_212; \
+  __ret_212 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_212), (int16x4_t)(vrshrn_n_s32(__s1_212, __p2_212)))); \
+  __ret_212; \
+})
+#else
+#define vrshrn_high_n_s32(__p0_213, __p1_213, __p2_213) __extension__ ({ \
+  int16x4_t __s0_213 = __p0_213; \
+  int32x4_t __s1_213 = __p1_213; \
+  int16x4_t __rev0_213;  __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \
+  int32x4_t __rev1_213;  __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \
+  int16x8_t __ret_213; \
+  __ret_213 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_213), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_213, __p2_213)))); \
+  __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_213; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_s64(__p0_214, __p1_214, __p2_214) __extension__ ({ \
+  int32x2_t __s0_214 = __p0_214; \
+  int64x2_t __s1_214 = __p1_214; \
+  int32x4_t __ret_214; \
+  __ret_214 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_214), (int32x2_t)(vrshrn_n_s64(__s1_214, __p2_214)))); \
+  __ret_214; \
+})
+#else
+#define vrshrn_high_n_s64(__p0_215, __p1_215, __p2_215) __extension__ ({ \
+  int32x2_t __s0_215 = __p0_215; \
+  int64x2_t __s1_215 = __p1_215; \
+  int32x2_t __rev0_215;  __rev0_215 = __builtin_shufflevector(__s0_215, __s0_215, 1, 0); \
+  int64x2_t __rev1_215;  __rev1_215 = __builtin_shufflevector(__s1_215, __s1_215, 1, 0); \
+  int32x4_t __ret_215; \
+  __ret_215 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_215), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_215, __p2_215)))); \
+  __ret_215 = __builtin_shufflevector(__ret_215, __ret_215, 3, 2, 1, 0); \
+  __ret_215; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrshrn_high_n_s16(__p0_216, __p1_216, __p2_216) __extension__ ({ \
+  int8x8_t __s0_216 = __p0_216; \
+  int16x8_t __s1_216 = __p1_216; \
+  int8x16_t __ret_216; \
+  __ret_216 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_216), (int8x8_t)(vrshrn_n_s16(__s1_216, __p2_216)))); \
+  __ret_216; \
+})
+#else
+#define vrshrn_high_n_s16(__p0_217, __p1_217, __p2_217) __extension__ ({ \
+  int8x8_t __s0_217 = __p0_217; \
+  int16x8_t __s1_217 = __p1_217; \
+  int8x8_t __rev0_217;  __rev0_217 = __builtin_shufflevector(__s0_217, __s0_217, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_217;  __rev1_217 = __builtin_shufflevector(__s1_217, __s1_217, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_217; \
+  __ret_217 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_217), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_217, __p2_217)))); \
+  __ret_217 = __builtin_shufflevector(__ret_217, __ret_217, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_217; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrsqrteq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrsqrteq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrsqrte_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrsqrte_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrsqrted_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrsqrted_f64(__p0);
+  return __ret;
+}
+#else
+__ai float64_t vrsqrted_f64(float64_t __p0) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrsqrted_f64(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrsqrtes_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrsqrtes_f32(__p0);
+  return __ret;
+}
+#else
+__ai float32_t vrsqrtes_f32(float32_t __p0) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrsqrtes_f32(__p0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vrsqrtsq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vrsqrtsq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vrsqrts_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vrsqrts_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64_t vrsqrtsd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrsqrtsd_f64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float64_t vrsqrtsd_f64(float64_t __p0, float64_t __p1) {
+  float64_t __ret;
+  __ret = (float64_t) __builtin_neon_vrsqrtsd_f64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32_t vrsqrtss_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrsqrtss_f32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai float32_t vrsqrtss_f32(float32_t __p0, float32_t __p1) {
+  float32_t __ret;
+  __ret = (float32_t) __builtin_neon_vrsqrtss_f32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vrsrad_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vrsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vrsrad_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vrsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vrsrad_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vrsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vrsrad_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vrsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vrsubhn_u32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vrsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vrsubhn_u32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vrsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vrsubhn_u64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vrsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vrsubhn_u64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vrsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vrsubhn_u16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint8x16_t vrsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vrsubhn_u16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vrsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vrsubhn_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vrsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vrsubhn_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vrsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vrsubhn_s64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vrsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vrsubhn_s64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vrsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vrsubhn_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int8x16_t vrsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vrsubhn_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#define __noswap_vset_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vset_lane_i64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vsetq_lane_f64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vsetq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vsetq_lane_f64(__s0, (int8x16_t)__rev1, __p2); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#define __noswap_vsetq_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __ret; \
+  __ret = (float64x2_t) __builtin_neon_vsetq_lane_f64(__s0, (int8x16_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#else
+#define vset_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#define __noswap_vset_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64_t __s0 = __p0; \
+  float64x1_t __s1 = __p1; \
+  float64x1_t __ret; \
+  __ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (int8x8_t)__s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vshld_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vshld_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vshld_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vshld_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vshld_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vshld_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshld_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshld_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshld_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshld_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_u8(__p0_218, __p1_218) __extension__ ({ \
+  uint8x16_t __s0_218 = __p0_218; \
+  uint16x8_t __ret_218; \
+  __ret_218 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_218), __p1_218)); \
+  __ret_218; \
+})
+#else
+#define vshll_high_n_u8(__p0_219, __p1_219) __extension__ ({ \
+  uint8x16_t __s0_219 = __p0_219; \
+  uint8x16_t __rev0_219;  __rev0_219 = __builtin_shufflevector(__s0_219, __s0_219, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __ret_219; \
+  __ret_219 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_219), __p1_219)); \
+  __ret_219 = __builtin_shufflevector(__ret_219, __ret_219, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_219; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_u32(__p0_220, __p1_220) __extension__ ({ \
+  uint32x4_t __s0_220 = __p0_220; \
+  uint64x2_t __ret_220; \
+  __ret_220 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_220), __p1_220)); \
+  __ret_220; \
+})
+#else
+#define vshll_high_n_u32(__p0_221, __p1_221) __extension__ ({ \
+  uint32x4_t __s0_221 = __p0_221; \
+  uint32x4_t __rev0_221;  __rev0_221 = __builtin_shufflevector(__s0_221, __s0_221, 3, 2, 1, 0); \
+  uint64x2_t __ret_221; \
+  __ret_221 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_221), __p1_221)); \
+  __ret_221 = __builtin_shufflevector(__ret_221, __ret_221, 1, 0); \
+  __ret_221; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_u16(__p0_222, __p1_222) __extension__ ({ \
+  uint16x8_t __s0_222 = __p0_222; \
+  uint32x4_t __ret_222; \
+  __ret_222 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_222), __p1_222)); \
+  __ret_222; \
+})
+#else
+#define vshll_high_n_u16(__p0_223, __p1_223) __extension__ ({ \
+  uint16x8_t __s0_223 = __p0_223; \
+  uint16x8_t __rev0_223;  __rev0_223 = __builtin_shufflevector(__s0_223, __s0_223, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint32x4_t __ret_223; \
+  __ret_223 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_223), __p1_223)); \
+  __ret_223 = __builtin_shufflevector(__ret_223, __ret_223, 3, 2, 1, 0); \
+  __ret_223; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_s8(__p0_224, __p1_224) __extension__ ({ \
+  int8x16_t __s0_224 = __p0_224; \
+  int16x8_t __ret_224; \
+  __ret_224 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_224), __p1_224)); \
+  __ret_224; \
+})
+#else
+#define vshll_high_n_s8(__p0_225, __p1_225) __extension__ ({ \
+  int8x16_t __s0_225 = __p0_225; \
+  int8x16_t __rev0_225;  __rev0_225 = __builtin_shufflevector(__s0_225, __s0_225, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __ret_225; \
+  __ret_225 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_225), __p1_225)); \
+  __ret_225 = __builtin_shufflevector(__ret_225, __ret_225, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_225; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_s32(__p0_226, __p1_226) __extension__ ({ \
+  int32x4_t __s0_226 = __p0_226; \
+  int64x2_t __ret_226; \
+  __ret_226 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_226), __p1_226)); \
+  __ret_226; \
+})
+#else
+#define vshll_high_n_s32(__p0_227, __p1_227) __extension__ ({ \
+  int32x4_t __s0_227 = __p0_227; \
+  int32x4_t __rev0_227;  __rev0_227 = __builtin_shufflevector(__s0_227, __s0_227, 3, 2, 1, 0); \
+  int64x2_t __ret_227; \
+  __ret_227 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_227), __p1_227)); \
+  __ret_227 = __builtin_shufflevector(__ret_227, __ret_227, 1, 0); \
+  __ret_227; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshll_high_n_s16(__p0_228, __p1_228) __extension__ ({ \
+  int16x8_t __s0_228 = __p0_228; \
+  int32x4_t __ret_228; \
+  __ret_228 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_228), __p1_228)); \
+  __ret_228; \
+})
+#else
+#define vshll_high_n_s16(__p0_229, __p1_229) __extension__ ({ \
+  int16x8_t __s0_229 = __p0_229; \
+  int16x8_t __rev0_229;  __rev0_229 = __builtin_shufflevector(__s0_229, __s0_229, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int32x4_t __ret_229; \
+  __ret_229 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_229), __p1_229)); \
+  __ret_229 = __builtin_shufflevector(__ret_229, __ret_229, 3, 2, 1, 0); \
+  __ret_229; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshrd_n_u64(__p0, __p1) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#else
+#define vshrd_n_s64(__p0, __p1) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_u32(__p0_230, __p1_230, __p2_230) __extension__ ({ \
+  uint16x4_t __s0_230 = __p0_230; \
+  uint32x4_t __s1_230 = __p1_230; \
+  uint16x8_t __ret_230; \
+  __ret_230 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_230), (uint16x4_t)(vshrn_n_u32(__s1_230, __p2_230)))); \
+  __ret_230; \
+})
+#else
+#define vshrn_high_n_u32(__p0_231, __p1_231, __p2_231) __extension__ ({ \
+  uint16x4_t __s0_231 = __p0_231; \
+  uint32x4_t __s1_231 = __p1_231; \
+  uint16x4_t __rev0_231;  __rev0_231 = __builtin_shufflevector(__s0_231, __s0_231, 3, 2, 1, 0); \
+  uint32x4_t __rev1_231;  __rev1_231 = __builtin_shufflevector(__s1_231, __s1_231, 3, 2, 1, 0); \
+  uint16x8_t __ret_231; \
+  __ret_231 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_231), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_231, __p2_231)))); \
+  __ret_231 = __builtin_shufflevector(__ret_231, __ret_231, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_231; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_u64(__p0_232, __p1_232, __p2_232) __extension__ ({ \
+  uint32x2_t __s0_232 = __p0_232; \
+  uint64x2_t __s1_232 = __p1_232; \
+  uint32x4_t __ret_232; \
+  __ret_232 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_232), (uint32x2_t)(vshrn_n_u64(__s1_232, __p2_232)))); \
+  __ret_232; \
+})
+#else
+#define vshrn_high_n_u64(__p0_233, __p1_233, __p2_233) __extension__ ({ \
+  uint32x2_t __s0_233 = __p0_233; \
+  uint64x2_t __s1_233 = __p1_233; \
+  uint32x2_t __rev0_233;  __rev0_233 = __builtin_shufflevector(__s0_233, __s0_233, 1, 0); \
+  uint64x2_t __rev1_233;  __rev1_233 = __builtin_shufflevector(__s1_233, __s1_233, 1, 0); \
+  uint32x4_t __ret_233; \
+  __ret_233 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_233), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_233, __p2_233)))); \
+  __ret_233 = __builtin_shufflevector(__ret_233, __ret_233, 3, 2, 1, 0); \
+  __ret_233; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_u16(__p0_234, __p1_234, __p2_234) __extension__ ({ \
+  uint8x8_t __s0_234 = __p0_234; \
+  uint16x8_t __s1_234 = __p1_234; \
+  uint8x16_t __ret_234; \
+  __ret_234 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_234), (uint8x8_t)(vshrn_n_u16(__s1_234, __p2_234)))); \
+  __ret_234; \
+})
+#else
+#define vshrn_high_n_u16(__p0_235, __p1_235, __p2_235) __extension__ ({ \
+  uint8x8_t __s0_235 = __p0_235; \
+  uint16x8_t __s1_235 = __p1_235; \
+  uint8x8_t __rev0_235;  __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint16x8_t __rev1_235;  __rev1_235 = __builtin_shufflevector(__s1_235, __s1_235, 7, 6, 5, 4, 3, 2, 1, 0); \
+  uint8x16_t __ret_235; \
+  __ret_235 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_235), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_235, __p2_235)))); \
+  __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_235; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_s32(__p0_236, __p1_236, __p2_236) __extension__ ({ \
+  int16x4_t __s0_236 = __p0_236; \
+  int32x4_t __s1_236 = __p1_236; \
+  int16x8_t __ret_236; \
+  __ret_236 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_236), (int16x4_t)(vshrn_n_s32(__s1_236, __p2_236)))); \
+  __ret_236; \
+})
+#else
+#define vshrn_high_n_s32(__p0_237, __p1_237, __p2_237) __extension__ ({ \
+  int16x4_t __s0_237 = __p0_237; \
+  int32x4_t __s1_237 = __p1_237; \
+  int16x4_t __rev0_237;  __rev0_237 = __builtin_shufflevector(__s0_237, __s0_237, 3, 2, 1, 0); \
+  int32x4_t __rev1_237;  __rev1_237 = __builtin_shufflevector(__s1_237, __s1_237, 3, 2, 1, 0); \
+  int16x8_t __ret_237; \
+  __ret_237 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_237), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_237, __p2_237)))); \
+  __ret_237 = __builtin_shufflevector(__ret_237, __ret_237, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_237; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_s64(__p0_238, __p1_238, __p2_238) __extension__ ({ \
+  int32x2_t __s0_238 = __p0_238; \
+  int64x2_t __s1_238 = __p1_238; \
+  int32x4_t __ret_238; \
+  __ret_238 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_238), (int32x2_t)(vshrn_n_s64(__s1_238, __p2_238)))); \
+  __ret_238; \
+})
+#else
+#define vshrn_high_n_s64(__p0_239, __p1_239, __p2_239) __extension__ ({ \
+  int32x2_t __s0_239 = __p0_239; \
+  int64x2_t __s1_239 = __p1_239; \
+  int32x2_t __rev0_239;  __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 1, 0); \
+  int64x2_t __rev1_239;  __rev1_239 = __builtin_shufflevector(__s1_239, __s1_239, 1, 0); \
+  int32x4_t __ret_239; \
+  __ret_239 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_239), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_239, __p2_239)))); \
+  __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 3, 2, 1, 0); \
+  __ret_239; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vshrn_high_n_s16(__p0_240, __p1_240, __p2_240) __extension__ ({ \
+  int8x8_t __s0_240 = __p0_240; \
+  int16x8_t __s1_240 = __p1_240; \
+  int8x16_t __ret_240; \
+  __ret_240 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_240), (int8x8_t)(vshrn_n_s16(__s1_240, __p2_240)))); \
+  __ret_240; \
+})
+#else
+#define vshrn_high_n_s16(__p0_241, __p1_241, __p2_241) __extension__ ({ \
+  int8x8_t __s0_241 = __p0_241; \
+  int16x8_t __s1_241 = __p1_241; \
+  int8x8_t __rev0_241;  __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16x8_t __rev1_241;  __rev1_241 = __builtin_shufflevector(__s1_241, __s1_241, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int8x16_t __ret_241; \
+  __ret_241 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_241), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_241, __p2_241)))); \
+  __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_241; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vslid_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vslid_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vslid_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vslid_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vslid_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vslid_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vslid_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vslid_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsli_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#else
+#define vsli_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsliq_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 38); \
+  __ret; \
+})
+#else
+#define vsliq_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsliq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint8_t vsqaddb_u8(uint8_t __p0, uint8_t __p1) {
+  uint8_t __ret;
+  __ret = (uint8_t) __builtin_neon_vsqaddb_u8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint32_t vsqadds_u32(uint32_t __p0, uint32_t __p1) {
+  uint32_t __ret;
+  __ret = (uint32_t) __builtin_neon_vsqadds_u32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vsqaddd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vsqaddd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint16_t vsqaddh_u16(uint16_t __p0, uint16_t __p1) {
+  uint16_t __ret;
+  __ret = (uint16_t) __builtin_neon_vsqaddh_u16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 48);
+  return __ret;
+}
+#else
+__ai uint8x16_t vsqaddq_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = (uint8x16_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 50);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsqaddq_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsqaddq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 49);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsqaddq_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t) __builtin_neon_vsqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 16);
+  return __ret;
+}
+#else
+__ai uint8x8_t vsqadd_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = (uint8x8_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 16);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 18);
+  return __ret;
+}
+#else
+__ai uint32x2_t vsqadd_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = (uint32x2_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 18);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vsqadd_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 17);
+  return __ret;
+}
+#else
+__ai uint16x4_t vsqadd_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = (uint16x4_t) __builtin_neon_vsqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vsqrtq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai float64x2_t vsqrtq_f64(float64x2_t __p0) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vsqrtq_v((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vsqrtq_f32(float32x4_t __p0) {
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 41);
+  return __ret;
+}
+#else
+__ai float32x4_t vsqrtq_f32(float32x4_t __p0) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = (float32x4_t) __builtin_neon_vsqrtq_v((int8x16_t)__rev0, 41);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vsqrt_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#else
+__ai float64x1_t vsqrt_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 10);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vsqrt_f32(float32x2_t __p0) {
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 9);
+  return __ret;
+}
+#else
+__ai float32x2_t vsqrt_f32(float32x2_t __p0) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __ret;
+  __ret = (float32x2_t) __builtin_neon_vsqrt_v((int8x8_t)__rev0, 9);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vsrad_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vsrad_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vsrad_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vsrad_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vsrad_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vsrad_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsrid_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vsrid_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vsrid_n_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64_t __s0 = __p0; \
+  uint64_t __s1 = __p1; \
+  uint64_t __ret; \
+  __ret = (uint64_t) __builtin_neon_vsrid_n_u64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsrid_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vsrid_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#else
+#define vsrid_n_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64_t __s0 = __p0; \
+  int64_t __s1 = __p1; \
+  int64_t __ret; \
+  __ret = (int64_t) __builtin_neon_vsrid_n_s64(__s0, __s1, __p2); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsri_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#else
+#define vsri_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s0 = __p0; \
+  poly64x1_t __s1 = __p1; \
+  poly64x1_t __ret; \
+  __ret = (poly64x1_t) __builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 6); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsriq_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 38); \
+  __ret; \
+})
+#else
+#define vsriq_n_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s0 = __p0; \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  poly64x2_t __ret; \
+  __ret = (poly64x2_t) __builtin_neon_vsriq_n_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 38); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 6); \
+})
+#else
+#define vst1_p64(__p0, __p1) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 38); \
+})
+#else
+#define vst1q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 42); \
+})
+#else
+#define vst1q_f64(__p0, __p1) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 10); \
+})
+#else
+#define vst1_f64(__p0, __p1) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
+})
+#else
+#define vst1_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 38); \
+})
+#else
+#define vst1q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2_t __s1 = __p1; \
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 42); \
+})
+#else
+#define vst1q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2_t __s1 = __p1; \
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__rev1, __p2, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
+})
+#else
+#define vst1_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1_t __s1 = __p1; \
+  __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
+})
+#else
+#define vst1_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x8x2_t __s1 = __p1; \
+  poly8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p64_x2(__p0, __p1) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
+})
+#else
+#define vst1_p64_x2(__p0, __p1) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \
+})
+#else
+#define vst1_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x4x2_t __s1 = __p1; \
+  poly16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \
+})
+#else
+#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p64_x2(__p0, __p1) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 38); \
+})
+#else
+#define vst1q_p64_x2(__p0, __p1) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \
+})
+#else
+#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \
+  poly16x8x2_t __s1 = __p1; \
+  poly16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \
+})
+#else
+#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \
+})
+#else
+#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x4x2_t __s1 = __p1; \
+  uint32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \
+})
+#else
+#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \
+})
+#else
+#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x8x2_t __s1 = __p1; \
+  uint16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \
+})
+#else
+#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f64_x2(__p0, __p1) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 42); \
+})
+#else
+#define vst1q_f64_x2(__p0, __p1) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 41); \
+})
+#else
+#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x4x2_t __s1 = __p1; \
+  float32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 40); \
+})
+#else
+#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x8x2_t __s1 = __p1; \
+  float16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 34); \
+})
+#else
+#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x4x2_t __s1 = __p1; \
+  int32x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 35); \
+})
+#else
+#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 33); \
+})
+#else
+#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x8x2_t __s1 = __p1; \
+  int16x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \
+})
+#else
+#define vst1_u8_x2(__p0, __p1) __extension__ ({ \
+  uint8x8x2_t __s1 = __p1; \
+  uint8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \
+})
+#else
+#define vst1_u32_x2(__p0, __p1) __extension__ ({ \
+  uint32x2x2_t __s1 = __p1; \
+  uint32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#else
+#define vst1_u64_x2(__p0, __p1) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \
+})
+#else
+#define vst1_u16_x2(__p0, __p1) __extension__ ({ \
+  uint16x4x2_t __s1 = __p1; \
+  uint16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \
+})
+#else
+#define vst1_s8_x2(__p0, __p1) __extension__ ({ \
+  int8x8x2_t __s1 = __p1; \
+  int8x8x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f64_x2(__p0, __p1) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 10); \
+})
+#else
+#define vst1_f64_x2(__p0, __p1) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 9); \
+})
+#else
+#define vst1_f32_x2(__p0, __p1) __extension__ ({ \
+  float32x2x2_t __s1 = __p1; \
+  float32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 8); \
+})
+#else
+#define vst1_f16_x2(__p0, __p1) __extension__ ({ \
+  float16x4x2_t __s1 = __p1; \
+  float16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 2); \
+})
+#else
+#define vst1_s32_x2(__p0, __p1) __extension__ ({ \
+  int32x2x2_t __s1 = __p1; \
+  int32x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#else
+#define vst1_s64_x2(__p0, __p1) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 1); \
+})
+#else
+#define vst1_s16_x2(__p0, __p1) __extension__ ({ \
+  int16x4x2_t __s1 = __p1; \
+  int16x4x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \
+})
+#else
+#define vst1_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x8x3_t __s1 = __p1; \
+  poly8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p64_x3(__p0, __p1) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
+})
+#else
+#define vst1_p64_x3(__p0, __p1) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \
+})
+#else
+#define vst1_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x4x3_t __s1 = __p1; \
+  poly16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \
+})
+#else
+#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p64_x3(__p0, __p1) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 38); \
+})
+#else
+#define vst1q_p64_x3(__p0, __p1) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \
+})
+#else
+#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \
+  poly16x8x3_t __s1 = __p1; \
+  poly16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \
+})
+#else
+#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \
+})
+#else
+#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x4x3_t __s1 = __p1; \
+  uint32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \
+})
+#else
+#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \
+})
+#else
+#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x8x3_t __s1 = __p1; \
+  uint16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \
+})
+#else
+#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f64_x3(__p0, __p1) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 42); \
+})
+#else
+#define vst1q_f64_x3(__p0, __p1) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 41); \
+})
+#else
+#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x4x3_t __s1 = __p1; \
+  float32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 40); \
+})
+#else
+#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x8x3_t __s1 = __p1; \
+  float16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 34); \
+})
+#else
+#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x4x3_t __s1 = __p1; \
+  int32x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 35); \
+})
+#else
+#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 33); \
+})
+#else
+#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x8x3_t __s1 = __p1; \
+  int16x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \
+})
+#else
+#define vst1_u8_x3(__p0, __p1) __extension__ ({ \
+  uint8x8x3_t __s1 = __p1; \
+  uint8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \
+})
+#else
+#define vst1_u32_x3(__p0, __p1) __extension__ ({ \
+  uint32x2x3_t __s1 = __p1; \
+  uint32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#else
+#define vst1_u64_x3(__p0, __p1) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \
+})
+#else
+#define vst1_u16_x3(__p0, __p1) __extension__ ({ \
+  uint16x4x3_t __s1 = __p1; \
+  uint16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \
+})
+#else
+#define vst1_s8_x3(__p0, __p1) __extension__ ({ \
+  int8x8x3_t __s1 = __p1; \
+  int8x8x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f64_x3(__p0, __p1) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \
+})
+#else
+#define vst1_f64_x3(__p0, __p1) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 9); \
+})
+#else
+#define vst1_f32_x3(__p0, __p1) __extension__ ({ \
+  float32x2x3_t __s1 = __p1; \
+  float32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 8); \
+})
+#else
+#define vst1_f16_x3(__p0, __p1) __extension__ ({ \
+  float16x4x3_t __s1 = __p1; \
+  float16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 2); \
+})
+#else
+#define vst1_s32_x3(__p0, __p1) __extension__ ({ \
+  int32x2x3_t __s1 = __p1; \
+  int32x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#else
+#define vst1_s64_x3(__p0, __p1) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 1); \
+})
+#else
+#define vst1_s16_x3(__p0, __p1) __extension__ ({ \
+  int16x4x3_t __s1 = __p1; \
+  int16x4x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \
+})
+#else
+#define vst1_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x8x4_t __s1 = __p1; \
+  poly8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p64_x4(__p0, __p1) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
+})
+#else
+#define vst1_p64_x4(__p0, __p1) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \
+})
+#else
+#define vst1_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x4x4_t __s1 = __p1; \
+  poly16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \
+})
+#else
+#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p64_x4(__p0, __p1) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 38); \
+})
+#else
+#define vst1q_p64_x4(__p0, __p1) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \
+})
+#else
+#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \
+  poly16x8x4_t __s1 = __p1; \
+  poly16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \
+})
+#else
+#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \
+})
+#else
+#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x4x4_t __s1 = __p1; \
+  uint32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \
+})
+#else
+#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \
+})
+#else
+#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x8x4_t __s1 = __p1; \
+  uint16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \
+})
+#else
+#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f64_x4(__p0, __p1) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 42); \
+})
+#else
+#define vst1q_f64_x4(__p0, __p1) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 41); \
+})
+#else
+#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x4x4_t __s1 = __p1; \
+  float32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 41); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 40); \
+})
+#else
+#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x8x4_t __s1 = __p1; \
+  float16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 40); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 34); \
+})
+#else
+#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x4x4_t __s1 = __p1; \
+  int32x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 34); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 35); \
+})
+#else
+#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 33); \
+})
+#else
+#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x8x4_t __s1 = __p1; \
+  int16x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 33); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \
+})
+#else
+#define vst1_u8_x4(__p0, __p1) __extension__ ({ \
+  uint8x8x4_t __s1 = __p1; \
+  uint8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \
+})
+#else
+#define vst1_u32_x4(__p0, __p1) __extension__ ({ \
+  uint32x2x4_t __s1 = __p1; \
+  uint32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#else
+#define vst1_u64_x4(__p0, __p1) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \
+})
+#else
+#define vst1_u16_x4(__p0, __p1) __extension__ ({ \
+  uint16x4x4_t __s1 = __p1; \
+  uint16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \
+})
+#else
+#define vst1_s8_x4(__p0, __p1) __extension__ ({ \
+  int8x8x4_t __s1 = __p1; \
+  int8x8x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f64_x4(__p0, __p1) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \
+})
+#else
+#define vst1_f64_x4(__p0, __p1) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 9); \
+})
+#else
+#define vst1_f32_x4(__p0, __p1) __extension__ ({ \
+  float32x2x4_t __s1 = __p1; \
+  float32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 9); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 8); \
+})
+#else
+#define vst1_f16_x4(__p0, __p1) __extension__ ({ \
+  float16x4x4_t __s1 = __p1; \
+  float16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 8); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 2); \
+})
+#else
+#define vst1_s32_x4(__p0, __p1) __extension__ ({ \
+  int32x2x4_t __s1 = __p1; \
+  int32x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 2); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#else
+#define vst1_s64_x4(__p0, __p1) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 1); \
+})
+#else
+#define vst1_s16_x4(__p0, __p1) __extension__ ({ \
+  int16x4x4_t __s1 = __p1; \
+  int16x4x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \
+  __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
+})
+#else
+#define vst2_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 38); \
+})
+#else
+#define vst2q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \
+})
+#else
+#define vst2q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 42); \
+})
+#else
+#define vst2q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 35); \
+})
+#else
+#define vst2q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_v(__p0, __rev1.val[0], __rev1.val[1], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_f64(__p0, __p1) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 10); \
+})
+#else
+#define vst2_f64(__p0, __p1) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
+})
+#else
+#define vst2_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 36); \
+})
+#else
+#define vst2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x2_t __s1 = __p1; \
+  poly8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 38); \
+})
+#else
+#define vst2q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x2_t __s1 = __p1; \
+  poly64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 48); \
+})
+#else
+#define vst2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x2_t __s1 = __p1; \
+  uint8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 51); \
+})
+#else
+#define vst2q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x2_t __s1 = __p1; \
+  uint64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 32); \
+})
+#else
+#define vst2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x2_t __s1 = __p1; \
+  int8x16x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], __p2, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 42); \
+})
+#else
+#define vst2q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x2_t __s1 = __p1; \
+  float64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  __builtin_neon_vst2q_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 35); \
+})
+#else
+#define vst2q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x2_t __s1 = __p1; \
+  int64x2x2_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __builtin_neon_vst2q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __p2, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
+})
+#else
+#define vst2_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], __p2, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 10); \
+})
+#else
+#define vst2_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 3); \
+})
+#else
+#define vst2_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x2_t __s1 = __p1; \
+  __builtin_neon_vst2_lane_v(__p0, __s1.val[0], __s1.val[1], __p2, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
+})
+#else
+#define vst3_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 38); \
+})
+#else
+#define vst3q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \
+})
+#else
+#define vst3q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 42); \
+})
+#else
+#define vst3q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 35); \
+})
+#else
+#define vst3q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_f64(__p0, __p1) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \
+})
+#else
+#define vst3_f64(__p0, __p1) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
+})
+#else
+#define vst3_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 36); \
+})
+#else
+#define vst3q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x3_t __s1 = __p1; \
+  poly8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 38); \
+})
+#else
+#define vst3q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x3_t __s1 = __p1; \
+  poly64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 48); \
+})
+#else
+#define vst3q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x3_t __s1 = __p1; \
+  uint8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 51); \
+})
+#else
+#define vst3q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x3_t __s1 = __p1; \
+  uint64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], __p2, 32); \
+})
+#else
+#define vst3q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x3_t __s1 = __p1; \
+  int8x16x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], __p2, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 42); \
+})
+#else
+#define vst3q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x3_t __s1 = __p1; \
+  float64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  __builtin_neon_vst3q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 35); \
+})
+#else
+#define vst3q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x3_t __s1 = __p1; \
+  int64x2x3_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __builtin_neon_vst3q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __p2, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
+})
+#else
+#define vst3_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], __p2, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 10); \
+})
+#else
+#define vst3_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 3); \
+})
+#else
+#define vst3_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x3_t __s1 = __p1; \
+  __builtin_neon_vst3_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __p2, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
+})
+#else
+#define vst4_p64(__p0, __p1) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 38); \
+})
+#else
+#define vst4q_p64(__p0, __p1) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \
+})
+#else
+#define vst4q_u64(__p0, __p1) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 42); \
+})
+#else
+#define vst4q_f64(__p0, __p1) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 35); \
+})
+#else
+#define vst4q_s64(__p0, __p1) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_f64(__p0, __p1) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \
+})
+#else
+#define vst4_f64(__p0, __p1) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
+})
+#else
+#define vst4_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 6); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 36); \
+})
+#else
+#define vst4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
+  poly8x16x4_t __s1 = __p1; \
+  poly8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 36); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 38); \
+})
+#else
+#define vst4q_lane_p64(__p0, __p1, __p2) __extension__ ({ \
+  poly64x2x4_t __s1 = __p1; \
+  poly64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 38); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 48); \
+})
+#else
+#define vst4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
+  uint8x16x4_t __s1 = __p1; \
+  uint8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 48); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 51); \
+})
+#else
+#define vst4q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x2x4_t __s1 = __p1; \
+  uint64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 51); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], __p2, 32); \
+})
+#else
+#define vst4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
+  int8x16x4_t __s1 = __p1; \
+  int8x16x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], __p2, 32); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 42); \
+})
+#else
+#define vst4q_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x2x4_t __s1 = __p1; \
+  float64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 42); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  __builtin_neon_vst4q_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 35); \
+})
+#else
+#define vst4q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x2x4_t __s1 = __p1; \
+  int64x2x4_t __rev1; \
+  __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \
+  __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \
+  __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \
+  __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \
+  __builtin_neon_vst4q_lane_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], __p2, 35); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
+})
+#else
+#define vst4_lane_u64(__p0, __p1, __p2) __extension__ ({ \
+  uint64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], __p2, 19); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 10); \
+})
+#else
+#define vst4_lane_f64(__p0, __p1, __p2) __extension__ ({ \
+  float64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 10); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vst4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 3); \
+})
+#else
+#define vst4_lane_s64(__p0, __p1, __p2) __extension__ ({ \
+  int64x1x4_t __s1 = __p1; \
+  __builtin_neon_vst4_lane_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], __p2, 3); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vstrq_p128(__p0, __p1) __extension__ ({ \
+  poly128_t __s1 = __p1; \
+  __builtin_neon_vstrq_p128(__p0, __s1); \
+})
+#else
+#define vstrq_p128(__p0, __p1) __extension__ ({ \
+  poly128_t __s1 = __p1; \
+  __builtin_neon_vstrq_p128(__p0, __s1); \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vsubd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vsubd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vsubd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vsubd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vsubd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vsubd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vsubd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vsubd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vsubq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float64x2_t vsubq_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __rev0 - __rev1;
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x1_t vsub_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#else
+__ai float64x1_t vsub_f64(float64x1_t __p0, float64x1_t __p1) {
+  float64x1_t __ret;
+  __ret = __p0 - __p1;
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x8_t __ret;
+  __ret = vcombine_u16(__p0, vsubhn_u32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubhn_high_u32(uint16x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vcombine_u16(__rev0, __noswap_vsubhn_u32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x4_t __ret;
+  __ret = vcombine_u32(__p0, vsubhn_u64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubhn_high_u64(uint32x2_t __p0, uint64x2_t __p1, uint64x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vcombine_u32(__rev0, __noswap_vsubhn_u64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x16_t __ret;
+  __ret = vcombine_u8(__p0, vsubhn_u16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai uint8x16_t vsubhn_high_u16(uint8x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __noswap_vcombine_u8(__rev0, __noswap_vsubhn_u16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x8_t __ret;
+  __ret = vcombine_s16(__p0, vsubhn_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vsubhn_high_s32(int16x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vcombine_s16(__rev0, __noswap_vsubhn_s32(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x4_t __ret;
+  __ret = vcombine_s32(__p0, vsubhn_s64(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vsubhn_high_s64(int32x2_t __p0, int64x2_t __p1, int64x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vcombine_s32(__rev0, __noswap_vsubhn_s64(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x16_t __ret;
+  __ret = vcombine_s8(__p0, vsubhn_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int8x16_t vsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __noswap_vcombine_s8(__rev0, __noswap_vsubhn_s16(__rev1, __rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmovl_high_u8(__p0) - vmovl_high_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmovl_high_u8(__rev0) - __noswap_vmovl_high_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmovl_high_u32(__p0) - vmovl_high_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmovl_high_u32(__rev0) - __noswap_vmovl_high_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmovl_high_u16(__p0) - vmovl_high_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmovl_high_u16(__rev0) - __noswap_vmovl_high_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = vmovl_high_s8(__p0) - vmovl_high_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vsubl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmovl_high_s8(__rev0) - __noswap_vmovl_high_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vmovl_high_s32(__p0) - vmovl_high_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vsubl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmovl_high_s32(__rev0) - __noswap_vmovl_high_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vmovl_high_s16(__p0) - vmovl_high_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vsubl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmovl_high_s16(__rev0) - __noswap_vmovl_high_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vsubw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 - vmovl_high_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vsubw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vsubw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 - vmovl_high_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vsubw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vsubw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 - vmovl_high_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vsubw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vsubw_high_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 - vmovl_high_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vsubw_high_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vsubw_high_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 - vmovl_high_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vsubw_high_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vsubw_high_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 - vmovl_high_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vsubw_high_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __noswap_vmovl_high_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtrn1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtrn1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vtrn1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai poly16x4_t vtrn1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vtrn1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  return __ret;
+}
+#else
+__ai poly8x16_t vtrn1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vtrn1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai poly64x2_t vtrn1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vtrn1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai poly16x8_t vtrn1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtrn1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtrn1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vtrn1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai uint32x4_t vtrn1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtrn1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtrn1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtrn1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtrn1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vtrn1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  return __ret;
+}
+#else
+__ai int8x16_t vtrn1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vtrn1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float64x2_t vtrn1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vtrn1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai float32x4_t vtrn1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vtrn1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai int32x4_t vtrn1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vtrn1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int64x2_t vtrn1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vtrn1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai int16x8_t vtrn1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtrn1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtrn1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vtrn1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint32x2_t vtrn1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtrn1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtrn1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtrn1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14);
+  return __ret;
+}
+#else
+__ai int8x8_t vtrn1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vtrn1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float32x2_t vtrn1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vtrn1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vtrn1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vtrn1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6);
+  return __ret;
+}
+#else
+__ai int16x4_t vtrn1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vtrn2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai poly8x8_t vtrn2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vtrn2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai poly16x4_t vtrn2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vtrn2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  return __ret;
+}
+#else
+__ai poly8x16_t vtrn2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vtrn2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai poly64x2_t vtrn2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vtrn2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai poly16x8_t vtrn2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vtrn2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  return __ret;
+}
+#else
+__ai uint8x16_t vtrn2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vtrn2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai uint32x4_t vtrn2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtrn2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtrn2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vtrn2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai uint16x8_t vtrn2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vtrn2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  return __ret;
+}
+#else
+__ai int8x16_t vtrn2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vtrn2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float64x2_t vtrn2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vtrn2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai float32x4_t vtrn2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vtrn2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai int32x4_t vtrn2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vtrn2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int64x2_t vtrn2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vtrn2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai int16x8_t vtrn2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vtrn2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai uint8x8_t vtrn2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vtrn2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint32x2_t vtrn2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vtrn2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai uint16x4_t vtrn2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vtrn2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15);
+  return __ret;
+}
+#else
+__ai int8x8_t vtrn2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vtrn2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float32x2_t vtrn2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vtrn2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int32x2_t vtrn2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vtrn2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7);
+  return __ret;
+}
+#else
+__ai int16x4_t vtrn2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vtst_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vtst_p64(poly64x1_t __p0, poly64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtstq_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtstq_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtstq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtstq_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vtstq_s64(int64x2_t __p0, int64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__p0, (int8x16_t)__p1, 51);
+  return __ret;
+}
+#else
+__ai uint64x2_t vtstq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t) __builtin_neon_vtstq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 51);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vtst_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vtst_u64(uint64x1_t __p0, uint64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x1_t vtst_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#else
+__ai uint64x1_t vtst_s64(int64x1_t __p0, int64x1_t __p1) {
+  uint64x1_t __ret;
+  __ret = (uint64x1_t) __builtin_neon_vtst_v((int8x8_t)__p0, (int8x8_t)__p1, 19);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64_t vtstd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vtstd_u64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai uint64_t vtstd_u64(uint64_t __p0, uint64_t __p1) {
+  uint64_t __ret;
+  __ret = (uint64_t) __builtin_neon_vtstd_u64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vtstd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vtstd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int8_t vuqaddb_s8(int8_t __p0, int8_t __p1) {
+  int8_t __ret;
+  __ret = (int8_t) __builtin_neon_vuqaddb_s8(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int32_t vuqadds_s32(int32_t __p0, int32_t __p1) {
+  int32_t __ret;
+  __ret = (int32_t) __builtin_neon_vuqadds_s32(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int64_t vuqaddd_s64(int64_t __p0, int64_t __p1) {
+  int64_t __ret;
+  __ret = (int64_t) __builtin_neon_vuqaddd_s64(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1);
+  return __ret;
+}
+#else
+__ai int16_t vuqaddh_s16(int16_t __p0, int16_t __p1) {
+  int16_t __ret;
+  __ret = (int16_t) __builtin_neon_vuqaddh_s16(__p0, __p1);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 32);
+  return __ret;
+}
+#else
+__ai int8x16_t vuqaddq_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = (int8x16_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 32);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 34);
+  return __ret;
+}
+#else
+__ai int32x4_t vuqaddq_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 34);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 35);
+  return __ret;
+}
+#else
+__ai int64x2_t vuqaddq_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 35);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 33);
+  return __ret;
+}
+#else
+__ai int16x8_t vuqaddq_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t) __builtin_neon_vuqaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 33);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 0);
+  return __ret;
+}
+#else
+__ai int8x8_t vuqadd_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = (int8x8_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 0);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vuqadd_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = (int32x2_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#else
+__ai int64x1_t vuqadd_s64(int64x1_t __p0, int64x1_t __p1) {
+  int64x1_t __ret;
+  __ret = (int64x1_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 3);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__p0, (int8x8_t)__p1, 1);
+  return __ret;
+}
+#else
+__ai int16x4_t vuqadd_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = (int16x4_t) __builtin_neon_vuqadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vuzp1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai poly8x8_t vuzp1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vuzp1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai poly16x4_t vuzp1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vuzp1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  return __ret;
+}
+#else
+__ai poly8x16_t vuzp1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vuzp1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai poly64x2_t vuzp1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vuzp1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai poly16x8_t vuzp1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vuzp1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  return __ret;
+}
+#else
+__ai uint8x16_t vuzp1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vuzp1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai uint32x4_t vuzp1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vuzp1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vuzp1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vuzp1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai uint16x8_t vuzp1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vuzp1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  return __ret;
+}
+#else
+__ai int8x16_t vuzp1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vuzp1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float64x2_t vuzp1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vuzp1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai float32x4_t vuzp1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vuzp1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai int32x4_t vuzp1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vuzp1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int64x2_t vuzp1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vuzp1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai int16x8_t vuzp1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vuzp1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai uint8x8_t vuzp1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vuzp1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint32x2_t vuzp1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vuzp1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai uint16x4_t vuzp1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vuzp1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14);
+  return __ret;
+}
+#else
+__ai int8x8_t vuzp1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vuzp1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float32x2_t vuzp1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vuzp1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vuzp1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vuzp1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6);
+  return __ret;
+}
+#else
+__ai int16x4_t vuzp1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vuzp2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai poly8x8_t vuzp2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vuzp2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai poly16x4_t vuzp2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vuzp2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  return __ret;
+}
+#else
+__ai poly8x16_t vuzp2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vuzp2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai poly64x2_t vuzp2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vuzp2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai poly16x8_t vuzp2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vuzp2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  return __ret;
+}
+#else
+__ai uint8x16_t vuzp2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vuzp2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai uint32x4_t vuzp2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vuzp2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint64x2_t vuzp2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vuzp2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai uint16x8_t vuzp2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vuzp2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  return __ret;
+}
+#else
+__ai int8x16_t vuzp2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vuzp2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float64x2_t vuzp2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vuzp2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai float32x4_t vuzp2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vuzp2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai int32x4_t vuzp2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vuzp2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int64x2_t vuzp2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vuzp2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai int16x8_t vuzp2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vuzp2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai uint8x8_t vuzp2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vuzp2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint32x2_t vuzp2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vuzp2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai uint16x4_t vuzp2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vuzp2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15);
+  return __ret;
+}
+#else
+__ai int8x8_t vuzp2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vuzp2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float32x2_t vuzp2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vuzp2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int32x2_t vuzp2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vuzp2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7);
+  return __ret;
+}
+#else
+__ai int16x4_t vuzp2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vzip1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai poly8x8_t vzip1_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vzip1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai poly16x4_t vzip1_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vzip1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  return __ret;
+}
+#else
+__ai poly8x16_t vzip1q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vzip1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai poly64x2_t vzip1q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vzip1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai poly16x8_t vzip1q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vzip1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  return __ret;
+}
+#else
+__ai uint8x16_t vzip1q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vzip1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai uint32x4_t vzip1q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vzip1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vzip1q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vzip1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai uint16x8_t vzip1q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vzip1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  return __ret;
+}
+#else
+__ai int8x16_t vzip1q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vzip1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float64x2_t vzip1q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vzip1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai float32x4_t vzip1q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vzip1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai int32x4_t vzip1q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vzip1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int64x2_t vzip1q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vzip1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai int16x8_t vzip1q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vzip1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai uint8x8_t vzip1_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vzip1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai uint32x2_t vzip1_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vzip1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai uint16x4_t vzip1_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vzip1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11);
+  return __ret;
+}
+#else
+__ai int8x8_t vzip1_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vzip1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai float32x2_t vzip1_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vzip1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 2);
+  return __ret;
+}
+#else
+__ai int32x2_t vzip1_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vzip1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5);
+  return __ret;
+}
+#else
+__ai int16x4_t vzip1_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x8_t vzip2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai poly8x8_t vzip2_p8(poly8x8_t __p0, poly8x8_t __p1) {
+  poly8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x4_t vzip2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai poly16x4_t vzip2_p16(poly16x4_t __p0, poly16x4_t __p1) {
+  poly16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  poly16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  poly16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly8x16_t vzip2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  return __ret;
+}
+#else
+__ai poly8x16_t vzip2q_p8(poly8x16_t __p0, poly8x16_t __p1) {
+  poly8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly64x2_t vzip2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai poly64x2_t vzip2q_p64(poly64x2_t __p0, poly64x2_t __p1) {
+  poly64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  poly64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  poly64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai poly16x8_t vzip2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai poly16x8_t vzip2q_p16(poly16x8_t __p0, poly16x8_t __p1) {
+  poly16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  poly16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vzip2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  return __ret;
+}
+#else
+__ai uint8x16_t vzip2q_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vzip2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai uint32x4_t vzip2q_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vzip2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint64x2_t vzip2q_u64(uint64x2_t __p0, uint64x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vzip2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai uint16x8_t vzip2q_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vzip2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  return __ret;
+}
+#else
+__ai int8x16_t vzip2q_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float64x2_t vzip2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float64x2_t vzip2q_f64(float64x2_t __p0, float64x2_t __p1) {
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x4_t vzip2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai float32x4_t vzip2q_f32(float32x4_t __p0, float32x4_t __p1) {
+  float32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  float32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  float32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vzip2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai int32x4_t vzip2q_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vzip2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int64x2_t vzip2q_s64(int64x2_t __p0, int64x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int64x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vzip2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai int16x8_t vzip2q_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vzip2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai uint8x8_t vzip2_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vzip2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai uint32x2_t vzip2_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vzip2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai uint16x4_t vzip2_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vzip2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15);
+  return __ret;
+}
+#else
+__ai int8x8_t vzip2_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai float32x2_t vzip2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai float32x2_t vzip2_f32(float32x2_t __p0, float32x2_t __p1) {
+  float32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  float32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  float32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vzip2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 1, 3);
+  return __ret;
+}
+#else
+__ai int32x2_t vzip2_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vzip2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7);
+  return __ret;
+}
+#else
+__ai int16x4_t vzip2_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x16_t vabaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __ret;
+  __ret = __p0 + vabdq_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint8x16_t vabaq_u8(uint8x16_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __ret;
+  __ret = __rev0 + __noswap_vabdq_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + vabdq_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vabaq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vabdq_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + vabdq_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x8_t vabaq_u16(uint16x8_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vabdq_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x16_t vabaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __ret;
+  __ret = __p0 + vabdq_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int8x16_t vabaq_s8(int8x16_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __ret;
+  __ret = __rev0 + __noswap_vabdq_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + vabdq_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vabaq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vabdq_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + vabdq_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x8_t vabaq_s16(int16x8_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vabdq_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint8x8_t vaba_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __ret;
+  __ret = __p0 + vabd_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint8x8_t vaba_u8(uint8x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __ret;
+  __ret = __rev0 + __noswap_vabd_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x2_t vaba_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __ret;
+  __ret = __p0 + vabd_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x2_t vaba_u32(uint32x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint32x2_t __ret;
+  __ret = __rev0 + __noswap_vabd_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x4_t vaba_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __ret;
+  __ret = __p0 + vabd_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x4_t vaba_u16(uint16x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint16x4_t __ret;
+  __ret = __rev0 + __noswap_vabd_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int8x8_t vaba_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __ret;
+  __ret = __p0 + vabd_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int8x8_t vaba_s8(int8x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __ret;
+  __ret = __rev0 + __noswap_vabd_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x2_t vaba_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __ret;
+  __ret = __p0 + vabd_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x2_t vaba_s32(int32x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int32x2_t __ret;
+  __ret = __rev0 + __noswap_vabd_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x4_t vaba_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __ret;
+  __ret = __p0 + vabd_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x4_t vaba_s16(int16x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int16x4_t __ret;
+  __ret = __rev0 + __noswap_vabd_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabdl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(vmovl_u8((uint8x8_t)(vabd_u8(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai uint16x8_t vabdl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_u8(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vabdl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = (uint16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_u8(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vabdl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(vmovl_u32((uint32x2_t)(vabd_u32(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai uint64x2_t vabdl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_u32(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vabdl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = (uint64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_u32(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabdl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(vmovl_u16((uint16x4_t)(vabd_u16(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai uint32x4_t vabdl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_u16(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vabdl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = (uint32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_u16(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabdl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(vmovl_u8((uint8x8_t)(vabd_s8(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai int16x8_t vabdl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_s8(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vabdl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = (int16x8_t)(__noswap_vmovl_u8((uint8x8_t)(__noswap_vabd_s8(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabdl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(vmovl_u32((uint32x2_t)(vabd_s32(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai int64x2_t vabdl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_s32(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vabdl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = (int64x2_t)(__noswap_vmovl_u32((uint32x2_t)(__noswap_vabd_s32(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabdl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(vmovl_u16((uint16x4_t)(vabd_s16(__p0, __p1))));
+  return __ret;
+}
+#else
+__ai int32x4_t vabdl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_s16(__rev0, __rev1))));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vabdl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = (int32x4_t)(__noswap_vmovl_u16((uint16x4_t)(__noswap_vabd_s16(__p0, __p1))));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmovl_u8(__p0) + vmovl_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddl_u8(uint8x8_t __p0, uint8x8_t __p1) {
+  uint8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmovl_u8(__rev0) + __noswap_vmovl_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmovl_u32(__p0) + vmovl_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddl_u32(uint32x2_t __p0, uint32x2_t __p1) {
+  uint32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmovl_u32(__rev0) + __noswap_vmovl_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmovl_u16(__p0) + vmovl_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddl_u16(uint16x4_t __p0, uint16x4_t __p1) {
+  uint16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmovl_u16(__rev0) + __noswap_vmovl_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = vmovl_s8(__p0) + vmovl_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vaddl_s8(int8x8_t __p0, int8x8_t __p1) {
+  int8x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmovl_s8(__rev0) + __noswap_vmovl_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = vmovl_s32(__p0) + vmovl_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vaddl_s32(int32x2_t __p0, int32x2_t __p1) {
+  int32x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmovl_s32(__rev0) + __noswap_vmovl_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = vmovl_s16(__p0) + vmovl_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vaddl_s16(int16x4_t __p0, int16x4_t __p1) {
+  int16x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmovl_s16(__rev0) + __noswap_vmovl_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddw_u8(uint16x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 + vmovl_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddw_u8(uint16x8_t __p0, uint8x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vmovl_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddw_u32(uint64x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 + vmovl_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddw_u32(uint64x2_t __p0, uint32x2_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vmovl_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddw_u16(uint32x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 + vmovl_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddw_u16(uint32x4_t __p0, uint16x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vmovl_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddw_s8(int16x8_t __p0, int8x8_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 + vmovl_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vaddw_s8(int16x8_t __p0, int8x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vmovl_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddw_s32(int64x2_t __p0, int32x2_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 + vmovl_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vaddw_s32(int64x2_t __p0, int32x2_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vmovl_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddw_s16(int32x4_t __p0, int16x4_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 + vmovl_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vaddw_s16(int32x4_t __p0, int16x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vmovl_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vget_lane_f16(__p0_242, __p1_242) __extension__ ({ \
+  float16x4_t __s0_242 = __p0_242; \
+  float16_t __ret_242; \
+float16x4_t __reint_242 = __s0_242; \
+int16_t __reint1_242 = vget_lane_s16(*(int16x4_t *) &__reint_242, __p1_242); \
+  __ret_242 = *(float16_t *) &__reint1_242; \
+  __ret_242; \
+})
+#else
+#define vget_lane_f16(__p0_243, __p1_243) __extension__ ({ \
+  float16x4_t __s0_243 = __p0_243; \
+  float16x4_t __rev0_243;  __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 3, 2, 1, 0); \
+  float16_t __ret_243; \
+float16x4_t __reint_243 = __rev0_243; \
+int16_t __reint1_243 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_243, __p1_243); \
+  __ret_243 = *(float16_t *) &__reint1_243; \
+  __ret_243; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vgetq_lane_f16(__p0_244, __p1_244) __extension__ ({ \
+  float16x8_t __s0_244 = __p0_244; \
+  float16_t __ret_244; \
+float16x8_t __reint_244 = __s0_244; \
+int16_t __reint1_244 = vgetq_lane_s16(*(int16x8_t *) &__reint_244, __p1_244); \
+  __ret_244 = *(float16_t *) &__reint1_244; \
+  __ret_244; \
+})
+#else
+#define vgetq_lane_f16(__p0_245, __p1_245) __extension__ ({ \
+  float16x8_t __s0_245 = __p0_245; \
+  float16x8_t __rev0_245;  __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16_t __ret_245; \
+float16x8_t __reint_245 = __rev0_245; \
+int16_t __reint1_245 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_245, __p1_245); \
+  __ret_245 = *(float16_t *) &__reint1_245; \
+  __ret_245; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + vmull_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vmull_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vmlal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + __noswap_vmull_u8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + vmull_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vmull_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmlal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + __noswap_vmull_u32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + vmull_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vmull_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmlal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __noswap_vmull_u16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + vmull_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x8_t vmlal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vmull_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vmlal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + __noswap_vmull_s8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + vmull_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vmull_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmlal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + __noswap_vmull_s32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + vmull_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vmull_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmlal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __noswap_vmull_s16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 + vmull_u32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 + vmull_u16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_u16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 + vmull_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlal_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 + vmull_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlal_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 + __noswap_vmull_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlal_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlal_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vmull_u32(__rev1, (uint32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmlal_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + __noswap_vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlal_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlal_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vmull_u16(__rev1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmlal_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __noswap_vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + vmull_s32(__p1, (int32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai int64x2_t vmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vmull_s32(__rev1, (int32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmlal_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + __noswap_vmull_s32(__p1, (int32x2_t) {__p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai int32x4_t vmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vmull_s16(__rev1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmlal_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __noswap_vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlsl_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 - vmull_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlsl_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 - __noswap_vmull_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vmlsl_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 - __noswap_vmull_u8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlsl_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 - vmull_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlsl_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __noswap_vmull_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmlsl_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 - __noswap_vmull_u32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsl_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - vmull_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsl_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __noswap_vmull_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmlsl_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - __noswap_vmull_u16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlsl_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 - vmull_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x8_t vmlsl_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 - __noswap_vmull_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vmlsl_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 - __noswap_vmull_s8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 - vmull_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __noswap_vmull_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmlsl_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 - __noswap_vmull_s32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - vmull_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __noswap_vmull_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmlsl_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - __noswap_vmull_s16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __ret; \
+  __ret = __s0 - vmull_u32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint64x2_t __s0 = __p0; \
+  uint32x2_t __s1 = __p1; \
+  uint32x2_t __s2 = __p2; \
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  uint64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __ret; \
+  __ret = __s0 - vmull_u16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_lane_u16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  uint32x4_t __s0 = __p0; \
+  uint16x4_t __s1 = __p1; \
+  uint16x4_t __s2 = __p2; \
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  uint32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_u16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __ret; \
+  __ret = __s0 - vmull_s32(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int64x2_t __s0 = __p0; \
+  int32x2_t __s1 = __p1; \
+  int32x2_t __s2 = __p2; \
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 1, 0); \
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 1, 0); \
+  int64x2_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s32(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmlsl_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __ret; \
+  __ret = __s0 - vmull_s16(__s1, __builtin_shufflevector(__s2, __s2, __p3, __p3, __p3, __p3)); \
+  __ret; \
+})
+#else
+#define vmlsl_lane_s16(__p0, __p1, __p2, __p3) __extension__ ({ \
+  int32x4_t __s0 = __p0; \
+  int16x4_t __s1 = __p1; \
+  int16x4_t __s2 = __p2; \
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \
+  int32x4_t __ret; \
+  __ret = __rev0 - __noswap_vmull_s16(__rev1, __builtin_shufflevector(__rev2, __rev2, __p3, __p3, __p3, __p3)); \
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
+  __ret; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlsl_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 - vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlsl_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 - __noswap_vmull_u32(__rev1, (uint32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vmlsl_n_u32(uint64x2_t __p0, uint32x2_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 - __noswap_vmull_u32(__p1, (uint32x2_t) {__p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsl_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsl_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 - __noswap_vmull_u16(__rev1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vmlsl_n_u16(uint32x4_t __p0, uint16x4_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 - __noswap_vmull_u16(__p1, (uint16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 - vmull_s32(__p1, (int32x2_t) {__p2, __p2});
+  return __ret;
+}
+#else
+__ai int64x2_t vmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 - __noswap_vmull_s32(__rev1, (int32x2_t) {__p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vmlsl_n_s32(int64x2_t __p0, int32x2_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 - __noswap_vmull_s32(__p1, (int32x2_t) {__p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 - __noswap_vmull_s16(__rev1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 - __noswap_vmull_s16(__p1, (int16x4_t) {__p2, __p2, __p2, __p2});
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vset_lane_f16(__p0_246, __p1_246, __p2_246) __extension__ ({ \
+  float16_t __s0_246 = __p0_246; \
+  float16x4_t __s1_246 = __p1_246; \
+  float16x4_t __ret_246; \
+float16_t __reint_246 = __s0_246; \
+float16x4_t __reint1_246 = __s1_246; \
+int16x4_t __reint2_246 = vset_lane_s16(*(int16_t *) &__reint_246, *(int16x4_t *) &__reint1_246, __p2_246); \
+  __ret_246 = *(float16x4_t *) &__reint2_246; \
+  __ret_246; \
+})
+#else
+#define vset_lane_f16(__p0_247, __p1_247, __p2_247) __extension__ ({ \
+  float16_t __s0_247 = __p0_247; \
+  float16x4_t __s1_247 = __p1_247; \
+  float16x4_t __rev1_247;  __rev1_247 = __builtin_shufflevector(__s1_247, __s1_247, 3, 2, 1, 0); \
+  float16x4_t __ret_247; \
+float16_t __reint_247 = __s0_247; \
+float16x4_t __reint1_247 = __rev1_247; \
+int16x4_t __reint2_247 = __noswap_vset_lane_s16(*(int16_t *) &__reint_247, *(int16x4_t *) &__reint1_247, __p2_247); \
+  __ret_247 = *(float16x4_t *) &__reint2_247; \
+  __ret_247 = __builtin_shufflevector(__ret_247, __ret_247, 3, 2, 1, 0); \
+  __ret_247; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vsetq_lane_f16(__p0_248, __p1_248, __p2_248) __extension__ ({ \
+  float16_t __s0_248 = __p0_248; \
+  float16x8_t __s1_248 = __p1_248; \
+  float16x8_t __ret_248; \
+float16_t __reint_248 = __s0_248; \
+float16x8_t __reint1_248 = __s1_248; \
+int16x8_t __reint2_248 = vsetq_lane_s16(*(int16_t *) &__reint_248, *(int16x8_t *) &__reint1_248, __p2_248); \
+  __ret_248 = *(float16x8_t *) &__reint2_248; \
+  __ret_248; \
+})
+#else
+#define vsetq_lane_f16(__p0_249, __p1_249, __p2_249) __extension__ ({ \
+  float16_t __s0_249 = __p0_249; \
+  float16x8_t __s1_249 = __p1_249; \
+  float16x8_t __rev1_249;  __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 7, 6, 5, 4, 3, 2, 1, 0); \
+  float16x8_t __ret_249; \
+float16_t __reint_249 = __s0_249; \
+float16x8_t __reint1_249 = __rev1_249; \
+int16x8_t __reint2_249 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_249, *(int16x8_t *) &__reint1_249, __p2_249); \
+  __ret_249 = *(float16x8_t *) &__reint2_249; \
+  __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 7, 6, 5, 4, 3, 2, 1, 0); \
+  __ret_249; \
+})
+#endif
+
+#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqrdmlahs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
+  int32_t __ret;
+  __ret = vqadds_s32(__p0, vqrdmulhs_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32_t vqrdmlahs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
+  int32_t __ret;
+  __ret = __noswap_vqadds_s32(__p0, __noswap_vqrdmulhs_s32(__p1, __p2));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqrdmlahh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
+  int16_t __ret;
+  __ret = vqaddh_s16(__p0, vqrdmulhh_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16_t vqrdmlahh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
+  int16_t __ret;
+  __ret = __noswap_vqaddh_s16(__p0, __noswap_vqrdmulhh_s16(__p1, __p2));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahs_lane_s32(__p0_250, __p1_250, __p2_250, __p3_250) __extension__ ({ \
+  int32_t __s0_250 = __p0_250; \
+  int32_t __s1_250 = __p1_250; \
+  int32x2_t __s2_250 = __p2_250; \
+  int32_t __ret_250; \
+  __ret_250 = vqadds_s32(__s0_250, vqrdmulhs_s32(__s1_250, vget_lane_s32(__s2_250, __p3_250))); \
+  __ret_250; \
+})
+#else
+#define vqrdmlahs_lane_s32(__p0_251, __p1_251, __p2_251, __p3_251) __extension__ ({ \
+  int32_t __s0_251 = __p0_251; \
+  int32_t __s1_251 = __p1_251; \
+  int32x2_t __s2_251 = __p2_251; \
+  int32x2_t __rev2_251;  __rev2_251 = __builtin_shufflevector(__s2_251, __s2_251, 1, 0); \
+  int32_t __ret_251; \
+  __ret_251 = __noswap_vqadds_s32(__s0_251, __noswap_vqrdmulhs_s32(__s1_251, __noswap_vget_lane_s32(__rev2_251, __p3_251))); \
+  __ret_251; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahh_lane_s16(__p0_252, __p1_252, __p2_252, __p3_252) __extension__ ({ \
+  int16_t __s0_252 = __p0_252; \
+  int16_t __s1_252 = __p1_252; \
+  int16x4_t __s2_252 = __p2_252; \
+  int16_t __ret_252; \
+  __ret_252 = vqaddh_s16(__s0_252, vqrdmulhh_s16(__s1_252, vget_lane_s16(__s2_252, __p3_252))); \
+  __ret_252; \
+})
+#else
+#define vqrdmlahh_lane_s16(__p0_253, __p1_253, __p2_253, __p3_253) __extension__ ({ \
+  int16_t __s0_253 = __p0_253; \
+  int16_t __s1_253 = __p1_253; \
+  int16x4_t __s2_253 = __p2_253; \
+  int16x4_t __rev2_253;  __rev2_253 = __builtin_shufflevector(__s2_253, __s2_253, 3, 2, 1, 0); \
+  int16_t __ret_253; \
+  __ret_253 = __noswap_vqaddh_s16(__s0_253, __noswap_vqrdmulhh_s16(__s1_253, __noswap_vget_lane_s16(__rev2_253, __p3_253))); \
+  __ret_253; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahs_laneq_s32(__p0_254, __p1_254, __p2_254, __p3_254) __extension__ ({ \
+  int32_t __s0_254 = __p0_254; \
+  int32_t __s1_254 = __p1_254; \
+  int32x4_t __s2_254 = __p2_254; \
+  int32_t __ret_254; \
+  __ret_254 = vqadds_s32(__s0_254, vqrdmulhs_s32(__s1_254, vgetq_lane_s32(__s2_254, __p3_254))); \
+  __ret_254; \
+})
+#else
+#define vqrdmlahs_laneq_s32(__p0_255, __p1_255, __p2_255, __p3_255) __extension__ ({ \
+  int32_t __s0_255 = __p0_255; \
+  int32_t __s1_255 = __p1_255; \
+  int32x4_t __s2_255 = __p2_255; \
+  int32x4_t __rev2_255;  __rev2_255 = __builtin_shufflevector(__s2_255, __s2_255, 3, 2, 1, 0); \
+  int32_t __ret_255; \
+  __ret_255 = __noswap_vqadds_s32(__s0_255, __noswap_vqrdmulhs_s32(__s1_255, __noswap_vgetq_lane_s32(__rev2_255, __p3_255))); \
+  __ret_255; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlahh_laneq_s16(__p0_256, __p1_256, __p2_256, __p3_256) __extension__ ({ \
+  int16_t __s0_256 = __p0_256; \
+  int16_t __s1_256 = __p1_256; \
+  int16x8_t __s2_256 = __p2_256; \
+  int16_t __ret_256; \
+  __ret_256 = vqaddh_s16(__s0_256, vqrdmulhh_s16(__s1_256, vgetq_lane_s16(__s2_256, __p3_256))); \
+  __ret_256; \
+})
+#else
+#define vqrdmlahh_laneq_s16(__p0_257, __p1_257, __p2_257, __p3_257) __extension__ ({ \
+  int16_t __s0_257 = __p0_257; \
+  int16_t __s1_257 = __p1_257; \
+  int16x8_t __s2_257 = __p2_257; \
+  int16x8_t __rev2_257;  __rev2_257 = __builtin_shufflevector(__s2_257, __s2_257, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_257; \
+  __ret_257 = __noswap_vqaddh_s16(__s0_257, __noswap_vqrdmulhh_s16(__s1_257, __noswap_vgetq_lane_s16(__rev2_257, __p3_257))); \
+  __ret_257; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32_t vqrdmlshs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
+  int32_t __ret;
+  __ret = vqsubs_s32(__p0, vqrdmulhs_s32(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int32_t vqrdmlshs_s32(int32_t __p0, int32_t __p1, int32_t __p2) {
+  int32_t __ret;
+  __ret = __noswap_vqsubs_s32(__p0, __noswap_vqrdmulhs_s32(__p1, __p2));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16_t vqrdmlshh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
+  int16_t __ret;
+  __ret = vqsubh_s16(__p0, vqrdmulhh_s16(__p1, __p2));
+  return __ret;
+}
+#else
+__ai int16_t vqrdmlshh_s16(int16_t __p0, int16_t __p1, int16_t __p2) {
+  int16_t __ret;
+  __ret = __noswap_vqsubh_s16(__p0, __noswap_vqrdmulhh_s16(__p1, __p2));
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshs_lane_s32(__p0_258, __p1_258, __p2_258, __p3_258) __extension__ ({ \
+  int32_t __s0_258 = __p0_258; \
+  int32_t __s1_258 = __p1_258; \
+  int32x2_t __s2_258 = __p2_258; \
+  int32_t __ret_258; \
+  __ret_258 = vqsubs_s32(__s0_258, vqrdmulhs_s32(__s1_258, vget_lane_s32(__s2_258, __p3_258))); \
+  __ret_258; \
+})
+#else
+#define vqrdmlshs_lane_s32(__p0_259, __p1_259, __p2_259, __p3_259) __extension__ ({ \
+  int32_t __s0_259 = __p0_259; \
+  int32_t __s1_259 = __p1_259; \
+  int32x2_t __s2_259 = __p2_259; \
+  int32x2_t __rev2_259;  __rev2_259 = __builtin_shufflevector(__s2_259, __s2_259, 1, 0); \
+  int32_t __ret_259; \
+  __ret_259 = __noswap_vqsubs_s32(__s0_259, __noswap_vqrdmulhs_s32(__s1_259, __noswap_vget_lane_s32(__rev2_259, __p3_259))); \
+  __ret_259; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshh_lane_s16(__p0_260, __p1_260, __p2_260, __p3_260) __extension__ ({ \
+  int16_t __s0_260 = __p0_260; \
+  int16_t __s1_260 = __p1_260; \
+  int16x4_t __s2_260 = __p2_260; \
+  int16_t __ret_260; \
+  __ret_260 = vqsubh_s16(__s0_260, vqrdmulhh_s16(__s1_260, vget_lane_s16(__s2_260, __p3_260))); \
+  __ret_260; \
+})
+#else
+#define vqrdmlshh_lane_s16(__p0_261, __p1_261, __p2_261, __p3_261) __extension__ ({ \
+  int16_t __s0_261 = __p0_261; \
+  int16_t __s1_261 = __p1_261; \
+  int16x4_t __s2_261 = __p2_261; \
+  int16x4_t __rev2_261;  __rev2_261 = __builtin_shufflevector(__s2_261, __s2_261, 3, 2, 1, 0); \
+  int16_t __ret_261; \
+  __ret_261 = __noswap_vqsubh_s16(__s0_261, __noswap_vqrdmulhh_s16(__s1_261, __noswap_vget_lane_s16(__rev2_261, __p3_261))); \
+  __ret_261; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshs_laneq_s32(__p0_262, __p1_262, __p2_262, __p3_262) __extension__ ({ \
+  int32_t __s0_262 = __p0_262; \
+  int32_t __s1_262 = __p1_262; \
+  int32x4_t __s2_262 = __p2_262; \
+  int32_t __ret_262; \
+  __ret_262 = vqsubs_s32(__s0_262, vqrdmulhs_s32(__s1_262, vgetq_lane_s32(__s2_262, __p3_262))); \
+  __ret_262; \
+})
+#else
+#define vqrdmlshs_laneq_s32(__p0_263, __p1_263, __p2_263, __p3_263) __extension__ ({ \
+  int32_t __s0_263 = __p0_263; \
+  int32_t __s1_263 = __p1_263; \
+  int32x4_t __s2_263 = __p2_263; \
+  int32x4_t __rev2_263;  __rev2_263 = __builtin_shufflevector(__s2_263, __s2_263, 3, 2, 1, 0); \
+  int32_t __ret_263; \
+  __ret_263 = __noswap_vqsubs_s32(__s0_263, __noswap_vqrdmulhs_s32(__s1_263, __noswap_vgetq_lane_s32(__rev2_263, __p3_263))); \
+  __ret_263; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vqrdmlshh_laneq_s16(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \
+  int16_t __s0_264 = __p0_264; \
+  int16_t __s1_264 = __p1_264; \
+  int16x8_t __s2_264 = __p2_264; \
+  int16_t __ret_264; \
+  __ret_264 = vqsubh_s16(__s0_264, vqrdmulhh_s16(__s1_264, vgetq_lane_s16(__s2_264, __p3_264))); \
+  __ret_264; \
+})
+#else
+#define vqrdmlshh_laneq_s16(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \
+  int16_t __s0_265 = __p0_265; \
+  int16_t __s1_265 = __p1_265; \
+  int16x8_t __s2_265 = __p2_265; \
+  int16x8_t __rev2_265;  __rev2_265 = __builtin_shufflevector(__s2_265, __s2_265, 7, 6, 5, 4, 3, 2, 1, 0); \
+  int16_t __ret_265; \
+  __ret_265 = __noswap_vqsubh_s16(__s0_265, __noswap_vqrdmulhh_s16(__s1_265, __noswap_vgetq_lane_s16(__rev2_265, __p3_265))); \
+  __ret_265; \
+})
+#endif
+
+#endif
+#if defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabdl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = vabdl_u8(vget_high_u8(__p0), vget_high_u8(__p1));
+  return __ret;
+}
+#else
+__ai uint16x8_t vabdl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vabdl_u8(__noswap_vget_high_u8(__rev0), __noswap_vget_high_u8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vabdl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = vabdl_u32(vget_high_u32(__p0), vget_high_u32(__p1));
+  return __ret;
+}
+#else
+__ai uint64x2_t vabdl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vabdl_u32(__noswap_vget_high_u32(__rev0), __noswap_vget_high_u32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabdl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = vabdl_u16(vget_high_u16(__p0), vget_high_u16(__p1));
+  return __ret;
+}
+#else
+__ai uint32x4_t vabdl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vabdl_u16(__noswap_vget_high_u16(__rev0), __noswap_vget_high_u16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabdl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = vabdl_s8(vget_high_s8(__p0), vget_high_s8(__p1));
+  return __ret;
+}
+#else
+__ai int16x8_t vabdl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vabdl_s8(__noswap_vget_high_s8(__rev0), __noswap_vget_high_s8(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabdl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vabdl_s32(vget_high_s32(__p0), vget_high_s32(__p1));
+  return __ret;
+}
+#else
+__ai int64x2_t vabdl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vabdl_s32(__noswap_vget_high_s32(__rev0), __noswap_vget_high_s32(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabdl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vabdl_s16(vget_high_s16(__p0), vget_high_s16(__p1));
+  return __ret;
+}
+#else
+__ai int32x4_t vabdl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vabdl_s16(__noswap_vget_high_s16(__rev0), __noswap_vget_high_s16(__rev1));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = vmovl_high_u8(__p0) + vmovl_high_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddl_high_u8(uint8x16_t __p0, uint8x16_t __p1) {
+  uint8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmovl_high_u8(__rev0) + __noswap_vmovl_high_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = vmovl_high_u32(__p0) + vmovl_high_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddl_high_u32(uint32x4_t __p0, uint32x4_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmovl_high_u32(__rev0) + __noswap_vmovl_high_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = vmovl_high_u16(__p0) + vmovl_high_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddl_high_u16(uint16x8_t __p0, uint16x8_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmovl_high_u16(__rev0) + __noswap_vmovl_high_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = vmovl_high_s8(__p0) + vmovl_high_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vaddl_high_s8(int8x16_t __p0, int8x16_t __p1) {
+  int8x16_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmovl_high_s8(__rev0) + __noswap_vmovl_high_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = vmovl_high_s32(__p0) + vmovl_high_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vaddl_high_s32(int32x4_t __p0, int32x4_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmovl_high_s32(__rev0) + __noswap_vmovl_high_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = vmovl_high_s16(__p0) + vmovl_high_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vaddl_high_s16(int16x8_t __p0, int16x8_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmovl_high_s16(__rev0) + __noswap_vmovl_high_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vaddw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __ret;
+  __ret = __p0 + vmovl_high_u8(__p1);
+  return __ret;
+}
+#else
+__ai uint16x8_t vaddw_high_u8(uint16x8_t __p0, uint8x16_t __p1) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_u8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vaddw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __ret;
+  __ret = __p0 + vmovl_high_u32(__p1);
+  return __ret;
+}
+#else
+__ai uint64x2_t vaddw_high_u32(uint64x2_t __p0, uint32x4_t __p1) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_u32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vaddw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __ret;
+  __ret = __p0 + vmovl_high_u16(__p1);
+  return __ret;
+}
+#else
+__ai uint32x4_t vaddw_high_u16(uint32x4_t __p0, uint16x8_t __p1) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_u16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vaddw_high_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __ret;
+  __ret = __p0 + vmovl_high_s8(__p1);
+  return __ret;
+}
+#else
+__ai int16x8_t vaddw_high_s8(int16x8_t __p0, int8x16_t __p1) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_s8(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vaddw_high_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __ret;
+  __ret = __p0 + vmovl_high_s32(__p1);
+  return __ret;
+}
+#else
+__ai int64x2_t vaddw_high_s32(int64x2_t __p0, int32x4_t __p1) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_s32(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vaddw_high_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __ret;
+  __ret = __p0 + vmovl_high_s16(__p1);
+  return __ret;
+}
+#else
+__ai int32x4_t vaddw_high_s16(int32x4_t __p0, int16x8_t __p1) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vmovl_high_s16(__rev1);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_p64(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \
+  poly64x2_t __s0_266 = __p0_266; \
+  poly64x1_t __s2_266 = __p2_266; \
+  poly64x2_t __ret_266; \
+  __ret_266 = vsetq_lane_p64(vget_lane_p64(__s2_266, __p3_266), __s0_266, __p1_266); \
+  __ret_266; \
+})
+#else
+#define vcopyq_lane_p64(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \
+  poly64x2_t __s0_267 = __p0_267; \
+  poly64x1_t __s2_267 = __p2_267; \
+  poly64x2_t __rev0_267;  __rev0_267 = __builtin_shufflevector(__s0_267, __s0_267, 1, 0); \
+  poly64x2_t __ret_267; \
+  __ret_267 = __noswap_vsetq_lane_p64(__noswap_vget_lane_p64(__s2_267, __p3_267), __rev0_267, __p1_267); \
+  __ret_267 = __builtin_shufflevector(__ret_267, __ret_267, 1, 0); \
+  __ret_267; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_lane_f64(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \
+  float64x2_t __s0_268 = __p0_268; \
+  float64x1_t __s2_268 = __p2_268; \
+  float64x2_t __ret_268; \
+  __ret_268 = vsetq_lane_f64(vget_lane_f64(__s2_268, __p3_268), __s0_268, __p1_268); \
+  __ret_268; \
+})
+#else
+#define vcopyq_lane_f64(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \
+  float64x2_t __s0_269 = __p0_269; \
+  float64x1_t __s2_269 = __p2_269; \
+  float64x2_t __rev0_269;  __rev0_269 = __builtin_shufflevector(__s0_269, __s0_269, 1, 0); \
+  float64x2_t __ret_269; \
+  __ret_269 = __noswap_vsetq_lane_f64(__noswap_vget_lane_f64(__s2_269, __p3_269), __rev0_269, __p1_269); \
+  __ret_269 = __builtin_shufflevector(__ret_269, __ret_269, 1, 0); \
+  __ret_269; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_p64(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \
+  poly64x1_t __s0_270 = __p0_270; \
+  poly64x1_t __s2_270 = __p2_270; \
+  poly64x1_t __ret_270; \
+  __ret_270 = vset_lane_p64(vget_lane_p64(__s2_270, __p3_270), __s0_270, __p1_270); \
+  __ret_270; \
+})
+#else
+#define vcopy_lane_p64(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \
+  poly64x1_t __s0_271 = __p0_271; \
+  poly64x1_t __s2_271 = __p2_271; \
+  poly64x1_t __ret_271; \
+  __ret_271 = __noswap_vset_lane_p64(__noswap_vget_lane_p64(__s2_271, __p3_271), __s0_271, __p1_271); \
+  __ret_271; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_lane_f64(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \
+  float64x1_t __s0_272 = __p0_272; \
+  float64x1_t __s2_272 = __p2_272; \
+  float64x1_t __ret_272; \
+  __ret_272 = vset_lane_f64(vget_lane_f64(__s2_272, __p3_272), __s0_272, __p1_272); \
+  __ret_272; \
+})
+#else
+#define vcopy_lane_f64(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \
+  float64x1_t __s0_273 = __p0_273; \
+  float64x1_t __s2_273 = __p2_273; \
+  float64x1_t __ret_273; \
+  __ret_273 = __noswap_vset_lane_f64(__noswap_vget_lane_f64(__s2_273, __p3_273), __s0_273, __p1_273); \
+  __ret_273; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_p64(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \
+  poly64x2_t __s0_274 = __p0_274; \
+  poly64x2_t __s2_274 = __p2_274; \
+  poly64x2_t __ret_274; \
+  __ret_274 = vsetq_lane_p64(vgetq_lane_p64(__s2_274, __p3_274), __s0_274, __p1_274); \
+  __ret_274; \
+})
+#else
+#define vcopyq_laneq_p64(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \
+  poly64x2_t __s0_275 = __p0_275; \
+  poly64x2_t __s2_275 = __p2_275; \
+  poly64x2_t __rev0_275;  __rev0_275 = __builtin_shufflevector(__s0_275, __s0_275, 1, 0); \
+  poly64x2_t __rev2_275;  __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 1, 0); \
+  poly64x2_t __ret_275; \
+  __ret_275 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_275, __p3_275), __rev0_275, __p1_275); \
+  __ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 1, 0); \
+  __ret_275; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopyq_laneq_f64(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \
+  float64x2_t __s0_276 = __p0_276; \
+  float64x2_t __s2_276 = __p2_276; \
+  float64x2_t __ret_276; \
+  __ret_276 = vsetq_lane_f64(vgetq_lane_f64(__s2_276, __p3_276), __s0_276, __p1_276); \
+  __ret_276; \
+})
+#else
+#define vcopyq_laneq_f64(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \
+  float64x2_t __s0_277 = __p0_277; \
+  float64x2_t __s2_277 = __p2_277; \
+  float64x2_t __rev0_277;  __rev0_277 = __builtin_shufflevector(__s0_277, __s0_277, 1, 0); \
+  float64x2_t __rev2_277;  __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 1, 0); \
+  float64x2_t __ret_277; \
+  __ret_277 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_277, __p3_277), __rev0_277, __p1_277); \
+  __ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 1, 0); \
+  __ret_277; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_p64(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \
+  poly64x1_t __s0_278 = __p0_278; \
+  poly64x2_t __s2_278 = __p2_278; \
+  poly64x1_t __ret_278; \
+  __ret_278 = vset_lane_p64(vgetq_lane_p64(__s2_278, __p3_278), __s0_278, __p1_278); \
+  __ret_278; \
+})
+#else
+#define vcopy_laneq_p64(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \
+  poly64x1_t __s0_279 = __p0_279; \
+  poly64x2_t __s2_279 = __p2_279; \
+  poly64x2_t __rev2_279;  __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 1, 0); \
+  poly64x1_t __ret_279; \
+  __ret_279 = __noswap_vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_279, __p3_279), __s0_279, __p1_279); \
+  __ret_279; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vcopy_laneq_f64(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \
+  float64x1_t __s0_280 = __p0_280; \
+  float64x2_t __s2_280 = __p2_280; \
+  float64x1_t __ret_280; \
+  __ret_280 = vset_lane_f64(vgetq_lane_f64(__s2_280, __p3_280), __s0_280, __p1_280); \
+  __ret_280; \
+})
+#else
+#define vcopy_laneq_f64(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \
+  float64x1_t __s0_281 = __p0_281; \
+  float64x2_t __s2_281 = __p2_281; \
+  float64x2_t __rev2_281;  __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 1, 0); \
+  float64x1_t __ret_281; \
+  __ret_281 = __noswap_vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_281, __p3_281), __s0_281, __p1_281); \
+  __ret_281; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __ret;
+  __ret = vmlal_u8(__p0, vget_high_u8(__p1), vget_high_u8(__p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmlal_u8(__rev0, __noswap_vget_high_u8(__rev1), __noswap_vget_high_u8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __ret;
+  __ret = vmlal_u32(__p0, vget_high_u32(__p1), vget_high_u32(__p2));
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmlal_u32(__rev0, __noswap_vget_high_u32(__rev1), __noswap_vget_high_u32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __ret;
+  __ret = vmlal_u16(__p0, vget_high_u16(__p1), vget_high_u16(__p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmlal_u16(__rev0, __noswap_vget_high_u16(__rev1), __noswap_vget_high_u16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __ret;
+  __ret = vmlal_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vmlal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmlal_s8(__rev0, __noswap_vget_high_s8(__rev1), __noswap_vget_high_s8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vmlal_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vmlal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmlal_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vmlal_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vmlal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmlal_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlal_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = vmlal_n_u32(__p0, vget_high_u32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlal_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmlal_n_u32(__rev0, __noswap_vget_high_u32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlal_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = vmlal_n_u16(__p0, vget_high_u16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlal_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmlal_n_u16(__rev0, __noswap_vget_high_u16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = vmlal_n_s32(__p0, vget_high_s32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vmlal_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmlal_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = vmlal_n_s16(__p0, vget_high_s16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vmlal_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmlal_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vmlsl_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __ret;
+  __ret = vmlsl_u8(__p0, vget_high_u8(__p1), vget_high_u8(__p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vmlsl_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vmlsl_u8(__rev0, __noswap_vget_high_u8(__rev1), __noswap_vget_high_u8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlsl_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __ret;
+  __ret = vmlsl_u32(__p0, vget_high_u32(__p1), vget_high_u32(__p2));
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlsl_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmlsl_u32(__rev0, __noswap_vget_high_u32(__rev1), __noswap_vget_high_u32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsl_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __ret;
+  __ret = vmlsl_u16(__p0, vget_high_u16(__p1), vget_high_u16(__p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsl_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmlsl_u16(__rev0, __noswap_vget_high_u16(__rev1), __noswap_vget_high_u16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vmlsl_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __ret;
+  __ret = vmlsl_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vmlsl_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vmlsl_s8(__rev0, __noswap_vget_high_s8(__rev1), __noswap_vget_high_s8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vmlsl_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vmlsl_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmlsl_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vmlsl_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsl_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmlsl_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vmlsl_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint64x2_t __ret;
+  __ret = vmlsl_n_u32(__p0, vget_high_u32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vmlsl_high_n_u32(uint64x2_t __p0, uint32x4_t __p1, uint32_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vmlsl_n_u32(__rev0, __noswap_vget_high_u32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vmlsl_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint32x4_t __ret;
+  __ret = vmlsl_n_u16(__p0, vget_high_u16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vmlsl_high_n_u16(uint32x4_t __p0, uint16x8_t __p1, uint16_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vmlsl_n_u16(__rev0, __noswap_vget_high_u16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __ret;
+  __ret = vmlsl_n_s32(__p0, vget_high_s32(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vmlsl_high_n_s32(int64x2_t __p0, int32x4_t __p1, int32_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vmlsl_n_s32(__rev0, __noswap_vget_high_s32(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __ret;
+  __ret = vmlsl_n_s16(__p0, vget_high_s16(__p1), __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vmlsl_n_s16(__rev0, __noswap_vget_high_s16(__rev1), __p2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_lane_f64(__p0_282, __p1_282, __p2_282) __extension__ ({ \
+  float64x1_t __s0_282 = __p0_282; \
+  float64x1_t __s1_282 = __p1_282; \
+  float64x1_t __ret_282; \
+  float64_t __x_282 = vget_lane_f64(__s0_282, 0); \
+  float64_t __y_282 = vget_lane_f64(__s1_282, __p2_282); \
+  float64_t __z_282 = vmulxd_f64(__x_282, __y_282); \
+  __ret_282 = vset_lane_f64(__z_282, __s0_282, __p2_282); \
+  __ret_282; \
+})
+#else
+#define vmulx_lane_f64(__p0_283, __p1_283, __p2_283) __extension__ ({ \
+  float64x1_t __s0_283 = __p0_283; \
+  float64x1_t __s1_283 = __p1_283; \
+  float64x1_t __ret_283; \
+  float64_t __x_283 = __noswap_vget_lane_f64(__s0_283, 0); \
+  float64_t __y_283 = __noswap_vget_lane_f64(__s1_283, __p2_283); \
+  float64_t __z_283 = __noswap_vmulxd_f64(__x_283, __y_283); \
+  __ret_283 = __noswap_vset_lane_f64(__z_283, __s0_283, __p2_283); \
+  __ret_283; \
+})
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define vmulx_laneq_f64(__p0_284, __p1_284, __p2_284) __extension__ ({ \
+  float64x1_t __s0_284 = __p0_284; \
+  float64x2_t __s1_284 = __p1_284; \
+  float64x1_t __ret_284; \
+  float64_t __x_284 = vget_lane_f64(__s0_284, 0); \
+  float64_t __y_284 = vgetq_lane_f64(__s1_284, __p2_284); \
+  float64_t __z_284 = vmulxd_f64(__x_284, __y_284); \
+  __ret_284 = vset_lane_f64(__z_284, __s0_284, 0); \
+  __ret_284; \
+})
+#else
+#define vmulx_laneq_f64(__p0_285, __p1_285, __p2_285) __extension__ ({ \
+  float64x1_t __s0_285 = __p0_285; \
+  float64x2_t __s1_285 = __p1_285; \
+  float64x2_t __rev1_285;  __rev1_285 = __builtin_shufflevector(__s1_285, __s1_285, 1, 0); \
+  float64x1_t __ret_285; \
+  float64_t __x_285 = __noswap_vget_lane_f64(__s0_285, 0); \
+  float64_t __y_285 = __noswap_vgetq_lane_f64(__rev1_285, __p2_285); \
+  float64_t __z_285 = __noswap_vmulxd_f64(__x_285, __y_285); \
+  __ret_285 = __noswap_vset_lane_f64(__z_285, __s0_285, 0); \
+  __ret_285; \
+})
+#endif
+
+#endif
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + vabdl_u8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint16x8_t vabal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __rev0 + __noswap_vabdl_u8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint16x8_t __noswap_vabal_u8(uint16x8_t __p0, uint8x8_t __p1, uint8x8_t __p2) {
+  uint16x8_t __ret;
+  __ret = __p0 + __noswap_vabdl_u8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vabal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + vabdl_u32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint64x2_t vabal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  uint32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __rev0 + __noswap_vabdl_u32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai uint64x2_t __noswap_vabal_u32(uint64x2_t __p0, uint32x2_t __p1, uint32x2_t __p2) {
+  uint64x2_t __ret;
+  __ret = __p0 + __noswap_vabdl_u32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + vabdl_u16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai uint32x4_t vabal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __rev0 + __noswap_vabdl_u16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai uint32x4_t __noswap_vabal_u16(uint32x4_t __p0, uint16x4_t __p1, uint16x4_t __p2) {
+  uint32x4_t __ret;
+  __ret = __p0 + __noswap_vabdl_u16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + vabdl_s8(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int16x8_t vabal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __rev0 + __noswap_vabdl_s8(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int16x8_t __noswap_vabal_s8(int16x8_t __p0, int8x8_t __p1, int8x8_t __p2) {
+  int16x8_t __ret;
+  __ret = __p0 + __noswap_vabdl_s8(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + vabdl_s32(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int64x2_t vabal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x2_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0);
+  int32x2_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0);
+  int64x2_t __ret;
+  __ret = __rev0 + __noswap_vabdl_s32(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+__ai int64x2_t __noswap_vabal_s32(int64x2_t __p0, int32x2_t __p1, int32x2_t __p2) {
+  int64x2_t __ret;
+  __ret = __p0 + __noswap_vabdl_s32(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + vabdl_s16(__p1, __p2);
+  return __ret;
+}
+#else
+__ai int32x4_t vabal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int16x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __rev0 + __noswap_vabdl_s16(__rev1, __rev2);
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+__ai int32x4_t __noswap_vabal_s16(int32x4_t __p0, int16x4_t __p1, int16x4_t __p2) {
+  int32x4_t __ret;
+  __ret = __p0 + __noswap_vabdl_s16(__p1, __p2);
+  return __ret;
+}
+#endif
+
+#if defined(__aarch64__)
+#ifdef __LITTLE_ENDIAN__
+__ai uint16x8_t vabal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __ret;
+  __ret = vabal_u8(__p0, vget_high_u8(__p1), vget_high_u8(__p2));
+  return __ret;
+}
+#else
+__ai uint16x8_t vabal_high_u8(uint16x8_t __p0, uint8x16_t __p1, uint8x16_t __p2) {
+  uint16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __ret;
+  __ret = __noswap_vabal_u8(__rev0, __noswap_vget_high_u8(__rev1), __noswap_vget_high_u8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint64x2_t vabal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __ret;
+  __ret = vabal_u32(__p0, vget_high_u32(__p1), vget_high_u32(__p2));
+  return __ret;
+}
+#else
+__ai uint64x2_t vabal_high_u32(uint64x2_t __p0, uint32x4_t __p1, uint32x4_t __p2) {
+  uint64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  uint32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  uint32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  uint64x2_t __ret;
+  __ret = __noswap_vabal_u32(__rev0, __noswap_vget_high_u32(__rev1), __noswap_vget_high_u32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai uint32x4_t vabal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __ret;
+  __ret = vabal_u16(__p0, vget_high_u16(__p1), vget_high_u16(__p2));
+  return __ret;
+}
+#else
+__ai uint32x4_t vabal_high_u16(uint32x4_t __p0, uint16x8_t __p1, uint16x8_t __p2) {
+  uint32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  uint16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  uint32x4_t __ret;
+  __ret = __noswap_vabal_u16(__rev0, __noswap_vget_high_u16(__rev1), __noswap_vget_high_u16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int16x8_t vabal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __ret;
+  __ret = vabal_s8(__p0, vget_high_s8(__p1), vget_high_s8(__p2));
+  return __ret;
+}
+#else
+__ai int16x8_t vabal_high_s8(int16x8_t __p0, int8x16_t __p1, int8x16_t __p2) {
+  int16x8_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int8x16_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __ret;
+  __ret = __noswap_vabal_s8(__rev0, __noswap_vget_high_s8(__rev1), __noswap_vget_high_s8(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int64x2_t vabal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __ret;
+  __ret = vabal_s32(__p0, vget_high_s32(__p1), vget_high_s32(__p2));
+  return __ret;
+}
+#else
+__ai int64x2_t vabal_high_s32(int64x2_t __p0, int32x4_t __p1, int32x4_t __p2) {
+  int64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  int32x4_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0);
+  int32x4_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0);
+  int64x2_t __ret;
+  __ret = __noswap_vabal_s32(__rev0, __noswap_vget_high_s32(__rev1), __noswap_vget_high_s32(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+__ai int32x4_t vabal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __ret;
+  __ret = vabal_s16(__p0, vget_high_s16(__p1), vget_high_s16(__p2));
+  return __ret;
+}
+#else
+__ai int32x4_t vabal_high_s16(int32x4_t __p0, int16x8_t __p1, int16x8_t __p2) {
+  int32x4_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
+  int16x8_t __rev1;  __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0);
+  int16x8_t __rev2;  __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0);
+  int32x4_t __ret;
+  __ret = __noswap_vabal_s16(__rev0, __noswap_vget_high_s16(__rev1), __noswap_vget_high_s16(__rev2));
+  __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
+  return __ret;
+}
+#endif
+
+#endif
+
+#undef __ai
+
+#endif /* __ARM_NEON_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/armintr.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/armintr.h
new file mode 100644
index 000000000..933afcbb9
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/armintr.h
@@ -0,0 +1,45 @@
+/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <armintr.h>
+#else
+
+#ifndef __ARMINTR_H
+#define __ARMINTR_H
+
+typedef enum
+{
+  _ARM_BARRIER_SY    = 0xF,
+  _ARM_BARRIER_ST    = 0xE,
+  _ARM_BARRIER_ISH   = 0xB,
+  _ARM_BARRIER_ISHST = 0xA,
+  _ARM_BARRIER_NSH   = 0x7,
+  _ARM_BARRIER_NSHST = 0x6,
+  _ARM_BARRIER_OSH   = 0x3,
+  _ARM_BARRIER_OSHST = 0x2
+} _ARMINTR_BARRIER_TYPE;
+
+#endif /* __ARMINTR_H */
+#endif /* _MSC_VER */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx2intrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx2intrin.h
new file mode 100644
index 000000000..13bcbef4d
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx2intrin.h
@@ -0,0 +1,1299 @@
+/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX2INTRIN_H
+#define __AVX2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx2")))
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm256_mpsadbw_epu8(X, Y, M) \
+  (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
+                                     (__v32qi)(__m256i)(Y), (int)(M))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi8(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi16(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi32(__m256i __a)
+{
+    return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packs_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packus_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_packus_epi32(__m256i __V1, __m256i __V2)
+{
+  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qu)__a + (__v32qu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hu)__a + (__v16hu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8su)__a + (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_add_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a + (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_adds_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+#define _mm256_alignr_epi8(a, b, n) __extension__ ({        \
+  (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+                                     (__v32qi)(__m256i)(b), (n)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_and_si256(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a & (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_andnot_si256(__m256i __a, __m256i __b)
+{
+  return (__m256i)(~(__v4du)__a & (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_avg_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_avg_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
+                                              (__v32qi)__M);
+}
+
+#define _mm256_blend_epi16(V1, V2, M) __extension__ ({       \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1),   \
+                                   (__v16hi)(__m256i)(V2),   \
+                                   (((M) & 0x01) ? 16 : 0),  \
+                                   (((M) & 0x02) ? 17 : 1),  \
+                                   (((M) & 0x04) ? 18 : 2),  \
+                                   (((M) & 0x08) ? 19 : 3),  \
+                                   (((M) & 0x10) ? 20 : 4),  \
+                                   (((M) & 0x20) ? 21 : 5),  \
+                                   (((M) & 0x40) ? 22 : 6),  \
+                                   (((M) & 0x80) ? 23 : 7),  \
+                                   (((M) & 0x01) ? 24 : 8),  \
+                                   (((M) & 0x02) ? 25 : 9),  \
+                                   (((M) & 0x04) ? 26 : 10), \
+                                   (((M) & 0x08) ? 27 : 11), \
+                                   (((M) & 0x10) ? 28 : 12), \
+                                   (((M) & 0x20) ? 29 : 13), \
+                                   (((M) & 0x40) ? 30 : 14), \
+                                   (((M) & 0x80) ? 31 : 15)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qi)__a == (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a == (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a == (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4di)__a == (__v4di)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
+{
+  /* This function always performs a signed comparison, but __v32qi is a char
+     which may be signed or unsigned, so use __v32qs. */
+  return (__m256i)((__v32qs)__a > (__v32qs)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hi)__a > (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8si)__a > (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4di)__a > (__v4di)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadd_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadd_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hadds_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsub_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsub_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_hsubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maddubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm256_movemask_epi8(__m256i __a)
+{
+  return __builtin_ia32_pmovmskb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi16(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi32(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi8_epi64(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi16(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu8_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu16_epi32(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu16_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_epi64(__m128i __V)
+{
+  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
+}
+
+static __inline__  __m256i __DEFAULT_FN_ATTRS
+_mm256_mul_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhi_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mulhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hu)__a * (__v16hu)__b);
+}
+
+static __inline__  __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi32 (__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8su)__a * (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mul_epu32(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_or_si256(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a | (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sad_epu8(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shuffle_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
+}
+
+#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
+                                   (__v8si)_mm256_undefined_si256(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
+                                   4 + (((imm) >> 0) & 0x3), \
+                                   4 + (((imm) >> 2) & 0x3), \
+                                   4 + (((imm) >> 4) & 0x3), \
+                                   4 + (((imm) >> 6) & 0x3)); })
+
+#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_undefined_si256(), \
+                                   0, 1, 2, 3, \
+                                   4  + (((imm) >> 0) & 0x3), \
+                                   4  + (((imm) >> 2) & 0x3), \
+                                   4  + (((imm) >> 4) & 0x3), \
+                                   4  + (((imm) >> 6) & 0x3), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) >> 0) & 0x3), \
+                                   12 + (((imm) >> 2) & 0x3), \
+                                   12 + (((imm) >> 4) & 0x3), \
+                                   12 + (((imm) >> 6) & 0x3)); })
+
+#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
+                                   (__v16hi)_mm256_undefined_si256(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) >> 0) & 0x3), \
+                                   8 + (((imm) >> 2) & 0x3), \
+                                   8 + (((imm) >> 4) & 0x3), \
+                                   8 + (((imm) >> 6) & 0x3), \
+                                   12, 13, 14, 15); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi8(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sign_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_slli_si256(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector(                                          \
+        (__v32qi)_mm256_setzero_si256(),                                     \
+        (__v32qi)(__m256i)(a),                                               \
+        ((char)(imm)&0xF0) ?  0 : ((char)(imm)>0x0 ? 16 : 32) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  1 : ((char)(imm)>0x1 ? 17 : 33) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  2 : ((char)(imm)>0x2 ? 18 : 34) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  3 : ((char)(imm)>0x3 ? 19 : 35) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  4 : ((char)(imm)>0x4 ? 20 : 36) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  5 : ((char)(imm)>0x5 ? 21 : 37) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  6 : ((char)(imm)>0x6 ? 22 : 38) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  7 : ((char)(imm)>0x7 ? 23 : 39) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  8 : ((char)(imm)>0x8 ? 24 : 40) - (char)(imm), \
+        ((char)(imm)&0xF0) ?  9 : ((char)(imm)>0x9 ? 25 : 41) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 : 42) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 : 43) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 : 44) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 : 45) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 : 46) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 : 47) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 : 48) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 : 49) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 : 50) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 : 51) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 : 52) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 : 53) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 : 54) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 : 55) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 : 56) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 : 57) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 : 58) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 : 59) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 : 60) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 : 61) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 : 62) - (char)(imm), \
+        ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 : 63) - (char)(imm)); })
+
+#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_slli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psllqi256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sll_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
+}
+
+#define _mm256_srli_si256(a, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector(                                           \
+        (__v32qi)(__m256i)(a),                                               \
+        (__v32qi)_mm256_setzero_si256(),                                     \
+        ((char)(imm)&0xF0) ? 32 : (char)(imm) + ((char)(imm)>0xF ? 16 : 0),  \
+        ((char)(imm)&0xF0) ? 33 : (char)(imm) + ((char)(imm)>0xE ? 17 : 1),  \
+        ((char)(imm)&0xF0) ? 34 : (char)(imm) + ((char)(imm)>0xD ? 18 : 2),  \
+        ((char)(imm)&0xF0) ? 35 : (char)(imm) + ((char)(imm)>0xC ? 19 : 3),  \
+        ((char)(imm)&0xF0) ? 36 : (char)(imm) + ((char)(imm)>0xB ? 20 : 4),  \
+        ((char)(imm)&0xF0) ? 37 : (char)(imm) + ((char)(imm)>0xA ? 21 : 5),  \
+        ((char)(imm)&0xF0) ? 38 : (char)(imm) + ((char)(imm)>0x9 ? 22 : 6),  \
+        ((char)(imm)&0xF0) ? 39 : (char)(imm) + ((char)(imm)>0x8 ? 23 : 7),  \
+        ((char)(imm)&0xF0) ? 40 : (char)(imm) + ((char)(imm)>0x7 ? 24 : 8),  \
+        ((char)(imm)&0xF0) ? 41 : (char)(imm) + ((char)(imm)>0x6 ? 25 : 9),  \
+        ((char)(imm)&0xF0) ? 42 : (char)(imm) + ((char)(imm)>0x5 ? 26 : 10), \
+        ((char)(imm)&0xF0) ? 43 : (char)(imm) + ((char)(imm)>0x4 ? 27 : 11), \
+        ((char)(imm)&0xF0) ? 44 : (char)(imm) + ((char)(imm)>0x3 ? 28 : 12), \
+        ((char)(imm)&0xF0) ? 45 : (char)(imm) + ((char)(imm)>0x2 ? 29 : 13), \
+        ((char)(imm)&0xF0) ? 46 : (char)(imm) + ((char)(imm)>0x1 ? 30 : 14), \
+        ((char)(imm)&0xF0) ? 47 : (char)(imm) + ((char)(imm)>0x0 ? 31 : 15), \
+        ((char)(imm)&0xF0) ? 48 : (char)(imm) + ((char)(imm)>0xF ? 32 : 16), \
+        ((char)(imm)&0xF0) ? 49 : (char)(imm) + ((char)(imm)>0xE ? 33 : 17), \
+        ((char)(imm)&0xF0) ? 50 : (char)(imm) + ((char)(imm)>0xD ? 34 : 18), \
+        ((char)(imm)&0xF0) ? 51 : (char)(imm) + ((char)(imm)>0xC ? 35 : 19), \
+        ((char)(imm)&0xF0) ? 52 : (char)(imm) + ((char)(imm)>0xB ? 36 : 20), \
+        ((char)(imm)&0xF0) ? 53 : (char)(imm) + ((char)(imm)>0xA ? 37 : 21), \
+        ((char)(imm)&0xF0) ? 54 : (char)(imm) + ((char)(imm)>0x9 ? 38 : 22), \
+        ((char)(imm)&0xF0) ? 55 : (char)(imm) + ((char)(imm)>0x8 ? 39 : 23), \
+        ((char)(imm)&0xF0) ? 56 : (char)(imm) + ((char)(imm)>0x7 ? 40 : 24), \
+        ((char)(imm)&0xF0) ? 57 : (char)(imm) + ((char)(imm)>0x6 ? 41 : 25), \
+        ((char)(imm)&0xF0) ? 58 : (char)(imm) + ((char)(imm)>0x5 ? 42 : 26), \
+        ((char)(imm)&0xF0) ? 59 : (char)(imm) + ((char)(imm)>0x4 ? 43 : 27), \
+        ((char)(imm)&0xF0) ? 60 : (char)(imm) + ((char)(imm)>0x3 ? 44 : 28), \
+        ((char)(imm)&0xF0) ? 61 : (char)(imm) + ((char)(imm)>0x2 ? 45 : 29), \
+        ((char)(imm)&0xF0) ? 62 : (char)(imm) + ((char)(imm)>0x1 ? 46 : 30), \
+        ((char)(imm)&0xF0) ? 63 : (char)(imm) + ((char)(imm)>0x0 ? 47 : 31)); })
+
+#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi16(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi16(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi32(__m256i __a, int __count)
+{
+  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi32(__m256i __a, __m128i __count)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srli_epi64(__m256i __a, int __count)
+{
+  return __builtin_ia32_psrlqi256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srl_epi64(__m256i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v32qu)__a - (__v32qu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v16hu)__a - (__v16hu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v8su)__a - (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sub_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a - (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epu8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_subs_epu16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_xor_si256(__m256i __a, __m256i __b)
+{
+  return (__m256i)((__v4du)__a ^ (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_stream_load_si256(__m256i const *__V)
+{
+  return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_broadcastss_ps(__m128 __X)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_broadcastsd_pd(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcastss_ps(__m128 __X)
+{
+  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcastsd_pd(__m128d __X)
+{
+  return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastsi128_si256(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
+}
+
+#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1),  \
+                                   (__v4si)(__m128i)(V2),  \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1),   \
+                                   (__v8si)(__m256i)(V2),   \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastb_epi8(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastw_epi16(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastd_epi32(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastq_epi64(__m128i __X)
+{
+  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastb_epi8(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastw_epi16(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastd_epi32(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastq_epi64(__m128i __X)
+{
+  return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   ((M) >> 0) & 0x3, \
+                                   ((M) >> 2) & 0x3, \
+                                   ((M) >> 4) & 0x3, \
+                                   ((M) >> 6) & 0x3); })
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
+{
+  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   ((M) >> 0) & 0x3, \
+                                   ((M) >> 2) & 0x3, \
+                                   ((M) >> 4) & 0x3, \
+                                   ((M) >> 6) & 0x3); })
+
+#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })
+
+#define _mm256_extracti128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   (((M) & 1) ? 2 : 0), \
+                                   (((M) & 1) ? 3 : 1) ); })
+
+#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
+                                   (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+                                   (((M) & 1) ? 0 : 4), \
+                                   (((M) & 1) ? 1 : 5), \
+                                   (((M) & 1) ? 4 : 2), \
+                                   (((M) & 1) ? 5 : 3) ); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskload_epi32(int const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskload_epi64(long long const *__X, __m256i __M)
+{
+  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskload_epi32(int const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskload_epi64(long long const *__X, __m128i __M)
+{
+  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi32(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
+}
+
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
+
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
+
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)(__m128d)(mask), (s)); })
+
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)(__m256d)(mask), (s)); })
+
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)(__m256)(mask), (s)); })
+
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)(__m128)(mask), (s)); })
+
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
+                                       (int const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8si)(__m256i)(mask), (s)); })
+
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
+                                    (int const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
+                                       (int const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4si)(__m128i)(mask), (s)); })
+
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
+
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)(__m128i)(mask), (s)); })
+
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)(__m256i)(mask), (s)); })
+
+#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
+
+#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
+
+#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
+  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
+                                     (double const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                          _mm_setzero_pd()), \
+                                     (s)); })
+
+#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
+  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
+                                        (double const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                              _mm256_setzero_pd(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)); })
+
+#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
+
+#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v8si)(__m256i)(i), \
+                                       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
+                                                             _mm256_setzero_ps(), \
+                                                             _CMP_EQ_OQ), \
+                                       (s)); })
+
+#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
+                                    (float const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                         _mm_setzero_ps()), \
+                                    (s)); })
+
+#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
+  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
+                                       (float const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                            _mm_setzero_ps()), \
+                                       (s)); })
+
+#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v4si)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
+                                       (int const *)(m), (__v8si)(__m256i)(i), \
+                                       (__v8si)_mm256_set1_epi32(-1), (s)); })
+
+#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
+                                    (int const *)(m), (__v2di)(__m128i)(i), \
+                                    (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
+                                       (int const *)(m), (__v4di)(__m256i)(i), \
+                                       (__v4si)_mm_set1_epi32(-1), (s)); })
+
+#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v4si)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4si)(__m128i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
+  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
+                                    (long long const *)(m), \
+                                    (__v2di)(__m128i)(i), \
+                                    (__v2di)_mm_set1_epi64x(-1), (s)); })
+
+#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
+  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
+                                       (long long const *)(m), \
+                                       (__v4di)(__m256i)(i), \
+                                       (__v4di)_mm256_set1_epi64x(-1), (s)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX2INTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512bwintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512bwintrin.h
new file mode 100644
index 000000000..629dc8611
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512bwintrin.h
@@ -0,0 +1,2360 @@
+/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BWINTRIN_H
+#define __AVX512BWINTRIN_H
+
+typedef unsigned int __mmask32;
+typedef unsigned long long __mmask64;
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
+
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_setzero_qi(void) {
+  return (__m512i)(__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_setzero_hi(void) {
+  return (__m512i)(__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/* Integer compare */
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 (__mmask64)-1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
+  return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
+  return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v64qu) __A + (__v64qu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_add_epi8(__A, __B),
+                                             (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_add_epi8(__A, __B),
+                                             (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi8 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v64qu) __A - (__v64qu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_sub_epi8(__A, __B),
+                                             (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                             (__v64qi)_mm512_sub_epi8(__A, __B),
+                                             (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hu) __A + (__v32hu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_add_epi16(__A, __B),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_add_epi16(__A, __B),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hu) __A - (__v32hu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_sub_epi16(__A, __B),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_sub_epi16(__A, __B),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v32hu) __A * (__v32hu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_mullo_epi16(__A, __B),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_mullo_epi16(__A, __B),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+              (__v64qi) __W,
+              (__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+              (__v32hi) __W,
+              (__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi8 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi16 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v32hi) __W,
+              __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_packus_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_adds_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_avg_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shuffle_epi8(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                         (__v64qi)_mm512_shuffle_epi8(__A, __B),
+                                         (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                         (__v64qi)_mm512_shuffle_epi8(__A, __B),
+                                         (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) __W,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+              (__v64qi) __B,
+              (__v64qi) _mm512_setzero_qi(),
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_subs_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I,
+         __mmask32 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+              (__v32hi) __I /* idx */ ,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I
+              /* idx */ ,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhrs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) __W,
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                (__v32hi) __B,
+                (__v32hi) _mm512_setzero_hi(),
+                (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) __W,
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+              (__v32hi) __B,
+              (__v32hi) _mm512_setzero_hi(),
+              (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mulhi_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) __W,
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v32hi) _mm512_setzero_hi(),
+               (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X,
+         __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) {
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                 (__v64qi) __Y,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd_epi16 (__m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A,
+      __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+               (__v32hi) __B,
+               (__v16si) _mm512_setzero_si512(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)_mm256_setzero_si256(),
+               (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi)__O,
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+               (__v32qi) _mm256_setzero_si256(),
+               __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) __O,
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                (__v32qi) _mm256_setzero_si256(),
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi8 (__m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) __O,
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) {
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+              (__v32qi) _mm256_setzero_si256(),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
+                                          8,  64+8,   9, 64+9,
+                                          10, 64+10, 11, 64+11,
+                                          12, 64+12, 13, 64+13,
+                                          14, 64+14, 15, 64+15,
+                                          24, 64+24, 25, 64+25,
+                                          26, 64+26, 27, 64+27,
+                                          28, 64+28, 29, 64+29,
+                                          30, 64+30, 31, 64+31,
+                                          40, 64+40, 41, 64+41,
+                                          42, 64+42, 43, 64+43,
+                                          44, 64+44, 45, 64+45,
+                                          46, 64+46, 47, 64+47,
+                                          56, 64+56, 57, 64+57,
+                                          58, 64+58, 59, 64+59,
+                                          60, 64+60, 61, 64+61,
+                                          62, 64+62, 63, 64+63);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpackhi_epi8(__A, __B),
+                                        (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpackhi_epi8(__A, __B),
+                                        (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
+                                          4,  32+4,   5, 32+5,
+                                          6,  32+6,   7, 32+7,
+                                          12, 32+12, 13, 32+13,
+                                          14, 32+14, 15, 32+15,
+                                          20, 32+20, 21, 32+21,
+                                          22, 32+22, 23, 32+23,
+                                          28, 32+28, 29, 32+29,
+                                          30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpackhi_epi16(__A, __B),
+                                       (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpackhi_epi16(__A, __B),
+                                       (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
+                                          0,  64+0,   1, 64+1,
+                                          2,  64+2,   3, 64+3,
+                                          4,  64+4,   5, 64+5,
+                                          6,  64+6,   7, 64+7,
+                                          16, 64+16, 17, 64+17,
+                                          18, 64+18, 19, 64+19,
+                                          20, 64+20, 21, 64+21,
+                                          22, 64+22, 23, 64+23,
+                                          32, 64+32, 33, 64+33,
+                                          34, 64+34, 35, 64+35,
+                                          36, 64+36, 37, 64+37,
+                                          38, 64+38, 39, 64+39,
+                                          48, 64+48, 49, 64+49,
+                                          50, 64+50, 51, 64+51,
+                                          52, 64+52, 53, 64+53,
+                                          54, 64+54, 55, 64+55);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpacklo_epi8(__A, __B),
+                                        (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+                                        (__v64qi)_mm512_unpacklo_epi8(__A, __B),
+                                        (__v64qi)_mm512_setzero_qi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
+                                          0,  32+0,   1, 32+1,
+                                          2,  32+2,   3, 32+3,
+                                          8,  32+8,   9, 32+9,
+                                          10, 32+10, 11, 32+11,
+                                          16, 32+16, 17, 32+17,
+                                          18, 32+18, 19, 32+19,
+                                          24, 32+24, 25, 32+25,
+                                          26, 32+26, 27, 32+27);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpacklo_epi16(__A, __B),
+                                       (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                       (__v32hi)_mm512_unpacklo_epi16(__A, __B),
+                                       (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi8_epi16(__m256i __A)
+{
+  /* This function always performs a signed extension, but __v32qi is a char
+     which may be signed or unsigned, so use __v32qs. */
+  return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepi8_epi16(__A),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepi8_epi16(__A),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu8_epi16(__m256i __A)
+{
+  return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepu8_epi16(__A),
+                                             (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                             (__v32hi)_mm512_cvtepu8_epi16(__A),
+                                             (__v32hi)_mm512_setzero_hi());
+}
+
+
+#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), (int)(p), \
+                                         (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+                                         (__v64qi)(__m512i)(b), (int)(p), \
+                                         (__mmask64)(m)); })
+
+#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), (int)(p), \
+                                          (__mmask64)-1); })
+
+#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+                                          (__v64qi)(__m512i)(b), (int)(p), \
+                                          (__mmask64)(m)); })
+
+#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), (int)(p), \
+                                         (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+                                         (__v32hi)(__m512i)(b), (int)(p), \
+                                         (__mmask32)(m)); })
+
+#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), (int)(p), \
+                                          (__mmask32)-1); })
+
+#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+                                          (__v32hi)(__m512i)(b), (int)(p), \
+                                          (__mmask32)(m)); })
+
+#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
+                                   (__v32hi)_mm512_undefined_epi32(), \
+                                   0, 1, 2, 3, \
+                                   4  + (((imm) >> 0) & 0x3), \
+                                   4  + (((imm) >> 2) & 0x3), \
+                                   4  + (((imm) >> 4) & 0x3), \
+                                   4  + (((imm) >> 6) & 0x3), \
+                                   8, 9, 10, 11, \
+                                   12 + (((imm) >> 0) & 0x3), \
+                                   12 + (((imm) >> 2) & 0x3), \
+                                   12 + (((imm) >> 4) & 0x3), \
+                                   12 + (((imm) >> 6) & 0x3), \
+                                   16, 17, 18, 19, \
+                                   20 + (((imm) >> 0) & 0x3), \
+                                   20 + (((imm) >> 2) & 0x3), \
+                                   20 + (((imm) >> 4) & 0x3), \
+                                   20 + (((imm) >> 6) & 0x3), \
+                                   24, 25, 26, 27, \
+                                   28 + (((imm) >> 0) & 0x3), \
+                                   28 + (((imm) >> 2) & 0x3), \
+                                   28 + (((imm) >> 4) & 0x3), \
+                                   28 + (((imm) >> 6) & 0x3)); })
+
+#define _mm512_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflehi_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)(__m512i)(W)); })
+
+#define _mm512_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflehi_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)_mm512_setzero_hi()); })
+
+#define _mm512_shufflelo_epi16(A, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
+                                   (__v32hi)_mm512_undefined_epi32(), \
+                                   0 + (((imm) >> 0) & 0x3), \
+                                   0 + (((imm) >> 2) & 0x3), \
+                                   0 + (((imm) >> 4) & 0x3), \
+                                   0 + (((imm) >> 6) & 0x3), \
+                                   4, 5, 6, 7, \
+                                   8 + (((imm) >> 0) & 0x3), \
+                                   8 + (((imm) >> 2) & 0x3), \
+                                   8 + (((imm) >> 4) & 0x3), \
+                                   8 + (((imm) >> 6) & 0x3), \
+                                   12, 13, 14, 15, \
+                                   16 + (((imm) >> 0) & 0x3), \
+                                   16 + (((imm) >> 2) & 0x3), \
+                                   16 + (((imm) >> 4) & 0x3), \
+                                   16 + (((imm) >> 6) & 0x3), \
+                                   20, 21, 22, 23, \
+                                   24 + (((imm) >> 0) & 0x3), \
+                                   24 + (((imm) >> 2) & 0x3), \
+                                   24 + (((imm) >> 4) & 0x3), \
+                                   24 + (((imm) >> 6) & 0x3), \
+                                   28, 29, 30, 31); })
+
+
+#define _mm512_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)(__m512i)(W)); })
+
+
+#define _mm512_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                      (__v32hi)_mm512_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v32hi)_mm512_setzero_hi()); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi16(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_sllv_epi16(__A, __B),
+                                           (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_sllv_epi16(__A, __B),
+                                           (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi16(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sll_epi16(__A, __B),
+                                          (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sll_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi16(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_slli_epi16(__A, __B),
+                                         (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_slli_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_hi());
+}
+
+#define _mm512_bslli_epi128(a, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector(                                          \
+       (__v64qi)_mm512_setzero_si512(),                                      \
+       (__v64qi)(__m512i)(a),                                                \
+       ((char)(imm)&0xF0) ?  0 : ((char)(imm)>0x0 ? 16 :  64) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  1 : ((char)(imm)>0x1 ? 17 :  65) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  2 : ((char)(imm)>0x2 ? 18 :  66) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  3 : ((char)(imm)>0x3 ? 19 :  67) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  4 : ((char)(imm)>0x4 ? 20 :  68) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  5 : ((char)(imm)>0x5 ? 21 :  69) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  6 : ((char)(imm)>0x6 ? 22 :  70) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  7 : ((char)(imm)>0x7 ? 23 :  71) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  8 : ((char)(imm)>0x8 ? 24 :  72) - (char)(imm), \
+       ((char)(imm)&0xF0) ?  9 : ((char)(imm)>0x9 ? 25 :  73) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 :  74) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 :  75) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 :  76) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 :  77) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 :  78) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 :  79) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 :  80) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 :  81) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 :  82) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 :  83) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 :  84) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 :  85) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 :  86) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 :  87) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 :  88) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 :  89) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 :  90) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 :  91) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 :  92) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 :  93) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 :  94) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 :  95) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 32 : ((char)(imm)>0x0 ? 48 :  96) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 33 : ((char)(imm)>0x1 ? 49 :  97) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 34 : ((char)(imm)>0x2 ? 50 :  98) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 35 : ((char)(imm)>0x3 ? 51 :  99) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 36 : ((char)(imm)>0x4 ? 52 : 100) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 37 : ((char)(imm)>0x5 ? 53 : 101) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 38 : ((char)(imm)>0x6 ? 54 : 102) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 39 : ((char)(imm)>0x7 ? 55 : 103) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 40 : ((char)(imm)>0x8 ? 56 : 104) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 41 : ((char)(imm)>0x9 ? 57 : 105) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 42 : ((char)(imm)>0xA ? 58 : 106) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 43 : ((char)(imm)>0xB ? 59 : 107) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 44 : ((char)(imm)>0xC ? 60 : 108) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 45 : ((char)(imm)>0xD ? 61 : 109) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 46 : ((char)(imm)>0xE ? 62 : 110) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 47 : ((char)(imm)>0xF ? 63 : 111) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 48 : ((char)(imm)>0x0 ? 64 : 112) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 49 : ((char)(imm)>0x1 ? 65 : 113) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 50 : ((char)(imm)>0x2 ? 66 : 114) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 51 : ((char)(imm)>0x3 ? 67 : 115) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 52 : ((char)(imm)>0x4 ? 68 : 116) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 53 : ((char)(imm)>0x5 ? 69 : 117) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 54 : ((char)(imm)>0x6 ? 70 : 118) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 55 : ((char)(imm)>0x7 ? 71 : 119) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 56 : ((char)(imm)>0x8 ? 72 : 120) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 57 : ((char)(imm)>0x9 ? 73 : 121) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 58 : ((char)(imm)>0xA ? 74 : 122) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 59 : ((char)(imm)>0xB ? 75 : 123) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 60 : ((char)(imm)>0xC ? 76 : 124) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 61 : ((char)(imm)>0xD ? 77 : 125) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 62 : ((char)(imm)>0xE ? 78 : 126) - (char)(imm), \
+       ((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 79 : 127) - (char)(imm)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi16(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srlv_epi16(__A, __B),
+                                           (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srlv_epi16(__A, __B),
+                                           (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi16(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srav_epi16(__A, __B),
+                                           (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                           (__v32hi)_mm512_srav_epi16(__A, __B),
+                                           (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi16(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sra_epi16(__A, __B),
+                                          (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_sra_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi16(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srai_epi16(__A, __B),
+                                         (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srai_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi16(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_srl_epi16(__A, __B),
+                                          (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                          (__v32hi)_mm512_srl_epi16(__A, __B),
+                                          (__v32hi)_mm512_setzero_hi());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi16(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srli_epi16(__A, __B),
+                                         (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+                                         (__v32hi)_mm512_srli_epi16(__A, __B),
+                                         (__v32hi)_mm512_setzero_hi());
+}
+
+#define _mm512_bsrli_epi128(a, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector(                     \
+      (__v64qi)(__m512i)(a),                      \
+      (__v64qi)_mm512_setzero_si512(),            \
+      ((char)(imm)&0xF0) ?  64 : (char)(imm) + ((char)(imm)>0xF ?  48 : 0),  \
+      ((char)(imm)&0xF0) ?  65 : (char)(imm) + ((char)(imm)>0xE ?  49 : 1),  \
+      ((char)(imm)&0xF0) ?  66 : (char)(imm) + ((char)(imm)>0xD ?  50 : 2),  \
+      ((char)(imm)&0xF0) ?  67 : (char)(imm) + ((char)(imm)>0xC ?  51 : 3),  \
+      ((char)(imm)&0xF0) ?  68 : (char)(imm) + ((char)(imm)>0xB ?  52 : 4),  \
+      ((char)(imm)&0xF0) ?  69 : (char)(imm) + ((char)(imm)>0xA ?  53 : 5),  \
+      ((char)(imm)&0xF0) ?  70 : (char)(imm) + ((char)(imm)>0x9 ?  54 : 6),  \
+      ((char)(imm)&0xF0) ?  71 : (char)(imm) + ((char)(imm)>0x8 ?  55 : 7),  \
+      ((char)(imm)&0xF0) ?  72 : (char)(imm) + ((char)(imm)>0x7 ?  56 : 8),  \
+      ((char)(imm)&0xF0) ?  73 : (char)(imm) + ((char)(imm)>0x6 ?  57 : 9),  \
+      ((char)(imm)&0xF0) ?  74 : (char)(imm) + ((char)(imm)>0x5 ?  58 : 10), \
+      ((char)(imm)&0xF0) ?  75 : (char)(imm) + ((char)(imm)>0x4 ?  59 : 11), \
+      ((char)(imm)&0xF0) ?  76 : (char)(imm) + ((char)(imm)>0x3 ?  60 : 12), \
+      ((char)(imm)&0xF0) ?  77 : (char)(imm) + ((char)(imm)>0x2 ?  61 : 13), \
+      ((char)(imm)&0xF0) ?  78 : (char)(imm) + ((char)(imm)>0x1 ?  62 : 14), \
+      ((char)(imm)&0xF0) ?  79 : (char)(imm) + ((char)(imm)>0x0 ?  63 : 15), \
+      ((char)(imm)&0xF0) ?  80 : (char)(imm) + ((char)(imm)>0xF ?  64 : 16), \
+      ((char)(imm)&0xF0) ?  81 : (char)(imm) + ((char)(imm)>0xE ?  65 : 17), \
+      ((char)(imm)&0xF0) ?  82 : (char)(imm) + ((char)(imm)>0xD ?  66 : 18), \
+      ((char)(imm)&0xF0) ?  83 : (char)(imm) + ((char)(imm)>0xC ?  67 : 19), \
+      ((char)(imm)&0xF0) ?  84 : (char)(imm) + ((char)(imm)>0xB ?  68 : 20), \
+      ((char)(imm)&0xF0) ?  85 : (char)(imm) + ((char)(imm)>0xA ?  69 : 21), \
+      ((char)(imm)&0xF0) ?  86 : (char)(imm) + ((char)(imm)>0x9 ?  70 : 22), \
+      ((char)(imm)&0xF0) ?  87 : (char)(imm) + ((char)(imm)>0x8 ?  71 : 23), \
+      ((char)(imm)&0xF0) ?  88 : (char)(imm) + ((char)(imm)>0x7 ?  72 : 24), \
+      ((char)(imm)&0xF0) ?  89 : (char)(imm) + ((char)(imm)>0x6 ?  73 : 25), \
+      ((char)(imm)&0xF0) ?  90 : (char)(imm) + ((char)(imm)>0x5 ?  74 : 26), \
+      ((char)(imm)&0xF0) ?  91 : (char)(imm) + ((char)(imm)>0x4 ?  75 : 27), \
+      ((char)(imm)&0xF0) ?  92 : (char)(imm) + ((char)(imm)>0x3 ?  76 : 28), \
+      ((char)(imm)&0xF0) ?  93 : (char)(imm) + ((char)(imm)>0x2 ?  77 : 29), \
+      ((char)(imm)&0xF0) ?  94 : (char)(imm) + ((char)(imm)>0x1 ?  78 : 30), \
+      ((char)(imm)&0xF0) ?  95 : (char)(imm) + ((char)(imm)>0x0 ?  79 : 31), \
+      ((char)(imm)&0xF0) ?  96 : (char)(imm) + ((char)(imm)>0xF ?  80 : 32), \
+      ((char)(imm)&0xF0) ?  97 : (char)(imm) + ((char)(imm)>0xE ?  81 : 33), \
+      ((char)(imm)&0xF0) ?  98 : (char)(imm) + ((char)(imm)>0xD ?  82 : 34), \
+      ((char)(imm)&0xF0) ?  99 : (char)(imm) + ((char)(imm)>0xC ?  83 : 35), \
+      ((char)(imm)&0xF0) ? 100 : (char)(imm) + ((char)(imm)>0xB ?  84 : 36), \
+      ((char)(imm)&0xF0) ? 101 : (char)(imm) + ((char)(imm)>0xA ?  85 : 37), \
+      ((char)(imm)&0xF0) ? 102 : (char)(imm) + ((char)(imm)>0x9 ?  86 : 38), \
+      ((char)(imm)&0xF0) ? 103 : (char)(imm) + ((char)(imm)>0x8 ?  87 : 39), \
+      ((char)(imm)&0xF0) ? 104 : (char)(imm) + ((char)(imm)>0x7 ?  88 : 40), \
+      ((char)(imm)&0xF0) ? 105 : (char)(imm) + ((char)(imm)>0x6 ?  89 : 41), \
+      ((char)(imm)&0xF0) ? 106 : (char)(imm) + ((char)(imm)>0x5 ?  90 : 42), \
+      ((char)(imm)&0xF0) ? 107 : (char)(imm) + ((char)(imm)>0x4 ?  91 : 43), \
+      ((char)(imm)&0xF0) ? 108 : (char)(imm) + ((char)(imm)>0x3 ?  92 : 44), \
+      ((char)(imm)&0xF0) ? 109 : (char)(imm) + ((char)(imm)>0x2 ?  93 : 45), \
+      ((char)(imm)&0xF0) ? 110 : (char)(imm) + ((char)(imm)>0x1 ?  94 : 46), \
+      ((char)(imm)&0xF0) ? 111 : (char)(imm) + ((char)(imm)>0x0 ?  95 : 47), \
+      ((char)(imm)&0xF0) ? 112 : (char)(imm) + ((char)(imm)>0xF ?  96 : 48), \
+      ((char)(imm)&0xF0) ? 113 : (char)(imm) + ((char)(imm)>0xE ?  97 : 49), \
+      ((char)(imm)&0xF0) ? 114 : (char)(imm) + ((char)(imm)>0xD ?  98 : 50), \
+      ((char)(imm)&0xF0) ? 115 : (char)(imm) + ((char)(imm)>0xC ?  99 : 51), \
+      ((char)(imm)&0xF0) ? 116 : (char)(imm) + ((char)(imm)>0xB ? 100 : 52), \
+      ((char)(imm)&0xF0) ? 117 : (char)(imm) + ((char)(imm)>0xA ? 101 : 53), \
+      ((char)(imm)&0xF0) ? 118 : (char)(imm) + ((char)(imm)>0x9 ? 102 : 54), \
+      ((char)(imm)&0xF0) ? 119 : (char)(imm) + ((char)(imm)>0x8 ? 103 : 55), \
+      ((char)(imm)&0xF0) ? 120 : (char)(imm) + ((char)(imm)>0x7 ? 104 : 56), \
+      ((char)(imm)&0xF0) ? 121 : (char)(imm) + ((char)(imm)>0x6 ? 105 : 57), \
+      ((char)(imm)&0xF0) ? 122 : (char)(imm) + ((char)(imm)>0x5 ? 106 : 58), \
+      ((char)(imm)&0xF0) ? 123 : (char)(imm) + ((char)(imm)>0x4 ? 107 : 59), \
+      ((char)(imm)&0xF0) ? 124 : (char)(imm) + ((char)(imm)>0x3 ? 108 : 60), \
+      ((char)(imm)&0xF0) ? 125 : (char)(imm) + ((char)(imm)>0x2 ? 109 : 61), \
+      ((char)(imm)&0xF0) ? 126 : (char)(imm) + ((char)(imm)>0x1 ? 110 : 62), \
+      ((char)(imm)&0xF0) ? 127 : (char)(imm) + ((char)(imm)>0x0 ? 111 : 63)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+                (__v32hi) __A,
+                (__v32hi) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+                (__v32hi) __A,
+                (__v32hi) _mm512_setzero_hi ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+                (__v64qi) __A,
+                (__v64qi) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+                (__v64qi) __A,
+                (__v64qi) _mm512_setzero_hi ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+                 (__v64qi) __O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+                 (__v64qi)
+                 _mm512_setzero_qi(),
+                 __M);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
+                (__mmask64) __B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
+                (__mmask32) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P,
+                 (__v32hi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P,
+                 (__v32hi)
+                 _mm512_setzero_hi (),
+                 (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P,
+                 (__v64qi) __W,
+                 (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P,
+                 (__v64qi)
+                 _mm512_setzero_hi (),
+                 (__mmask64) __U);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
+{
+  __builtin_ia32_storedquhi512_mask ((__v32hi *) __P,
+             (__v32hi) __A,
+             (__mmask32) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
+{
+  __builtin_ia32_storedquqi512_mask ((__v64qi *) __P,
+             (__v64qi) __A,
+             (__mmask64) __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_test_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+            (__v64qi) __B,
+            (__mmask64) -1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+            (__v64qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_test_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+            (__v32hi) __B,
+            (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+            (__v32hi) __B, __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+             (__v64qi) __B,
+             (__mmask64) -1);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+             (__v64qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+             (__v32hi) __B,
+             (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+             (__v32hi) __B, __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_movepi8_mask (__m512i __A)
+{
+  return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_movepi16_mask (__m512i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi8 (__mmask64 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi16 (__mmask32 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastb_epi8 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v16qi) __A,
+                                          (__v16qi)_mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectb_512(__M,
+                                             (__v64qi) _mm512_broadcastb_epi8(__A),
+                                             (__v64qi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectb_512(__M,
+                                             (__v64qi) _mm512_broadcastb_epi8(__A),
+                                             (__v64qi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                 (__v32hi) __O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                 (__v32hi) _mm512_setzero_hi(),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastw_epi16 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v8hi) __A,
+                                          (__v8hi)_mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512(__M,
+                                             (__v32hi) _mm512_broadcastw_epi16(__A),
+                                             (__v32hi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectw_512(__M,
+                                             (__v32hi) _mm512_broadcastw_epi16(__A),
+                                             (__v32hi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) _mm512_undefined_epi32 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) _mm512_setzero_hi(),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                 (__v32hi) __A,
+                 (__v32hi) __W,
+                 (__mmask32) __M);
+}
+
+#define _mm512_alignr_epi8(A, B, N) __extension__ ({\
+  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
+                                          (__v64qi)(__m512i)(B), (int)(N), \
+                                          (__v64qi)_mm512_undefined_pd(), \
+                                          (__mmask64)-1); })
+
+#define _mm512_mask_alignr_epi8(W, U, A, B, N) __extension__({\
+  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
+                                          (__v64qi)(__m512i)(B), (int)(N), \
+                                          (__v64qi)(__m512i)(W), \
+                                          (__mmask64)(U)); })
+
+#define _mm512_maskz_alignr_epi8(U, A, B, N) __extension__({\
+  (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
+                                          (__v64qi)(__m512i)(B), (int)(N), \
+                                          (__v64qi)_mm512_setzero_si512(), \
+                                          (__mmask64)(U)); })
+
+#define _mm512_dbsad_epu8(A, B, imm) __extension__ ({\
+  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
+                                           (__v64qi)(__m512i)(B), (int)(imm), \
+                                           (__v32hi)_mm512_undefined_epi32(), \
+                                           (__mmask32)-1); })
+
+#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) ({\
+  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
+                                           (__v64qi)(__m512i)(B), (int)(imm), \
+                                           (__v32hi)(__m512i)(W), \
+                                           (__mmask32)(U)); })
+
+#define _mm512_maskz_dbsad_epu8(U, A, B, imm) ({\
+  (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
+                                           (__v64qi)(__m512i)(B), (int)(imm), \
+                                           (__v32hi)_mm512_setzero_hi(), \
+                                           (__mmask32)(U)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sad_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
+               (__v64qi) __B);
+}
+
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512cdintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512cdintrin.h
new file mode 100644
index 000000000..23c423584
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512cdintrin.h
@@ -0,0 +1,144 @@
+/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512CDINTRIN_H
+#define __AVX512CDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+               (__v8di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmb512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmw512 (__A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512dqintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512dqintrin.h
new file mode 100644
index 000000000..ae44b98a9
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512dqintrin.h
@@ -0,0 +1,1332 @@
+/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512DQINTRIN_H
+#define __AVX512DQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
+  return (__m512i) ((__v8du) __A * (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_mullo_epi64(__A, __B),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_mullo_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_xor_pd(__m512d __A, __m512d __B) {
+  return (__m512d)((__v8du)__A ^ (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_xor_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_xor_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_xor_ps (__m512 __A, __m512 __B) {
+  return (__m512)((__v16su)__A ^ (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_xor_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_xor_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_or_pd(__m512d __A, __m512d __B) {
+  return (__m512d)((__v8du)__A | (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_or_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_or_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_or_ps(__m512 __A, __m512 __B) {
+  return (__m512)((__v16su)__A | (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_or_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_or_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_and_pd(__m512d __A, __m512d __B) {
+  return (__m512d)((__v8du)__A & (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_and_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_and_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_and_ps(__m512 __A, __m512 __B) {
+  return (__m512)((__v16su)__A & (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_and_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_and_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_andnot_pd(__m512d __A, __m512d __B) {
+  return (__m512d)(~(__v8du)__A & (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_andnot_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_andnot_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_andnot_ps(__m512 __A, __m512 __B) {
+  return (__m512)(~(__v16su)__A & (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_andnot_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_andnot_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epi64(A, R) __extension__ ({              \
+  (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8di)(__m512i)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epu64(A, R) __extension__ ({               \
+  (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) __extension__ ({     \
+  (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                (__v8di) _mm512_setzero_si512(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epi64(A, R) __extension__ ({             \
+  (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+                                           (__v8di)(__m512i)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epi64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+                                           (__v8di)_mm512_setzero_si512(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epu64(A, R) __extension__ ({              \
+  (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epu64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                (__v8df) _mm512_setzero_pd(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_pd(A, R) __extension__ ({          \
+  (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) __W,
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+               (__v8sf) _mm256_setzero_ps(),
+               (__mmask8) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_ps(A, R) __extension__ ({        \
+  (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+                                          (__v8sf)(__m256)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi64(A, R) __extension__ ({             \
+  (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epu64 (__m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epu64(A, R) __extension__ ({              \
+  (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8di)(__m512i)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) __extension__ ({   \
+  (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                 (__v8di) _mm512_setzero_si512(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epi64(A, R) __extension__ ({            \
+  (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)(__m512i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) __extension__ ({  \
+  (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+                                            (__v8di)_mm512_setzero_si512(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epu64 (__m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                  (__v8di) _mm512_setzero_si512(),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epu64(A, R) __extension__ ({            \
+  (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+                                             (__v8di)(__m512i)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) __extension__ ({  \
+  (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+                                             (__v8di)_mm512_setzero_si512(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_pd (__m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                 (__v8df) _mm512_setzero_pd(),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_pd(A, R) __extension__ ({          \
+  (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+                                            (__v8df)(__m512d)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtepu64_ps (__m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                (__v8sf) _mm256_setzero_ps(),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_ps(A, R) __extension__ ({         \
+  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
+                                           (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_range_pd(A, B, C) __extension__ ({                     \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_range_pd(W, U, A, B, C) __extension__ ({      \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_range_pd(U, A, B, C) __extension__ ({           \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_range_round_pd(A, B, C, R) __extension__ ({           \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_range_round_pd(W, U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm512_maskz_range_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(C), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+#define _mm512_range_ps(A, B, C) __extension__ ({                       \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)-1, \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_range_ps(W, U, A, B, C) __extension__ ({         \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_range_ps(U, A, B, C) __extension__ ({      \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_range_round_ps(A, B, C, R) __extension__ ({         \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_range_round_ps(W, U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_range_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(C), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(U), (int)(R)); })
+
+#define _mm_range_round_ss(A, B, C, R) __extension__ ({           \
+  (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8) -1, (int)(C),\
+                                               (int)(R)); })
+
+#define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_range_round_ss(W, U, A, B, C, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)(__m128)(W),\
+                                               (__mmask8)(U), (int)(C),\
+                                               (int)(R)); })
+
+#define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_range_round_ss(U, A, B, C, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)(U), (int)(C),\
+                                               (int)(R)); })
+
+#define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm_range_round_sd(A, B, C, R) __extension__ ({           \
+  (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8) -1, (int)(C),\
+                                                (int)(R)); })
+
+#define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_range_round_sd(W, U, A, B, C, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)(__m128d)(W),\
+                                                (__mmask8)(U), (int)(C),\
+                                                (int)(R)); })
+
+#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_range_round_sd(U, A, B, C, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U), (int)(C),\
+                                                (int)(R)); })
+
+#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_reduce_pd(A, B) __extension__ ({             \
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)-1, \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_reduce_pd(U, A, B) __extension__ ({  \
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_reduce_ps(A, B) __extension__ ({              \
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_reduce_ps(W, U, A, B) __extension__ ({   \
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_reduce_ps(U, A, B) __extension__ ({       \
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_reduce_round_pd(A, B, R) __extension__ ({\
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_reduce_round_pd(W, U, A, B, R) __extension__ ({\
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_reduce_round_pd(U, A, B, R) __extension__ ({\
+  (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_reduce_round_ps(A, B, R) __extension__ ({\
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_reduce_round_ps(W, U, A, B, R) __extension__ ({\
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_reduce_round_ps(U, A, B, R) __extension__ ({\
+  (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm_reduce_ss(A, B, C) __extension__ ({              \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_reduce_ss(W, U, A, B, C) __extension__ ({   \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                       (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_reduce_ss(U, A, B, C) __extension__ ({       \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)(U), (int)(C), \
+                                       _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_reduce_round_ss(A, B, C, R) __extension__ ({              \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
+                                       (int)(C), (int)(R)); })
+
+#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) __extension__ ({   \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                       (int)(C), (int)(R)); })
+
+#define _mm_maskz_reduce_round_ss(U, A, B, C, R) __extension__ ({       \
+  (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+                                       (__v4sf)(__m128)(B), \
+                                       (__v4sf)_mm_setzero_ps(), \
+                                       (__mmask8)(U), (int)(C), (int)(R)); })
+
+#define _mm_reduce_sd(A, B, C) __extension__ ({              \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)-1, (int)(C), \
+                                        _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_reduce_sd(W, U, A, B, C) __extension__ ({   \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                        (int)(C), _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_reduce_sd(U, A, B, C) __extension__ ({       \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)(U), (int)(C), \
+                                        _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_reduce_round_sd(A, B, C, R) __extension__ ({              \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)-1, (int)(C), (int)(R)); })
+
+#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) __extension__ ({   \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)(__m128d)(W), (__mmask8)(U), \
+                                        (int)(C), (int)(R)); })
+
+#define _mm_maskz_reduce_round_sd(U, A, B, C, R) __extension__ ({       \
+  (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+                                        (__v2df)(__m128d)(B), \
+                                        (__v2df)_mm_setzero_pd(), \
+                                        (__mmask8)(U), (int)(C), (int)(R)); })
+                     
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_movepi32_mask (__m512i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_movm_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_movepi64_mask (__m512i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
+}
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x2 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)_mm512_undefined_ps(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)
+                __O, __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                (__v16sf)_mm512_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x8 (__m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                _mm512_undefined_ps(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)__O,
+                __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                (__v16sf)_mm512_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcast_f64x2 (__m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)_mm512_undefined_pd(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)
+                 __O, __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                 (__v8df)_mm512_setzero_ps (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x2 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)_mm512_setzero_si512(),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)
+                 __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                 (__v16si)_mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x8 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)_mm512_setzero_si512(),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)__O,
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i64x2 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)_mm512_setzero_si512(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)
+                 __O, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                 (__v8di)_mm512_setzero_si512 (),
+                 __M);
+}
+
+#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v16sf)(__m512)(A),           \
+                                  (__v16sf)_mm512_undefined_ps(), \
+                                  ((imm) & 1) ?  8 : 0,           \
+                                  ((imm) & 1) ?  9 : 1,           \
+                                  ((imm) & 1) ? 10 : 2,           \
+                                  ((imm) & 1) ? 11 : 3,           \
+                                  ((imm) & 1) ? 12 : 4,           \
+                                  ((imm) & 1) ? 13 : 5,           \
+                                  ((imm) & 1) ? 14 : 6,           \
+                                  ((imm) & 1) ? 15 : 7); })
+
+#define _mm512_mask_extractf32x8_ps(W, U, A, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                   (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
+                                   (__v8sf)(W)); })
+
+#define _mm512_maskz_extractf32x8_ps(U, A, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                   (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
+                                   (__v8sf)_mm256_setzero_ps()); })
+
+#define _mm512_extractf64x2_pd(A, imm) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v8df)(__m512d)(A),          \
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   0 + ((imm) & 0x3) * 2,         \
+                                   1 + ((imm) & 0x3) * 2); })
+
+#define _mm512_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                   (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
+                                   (__v2df)(W)); })
+
+#define _mm512_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                   (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
+                                   (__v2df)_mm_setzero_pd()); })
+
+#define _mm512_extracti32x8_epi32(A, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v16si)(__m512i)(A),             \
+                                   (__v16si)_mm512_undefined_epi32(), \
+                                   ((imm) & 1) ?  8 : 0,              \
+                                   ((imm) & 1) ?  9 : 1,              \
+                                   ((imm) & 1) ? 10 : 2,              \
+                                   ((imm) & 1) ? 11 : 3,              \
+                                   ((imm) & 1) ? 12 : 4,              \
+                                   ((imm) & 1) ? 13 : 5,              \
+                                   ((imm) & 1) ? 14 : 6,              \
+                                   ((imm) & 1) ? 15 : 7); })
+
+#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
+                                (__v8si)(W)); })
+
+#define _mm512_maskz_extracti32x8_epi32(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
+                                (__v8si)_mm256_setzero_si256()); })
+
+#define _mm512_extracti64x2_epi64(A, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8di)(__m512i)(A),          \
+                                   (__v8di)_mm512_undefined_epi32(), \
+                                   0 + ((imm) & 0x3) * 2,           \
+                                   1 + ((imm) & 0x3) * 2); })
+
+#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
+                                (__v2di)(W)); })
+
+#define _mm512_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
+                                (__v2di)_mm_setzero_di()); })
+
+#define _mm512_insertf32x8(A, B, imm) __extension__ ({ \
+  (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+                                  (__v16sf)_mm512_castps256_ps512((__m256)(B)),\
+                                  ((imm) & 0x1) ?  0 : 16, \
+                                  ((imm) & 0x1) ?  1 : 17, \
+                                  ((imm) & 0x1) ?  2 : 18, \
+                                  ((imm) & 0x1) ?  3 : 19, \
+                                  ((imm) & 0x1) ?  4 : 20, \
+                                  ((imm) & 0x1) ?  5 : 21, \
+                                  ((imm) & 0x1) ?  6 : 22, \
+                                  ((imm) & 0x1) ?  7 : 23, \
+                                  ((imm) & 0x1) ? 16 :  8, \
+                                  ((imm) & 0x1) ? 17 :  9, \
+                                  ((imm) & 0x1) ? 18 : 10, \
+                                  ((imm) & 0x1) ? 19 : 11, \
+                                  ((imm) & 0x1) ? 20 : 12, \
+                                  ((imm) & 0x1) ? 21 : 13, \
+                                  ((imm) & 0x1) ? 22 : 14, \
+                                  ((imm) & 0x1) ? 23 : 15); })
+
+#define _mm512_mask_insertf32x8(W, U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+                                 (__v16sf)(W)); })
+
+#define _mm512_maskz_insertf32x8(U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+                                 (__v16sf)_mm512_setzero_ps()); })
+
+#define _mm512_insertf64x2(A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+                                  (__v8df)_mm512_castpd128_pd512((__m128d)(B)),\
+                                  (((imm) & 0x3) == 0) ? 8 : 0, \
+                                  (((imm) & 0x3) == 0) ? 9 : 1, \
+                                  (((imm) & 0x3) == 1) ? 8 : 2, \
+                                  (((imm) & 0x3) == 1) ? 9 : 3, \
+                                  (((imm) & 0x3) == 2) ? 8 : 4, \
+                                  (((imm) & 0x3) == 2) ? 9 : 5, \
+                                  (((imm) & 0x3) == 3) ? 8 : 6, \
+                                  (((imm) & 0x3) == 3) ? 9 : 7); })
+
+#define _mm512_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+                                  (__v8df)(W)); })
+
+#define _mm512_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+                                  (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_inserti32x8(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+                                 (__v16si)_mm512_castsi256_si512((__m256i)(B)),\
+                                 ((imm) & 0x1) ?  0 : 16, \
+                                 ((imm) & 0x1) ?  1 : 17, \
+                                 ((imm) & 0x1) ?  2 : 18, \
+                                 ((imm) & 0x1) ?  3 : 19, \
+                                 ((imm) & 0x1) ?  4 : 20, \
+                                 ((imm) & 0x1) ?  5 : 21, \
+                                 ((imm) & 0x1) ?  6 : 22, \
+                                 ((imm) & 0x1) ?  7 : 23, \
+                                 ((imm) & 0x1) ? 16 :  8, \
+                                 ((imm) & 0x1) ? 17 :  9, \
+                                 ((imm) & 0x1) ? 18 : 10, \
+                                 ((imm) & 0x1) ? 19 : 11, \
+                                 ((imm) & 0x1) ? 20 : 12, \
+                                 ((imm) & 0x1) ? 21 : 13, \
+                                 ((imm) & 0x1) ? 22 : 14, \
+                                 ((imm) & 0x1) ? 23 : 15); })
+
+#define _mm512_mask_inserti32x8(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+                                 (__v16si)(W)); })
+
+#define _mm512_maskz_inserti32x8(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+                                 (__v16si)_mm512_setzero_si512()); })
+
+#define _mm512_inserti64x2(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+                                  (__v8di)_mm512_castsi128_si512((__m128i)(B)),\
+                                  (((imm) & 0x3) == 0) ? 8 : 0, \
+                                  (((imm) & 0x3) == 0) ? 9 : 1, \
+                                  (((imm) & 0x3) == 1) ? 8 : 2, \
+                                  (((imm) & 0x3) == 1) ? 9 : 3, \
+                                  (((imm) & 0x3) == 2) ? 8 : 4, \
+                                  (((imm) & 0x3) == 2) ? 9 : 5, \
+                                  (((imm) & 0x3) == 3) ? 8 : 6, \
+                                  (((imm) & 0x3) == 3) ? 9 : 7); })
+
+#define _mm512_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+                                  (__v8di)(W)); })
+
+#define _mm512_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+                                  (__v8di)_mm512_setzero_si512()); })
+
+#define _mm512_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+  (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
+                                              (int)(imm), (__mmask16)(U)); })
+
+#define _mm512_fpclass_ps_mask(A, imm) __extension__ ({ \
+  (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
+                                              (int)(imm), (__mmask16)-1); })
+
+#define _mm512_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm512_fpclass_pd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm_fpclass_sd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_fpclass_sd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                          (__mmask8)(U)); })
+
+#define _mm_fpclass_ss_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_fpclass_ss_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                          (__mmask8)(U)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512erintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512erintrin.h
new file mode 100644
index 000000000..8ff212c42
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512erintrin.h
@@ -0,0 +1,285 @@
+/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512ERINTRIN_H
+#define __AVX512ERINTRIN_H
+
+// exp2a23
+#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                      (int)(R)); })
+
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm512_exp2a23_pd(A) \
+  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(S, M, A) \
+  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(M, A) \
+  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                     (int)(R)); })
+
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)(M), (int)(R)); })
+
+#define _mm512_exp2a23_ps(A) \
+  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(S, M, A) \
+  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(M, A) \
+  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+// rsqrt28
+#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(M), (int)(R)); })
+
+#define _mm512_rsqrt28_pd(A) \
+  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(S, M, A) \
+  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(M, A) \
+  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(M), (int)(R)); })
+
+#define _mm512_rsqrt28_ps(A) \
+  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(S, M, A) \
+  _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(M, A) \
+  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)(__m128)(S), \
+                                              (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)(M), (int)(R)); })
+
+#define _mm_rsqrt28_ss(A, B) \
+  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(S, M, A, B) \
+  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(M, A, B) \
+  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)(__m128d)(S), \
+                                               (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)(M), (int)(R)); })
+
+#define _mm_rsqrt28_sd(A, B) \
+  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(S, M, A, B) \
+  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(M, A, B) \
+  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+// rcp28
+#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)(M), (int)(R)); })
+
+#define _mm512_rcp28_pd(A) \
+  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(S, M, A) \
+  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(M, A) \
+  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                      (int)(R)); })
+
+#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)(M), (int)(R)); })
+
+#define _mm512_rcp28_ps(A) \
+  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(S, M, A) \
+  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(M, A) \
+  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)(__m128)(S), \
+                                            (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)(M), (int)(R)); })
+
+#define _mm_rcp28_ss(A, B) \
+  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(S, M, A, B) \
+  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(M, A, B) \
+  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)(__m128d)(S), \
+                                             (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)(M), (int)(R)); })
+
+#define _mm_rcp28_sd(A, B) \
+  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_sd(S, M, A, B) \
+  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(M, A, B) \
+  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#endif // __AVX512ERINTRIN_H
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512fintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512fintrin.h
new file mode 100644
index 000000000..e6a7217c8
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512fintrin.h
@@ -0,0 +1,10244 @@
+/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FINTRIN_H
+#define __AVX512FINTRIN_H
+
+typedef char __v64qi __attribute__((__vector_size__(64)));
+typedef short __v32hi __attribute__((__vector_size__(64)));
+typedef double __v8df __attribute__((__vector_size__(64)));
+typedef float __v16sf __attribute__((__vector_size__(64)));
+typedef long long __v8di __attribute__((__vector_size__(64)));
+typedef int __v16si __attribute__((__vector_size__(64)));
+
+/* Unsigned types */
+typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
+typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
+typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
+typedef unsigned int __v16su __attribute__((__vector_size__(64)));
+
+typedef float __m512 __attribute__((__vector_size__(64)));
+typedef double __m512d __attribute__((__vector_size__(64)));
+typedef long long __m512i __attribute__((__vector_size__(64)));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+/* Rounding mode macros.  */
+#define _MM_FROUND_TO_NEAREST_INT   0x00
+#define _MM_FROUND_TO_NEG_INF       0x01
+#define _MM_FROUND_TO_POS_INF       0x02
+#define _MM_FROUND_TO_ZERO          0x03
+#define _MM_FROUND_CUR_DIRECTION    0x04
+
+/* Constants for integer comparison predicates */
+typedef enum {
+    _MM_CMPINT_EQ,      /* Equal */
+    _MM_CMPINT_LT,      /* Less than */
+    _MM_CMPINT_LE,      /* Less than or Equal */
+    _MM_CMPINT_UNUSED,
+    _MM_CMPINT_NE,      /* Not Equal */
+    _MM_CMPINT_NLT,     /* Not Less than */
+#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
+    _MM_CMPINT_NLE      /* Not Less than or Equal */
+#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
+} _MM_CMPINT_ENUM;
+
+typedef enum
+{
+  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
+  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
+  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
+  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
+  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
+  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
+  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
+  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
+  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
+  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
+  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
+  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
+  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
+  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
+  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
+  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
+  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
+  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
+  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
+  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
+  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
+  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
+  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
+  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
+  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
+  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
+  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
+  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
+  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
+  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
+  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
+  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
+  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
+  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
+  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
+  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
+  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
+  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
+  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
+  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
+  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
+  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
+  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
+  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
+  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
+  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
+  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
+  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
+  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
+  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
+  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
+  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
+  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
+  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
+  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
+  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
+  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
+  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
+  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
+  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
+  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
+  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
+  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
+  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
+  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
+  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
+  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
+  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
+  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
+  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
+  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
+  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
+  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
+  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
+  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
+  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
+  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
+  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
+  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
+  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
+  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
+  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
+  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
+  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
+  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
+  _MM_PERM_DDDD = 0xFF
+} _MM_PERM_ENUM;
+
+typedef enum
+{
+  _MM_MANT_NORM_1_2,    /* interval [1, 2)      */
+  _MM_MANT_NORM_p5_2,   /* interval [0.5, 2)    */
+  _MM_MANT_NORM_p5_1,   /* interval [0.5, 1)    */
+  _MM_MANT_NORM_p75_1p5   /* interval [0.75, 1.5) */
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum
+{
+  _MM_MANT_SIGN_src,    /* sign = sign(SRC)     */
+  _MM_MANT_SIGN_zero,   /* sign = 0             */
+  _MM_MANT_SIGN_nan   /* DEST = NaN if sign(SRC) = 1 */
+} _MM_MANTISSA_SIGN_ENUM;
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+
+/* Create vectors with repeated elements */
+
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_setzero_si512(void)
+{
+  return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+#define _mm512_setzero_epi32 _mm512_setzero_si512
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_undefined_pd(void)
+{
+  return (__m512d)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined(void)
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_undefined_ps(void)
+{
+  return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_undefined_epi32(void)
+{
+  return (__m512i)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastd_epi32 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v4si) __A,
+                                          (__v4si)_mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__M,
+                                             (__v16si) _mm512_broadcastd_epi32(__A),
+                                             (__v16si) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__M,
+                                             (__v16si) _mm512_broadcastd_epi32(__A),
+                                             (__v16si) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastq_epi64 (__m128i __A)
+{
+  return (__m512i)__builtin_shufflevector((__v2di) __A,
+                                          (__v2di) _mm_undefined_si128(),
+                                          0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                             (__v8di) _mm512_broadcastq_epi64(__A),
+                                             (__v8di) __O);
+
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                             (__v8di) _mm512_broadcastq_epi64(__A),
+                                             (__v8di) _mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
+{
+#ifdef __x86_64__
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#else
+  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
+                 (__v8di)
+                 _mm512_setzero_si512 (),
+                 __M);
+#endif
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_setzero_ps(void)
+{
+  return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+#define _mm512_setzero _mm512_setzero_ps
+
+static  __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_setzero_pd(void)
+{
+  return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_set1_ps(float __w)
+{
+  return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
+                   __w, __w, __w, __w, __w, __w, __w, __w  };
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_set1_pd(double __w)
+{
+  return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi8(char __w)
+{
+  return (__m512i)(__v64qi){ __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w  };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi16(short __w)
+{
+  return (__m512i)(__v32hi){ __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w,
+                             __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi32(int __s)
+{
+  return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s,
+                             __s, __s, __s, __s, __s, __s, __s, __s };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set1_epi64(long long __d)
+{
+  return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcastss_ps(__m128 __A)
+{
+  return (__m512)__builtin_shufflevector((__v4sf) __A,
+                                         (__v4sf)_mm_undefined_ps(),
+                                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
+{
+  return  (__m512i)(__v16si)
+   { __D, __C, __B, __A, __D, __C, __B, __A,
+     __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set4_epi64 (long long __A, long long __B, long long __C,
+       long long __D)
+{
+  return  (__m512i) (__v8di)
+   { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_set4_pd (double __A, double __B, double __C, double __D)
+{
+  return  (__m512d)
+   { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_set4_ps (float __A, float __B, float __C, float __D)
+{
+  return  (__m512)
+   { __D, __C, __B, __A, __D, __C, __B, __A,
+     __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+#define _mm512_setr4_epi32(e0,e1,e2,e3)               \
+  _mm512_set4_epi32((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_epi64(e0,e1,e2,e3)               \
+  _mm512_set4_epi64((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_pd(e0,e1,e2,e3)                \
+  _mm512_set4_pd((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_ps(e0,e1,e2,e3)                \
+  _mm512_set4_ps((e3),(e2),(e1),(e0))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcastsd_pd(__m128d __A)
+{
+  return (__m512d)__builtin_shufflevector((__v2df) __A,
+                                          (__v2df) _mm_undefined_pd(),
+                                          0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/* Cast between vector types */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castpd256_pd512(__m256d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castps256_ps512(__m256 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
+                                          -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm512_castpd512_pd128(__m512d __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm512_castpd512_pd256 (__m512d __A)
+{
+  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm512_castps512_ps128(__m512 __a)
+{
+  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm512_castps512_ps256 (__m512 __A)
+{
+  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castpd_ps (__m512d __A)
+{
+  return (__m512) (__A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_castpd_si512 (__m512d __A)
+{
+  return (__m512i) (__A);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_castpd128_pd512 (__m128d __A)
+{
+  return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castps_pd (__m512 __A)
+{
+  return (__m512d) (__A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_castps_si512 (__m512 __A)
+{
+  return (__m512i) (__A);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_castps128_ps512 (__m128 __A)
+{
+    return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_castsi128_si512 (__m128i __A)
+{
+   return  __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_castsi256_si512 (__m256i __A)
+{
+   return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_castsi512_ps (__m512i __A)
+{
+  return (__m512) (__A);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_castsi512_pd (__m512i __A)
+{
+  return (__m512d) (__A);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm512_castsi512_si128 (__m512i __A)
+{
+  return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm512_castsi512_si256 (__m512i __A)
+{
+  return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_int2mask(int __a)
+{
+  return (__mmask16)__a;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask2int(__mmask16 __a)
+{
+  return (int)__a;
+}
+
+/* Bitwise operators */
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_epi32(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v16su)__a & (__v16su)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+                (__v16si) _mm512_and_epi32(__a, __b),
+                (__v16si) __src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
+                                         __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_epi64(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a & (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+    return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
+                (__v8di) _mm512_and_epi64(__a, __b),
+                (__v8di) __src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
+                                         __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i)(~(__v16su)(__A) & (__v16su)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_andnot_epi32(__A, __B),
+                                         (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
+                                           __U, __A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_andnot_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_andnot_epi64(__A, __B),
+                                          (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
+                                           __U, __A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_epi32(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v16su)__a | (__v16su)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+                                             (__v16si)_mm512_or_epi32(__a, __b),
+                                             (__v16si)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_epi64(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a | (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
+                                             (__v8di)_mm512_or_epi64(__a, __b),
+                                             (__v8di)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_epi32(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v16su)__a ^ (__v16su)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+                                            (__v16si)_mm512_xor_epi32(__a, __b),
+                                            (__v16si)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_epi64(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a ^ (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
+                                             (__v8di)_mm512_xor_epi64(__a, __b),
+                                             (__v8di)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+  return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_and_si512(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a & (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_or_si512(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a | (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_xor_si512(__m512i __a, __m512i __b)
+{
+  return (__m512i)((__v8du)__a ^ (__v8du)__b);
+}
+
+/* Arithmetic */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_add_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)((__v8df)__a + (__v8df)__b);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_add_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)((__v16sf)__a + (__v16sf)__b);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mul_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)((__v8df)__a * (__v8df)__b);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mul_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)((__v16sf)__a * (__v16sf)__b);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_sub_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)((__v8df)__a - (__v8df)__b);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_sub_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)((__v16sf)__a - (__v16sf)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A + (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_add_epi64(__A, __B),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_add_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A - (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_sub_epi64(__A, __B),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_sub_epi64(__A, __B),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_add_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A + (__v16su) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_add_epi32(__A, __B),
+                                             (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_add_epi32(__A, __B),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sub_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A - (__v16su) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_sub_epi32(__A, __B),
+                                             (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_sub_epi32(__A, __B),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_max_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_max_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_undefined_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_max_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_max_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_max_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
+
+#define _mm512_max_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_undefined_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_max_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_max_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_max_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_max_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_max_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline __m512i
+__DEFAULT_FN_ATTRS
+_mm512_max_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_max_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+#define _mm512_mask_min_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_min_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_min_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_undefined_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_min_pd(__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+             (__v8df) __B,
+             (__v8df)
+             _mm512_setzero_pd (),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_min_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_min_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
+
+#define _mm512_min_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_undefined_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                  (__v8df) __B,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_min_ps(__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+            (__v16sf) __B,
+            (__v16sf)
+            _mm512_setzero_ps (),
+            (__mmask16) -1,
+            _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                 (__v16sf) __B,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_min_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_min_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_min_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_min_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline __m512i
+__DEFAULT_FN_ATTRS
+_mm512_min_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu32(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+                   (__v16si) __B,
+                   (__v16si)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_min_epu64(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di) __W, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+                   (__v8di) __B,
+                   (__v8di)
+                   _mm512_setzero_si512 (),
+                   __M);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mul_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epi32(__X, __Y),
+                                             (__v8di)__W);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epi32(__X, __Y),
+                                             (__v8di)_mm512_setzero_si512 ());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mul_epu32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epu32(__X, __Y),
+                                             (__v8di)__W);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                             (__v8di)_mm512_mul_epu32(__X, __Y),
+                                             (__v8di)_mm512_setzero_si512 ());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mullo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A * (__v16su) __B);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                             (__v16si)_mm512_mullo_epi32(__A, __B),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                             (__v16si)_mm512_mullo_epi32(__A, __B),
+                                             (__v16si)__W);
+}
+
+#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_sqrt_round_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(U), (int)(R)); })
+
+#define _mm512_sqrt_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_undefined_pd(), \
+                                         (__mmask8)-1, (int)(R)); })
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_sqrt_pd(__m512d __a)
+{
+  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__a,
+                                                (__v8df) _mm512_setzero_pd (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                   (__v8df) __W,
+                   (__mmask8) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                   (__v8df)
+                   _mm512_setzero_pd (),
+                   (__mmask8) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_sqrt_round_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_sqrt_round_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(U), (int)(R)); })
+
+#define _mm512_sqrt_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_undefined_ps(), \
+                                        (__mmask16)-1, (int)(R)); })
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_sqrt_ps(__m512 __a)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__a,
+                                               (__v16sf) _mm512_setzero_ps (),
+                                               (__mmask16) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A,
+                                               (__v16sf) _mm512_setzero_ps (),
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_rsqrt14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                 (__v8df)
+                 _mm512_setzero_pd (),
+                 (__mmask8) -1);}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                  (__v8df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_rsqrt14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U);
+}
+
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+             (__v4sf) __B,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_ps (),
+          (__mmask8) __U);
+}
+
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rsqrt14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
+              (__v2df) __B,
+              (__v2df)
+              _mm_setzero_pd (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U);
+}
+
+static  __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_rcp14_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+               (__v8df)
+               _mm512_setzero_pd (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static  __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_rcp14_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+              (__v16sf)
+              _mm512_setzero_ps (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+                   (__v16sf)
+                   _mm512_setzero_ps (),
+                   (__mmask16) __U);
+}
+
+static  __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp14_ss(__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+                 (__v4sf) __B,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_ps (),
+          (__mmask8) __U);
+}
+
+static  __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rcp14_sd(__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
+            (__v2df) __B,
+            (__v2df)
+            _mm_setzero_pd (),
+            (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_floor_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_FLOOR,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                   _MM_FROUND_FLOOR,
+                   (__v16sf) __W, __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_floor_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_FLOOR,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                _MM_FROUND_FLOOR,
+                (__v8df) __W, __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                   _MM_FROUND_CEIL,
+                   (__v16sf) __W, __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_ceil_ps(__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                  _MM_FROUND_CEIL,
+                                                  (__v16sf) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_ceil_pd(__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                   _MM_FROUND_CEIL,
+                                                   (__v8df) __A, -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                _MM_FROUND_CEIL,
+                (__v8df) __W, __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi64(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+             (__v8di)
+             _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_abs_epi32(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+             (__v16si)
+             _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_add_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_add_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_add_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_add_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_add_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_add_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_add_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_add_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_add_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_add_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_add_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_add_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_add_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_add_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_add_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_add_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_sub_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_sub_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_sub_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_sub_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_sub_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_sub_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_sub_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_sub_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_sub_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_sub_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_sub_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_sub_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_sub_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_sub_round_ps(W, U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); });
+
+#define _mm512_maskz_sub_round_ps(U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); });
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#define _mm_mul_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_mul_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_mul_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mul_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_mul_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_mul_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_mul_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_mul_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_mul_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_mul_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_mul_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_mul_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_mul_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_mul_round_ps(W, U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); });
+
+#define _mm512_maskz_mul_round_ps(U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); });
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf)  _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_div_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm_maskz_div_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)  _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_div_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_div_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_div_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_div_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)((__v8df)__a/(__v8df)__b);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_div_pd(__A, __B),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_div_pd(__A, __B),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_div_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)((__v16sf)__a/(__v16sf)__b);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_div_ps(__A, __B),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_div_ps(__A, __B),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_div_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_div_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)(__m512d)(W), (__mmask8)(U), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_div_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
+                                        (__v8df)(__m512d)(B), \
+                                        (__v8df)_mm512_setzero_pd(), \
+                                        (__mmask8)(U), (int)(R)); })
+
+#define _mm512_div_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_div_round_ps(W, U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
+                                       (int)(R)); });
+
+#define _mm512_maskz_div_round_ps(U, A, B, R)  __extension__ ({ \
+  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
+                                       (__v16sf)(__m512)(B), \
+                                       (__v16sf)_mm512_setzero_ps(), \
+                                       (__mmask16)(U), (int)(R)); });
+
+#define _mm512_roundscale_ps(A, B) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
+                                         (__v16sf)(__m512)(A), (__mmask16)-1, \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_ps(A, B, C, imm) __extension__ ({\
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
+                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_roundscale_ps(A, B, imm) __extension__ ({\
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(A), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
+                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(A), (int)(R)); })
+
+#define _mm512_roundscale_round_ps(A, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
+                                         (__v16sf)_mm512_undefined_ps(), \
+                                         (__mmask16)-1, (int)(R)); })
+
+#define _mm512_roundscale_pd(A, B) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
+                                          (__v8df)(__m512d)(A), (__mmask8)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_pd(A, B, C, imm) __extension__ ({\
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
+                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_roundscale_pd(A, B, imm) __extension__ ({\
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(A), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
+                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
+                                          (int)(R)); })
+
+#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(A), (int)(R)); })
+
+#define _mm512_roundscale_round_pd(A, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
+                                          (__v8df)_mm512_undefined_pd(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
+                                           (int)(R)); })
+
+
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            -(__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(C), (__mmask8)-1, \
+                                           (int)(R)); })
+
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           -(__v8df)(__m512d)(C), \
+                                           (__mmask8)-1, (int)(R)); })
+
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            -(__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+                                                    (__v8df) __B,
+                                                    -(__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+                                                     (__v8df) __B,
+                                                     -(__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
+                                          (int)(R)); })
+
+
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           -(__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(C), (__mmask16)-1, \
+                                          (int)(R)); })
+
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          -(__v16sf)(__m512)(C), \
+                                          (__mmask16)-1, (int)(R)); })
+
+
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           -(__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   -(__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    -(__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8df)(__m512d)(C), \
+                                              (__mmask8)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8df)(__m512d)(C), \
+                                              (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              -(__v8df)(__m512d)(C), \
+                                              (__mmask8)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              -(__v8df)(__m512d)(C), \
+                                              (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               -(__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) -1,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        -(__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16sf)(__m512)(C), \
+                                             (__mmask16)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16sf)(__m512)(C), \
+                                             (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             -(__v16sf)(__m512)(C), \
+                                             (__mmask16)-1, (int)(R)); })
+
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             -(__v16sf)(__m512)(C), \
+                                             (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              -(__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       -(__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
+                                               (__v8df)(__m512d)(B), \
+                                               (__v8df)(__m512d)(C), \
+                                               (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__v8df) __C,
+                                                        (__mmask8) __U,
+                                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
+                                              (__v16sf)(__m512)(B), \
+                                              (__v16sf)(__m512)(C), \
+                                              (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__v16sf) __C,
+                                                       (__mmask16) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)(__m512d)(A), \
+                                            (__v8df)(__m512d)(B), \
+                                            (__v8df)(__m512d)(C), \
+                                            (__mmask8)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8df)(__m512d)(C), \
+                                             (__mmask8)(U), (int)(R)); })
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)(__m512)(A), \
+                                           (__v16sf)(__m512)(B), \
+                                           (__v16sf)(__m512)(C), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16sf)(__m512)(C), \
+                                            (__mmask16)(U), (int)(R)); })
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16sf) __C,
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+
+
+/* Vector permutations */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16si) __A,
+                                                       (__v16si) __B,
+                                                       (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U,
+                                __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+                                                        /* idx */ ,
+                                                        (__v16si) __A,
+                                                        (__v16si) __B,
+                                                        (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
+                                 __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I
+                                                        /* idx */ ,
+                                                        (__v16si) __A,
+                                                        (__v16si) __B,
+                                                        (__mmask16) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8di) __A,
+                                                       (__v8di) __B,
+                                                       (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
+                                __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8di) __A,
+                                                       (__v8di) __B,
+                                                       (__mmask8) __U);
+}
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I
+                                                        /* idx */ ,
+                                                        (__v8di) __A,
+                                                        (__v8di) __B,
+                                                        (__mmask8) __U);
+}
+
+#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \
+                                   (__v8di)(__m512i)(A), \
+                                   ((int)(I) & 0x7) + 0, \
+                                   ((int)(I) & 0x7) + 1, \
+                                   ((int)(I) & 0x7) + 2, \
+                                   ((int)(I) & 0x7) + 3, \
+                                   ((int)(I) & 0x7) + 4, \
+                                   ((int)(I) & 0x7) + 5, \
+                                   ((int)(I) & 0x7) + 6, \
+                                   ((int)(I) & 0x7) + 7); })
+
+#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+                                 (__v8di)(__m512i)(W)); })
+
+#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+                                 (__v8di)_mm512_setzero_si512()); })
+
+#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \
+                                   (__v16si)(__m512i)(A), \
+                                   ((int)(I) & 0xf) + 0, \
+                                   ((int)(I) & 0xf) + 1, \
+                                   ((int)(I) & 0xf) + 2, \
+                                   ((int)(I) & 0xf) + 3, \
+                                   ((int)(I) & 0xf) + 4, \
+                                   ((int)(I) & 0xf) + 5, \
+                                   ((int)(I) & 0xf) + 6, \
+                                   ((int)(I) & 0xf) + 7, \
+                                   ((int)(I) & 0xf) + 8, \
+                                   ((int)(I) & 0xf) + 9, \
+                                   ((int)(I) & 0xf) + 10, \
+                                   ((int)(I) & 0xf) + 11, \
+                                   ((int)(I) & 0xf) + 12, \
+                                   ((int)(I) & 0xf) + 13, \
+                                   ((int)(I) & 0xf) + 14, \
+                                   ((int)(I) & 0xf) + 15); })
+
+#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+                                (__v16si)(__m512i)(W)); })
+
+#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+                                (__v16si)_mm512_setzero_si512()); })
+/* Vector Extract */
+
+#define _mm512_extractf64x4_pd(A, I) __extension__ ({             \
+  (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A),          \
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   ((I) & 1) ? 4 : 0,             \
+                                   ((I) & 1) ? 5 : 1,             \
+                                   ((I) & 1) ? 6 : 2,             \
+                                   ((I) & 1) ? 7 : 3); })
+
+#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                   (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+                                   (__v4df)(W)); })
+
+#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                   (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
+                                   (__v4df)_mm256_setzero_pd()); })
+
+#define _mm512_extractf32x4_ps(A, I) __extension__ ({             \
+  (__m128)__builtin_shufflevector((__v16sf)(__m512)(A),           \
+                                  (__v16sf)_mm512_undefined_ps(), \
+                                  0 + ((I) & 0x3) * 4,            \
+                                  1 + ((I) & 0x3) * 4,            \
+                                  2 + ((I) & 0x3) * 4,            \
+                                  3 + ((I) & 0x3) * 4); })
+
+#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                   (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+                                   (__v4sf)(W)); })
+
+#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                   (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
+                                   (__v4sf)_mm_setzero_ps()); })
+
+/* Vector Blend */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
+{
+  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+                 (__v8df) __W,
+                 (__v8df) __A);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
+{
+  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+                (__v16sf) __W,
+                (__v16sf) __A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
+                (__v8di) __W,
+                (__v8di) __A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
+                (__v16si) __W,
+                (__v16si) __A);
+}
+
+/* Compare */
+
+#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), (int)(P), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), (int)(P), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_cmp_ps_mask(A, B, P) \
+  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmpeq_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_ps_mask(A, B) \
+    _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_ps_mask(k, A, B) \
+    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
+
+#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), (int)(P), \
+                                         (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(B), (int)(P), \
+                                         (__mmask8)(U), (int)(R)); })
+
+#define _mm512_cmp_pd_mask(A, B, P) \
+  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
+  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmpeq_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_pd_mask(A, B) \
+    _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_pd_mask(k, A, B) \
+    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
+
+/* Conversion */
+
+#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
+                                             (__v16si)_mm512_undefined_epi32(), \
+                                             (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
+                                             (__v16si)(__m512i)(W), \
+                                             (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
+                                             (__v16si)_mm512_setzero_si512(), \
+                                             (__mmask16)(U), (int)(R)); })
+
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epu32(__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                   (__v16si) __W,
+                   (__mmask16) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                   (__v16si) _mm512_setzero_si512 (),
+                   (__mmask16) __U,
+                   _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
+                                           (__v16sf)(__m512)(W), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                 (__v16sf) _mm512_undefined_ps (),
+                 (__mmask16) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                 (__v16sf) _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_pd(__m256i __A)
+{
+  return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepi32_pd(__A),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepi32_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepi32lo_pd(__m512i __A)
+{
+  return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+  return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                (__v16sf) _mm512_undefined_ps (),
+                (__mmask16) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                (__v16sf) __W,
+                (__mmask16) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                (__v16sf) _mm512_setzero_ps (),
+                (__mmask16) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_pd(__m256i __A)
+{
+  return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepu32_pd(__A),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                              (__v8df)_mm512_cvtepu32_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtepu32lo_pd(__m512i __A)
+{
+  return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
+{
+  return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
+}
+
+#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
+                                          (__v8sf)(__m256)(W), (__mmask8)(U), \
+                                          (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_ps(U, A, R) __extension__ ({ \
+  (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_cvtpd_ps (__m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                (__v8sf) _mm256_undefined_ps (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                (__v8sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                (__v8sf) _mm256_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtpd_pslo (__m512d __A)
+{
+  return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
+                (__v8sf) _mm256_setzero_ps (),
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
+{
+  return (__m512) __builtin_shufflevector (
+                (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
+                                               __U, __A),
+                (__v8sf) _mm256_setzero_ps (),
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)_mm256_undefined_si256(), \
+                                            (__mmask16)-1); })
+
+#define _mm512_mask_cvt_roundps_ph(U, W, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)(__m256i)(U), \
+                                            (__mmask16)(W)); })
+
+#define _mm512_maskz_cvt_roundps_ph(W, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)_mm256_setzero_si256(), \
+                                            (__mmask16)(W)); })
+
+#define _mm512_cvtps_ph(A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)_mm256_setzero_si256(), \
+                                            (__mmask16)-1); })
+
+#define _mm512_mask_cvtps_ph(U, W, A, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)(__m256i)(U), \
+                                            (__mmask16)(W)); })
+
+#define _mm512_maskz_cvtps_ph(W, A, I) __extension__ ({\
+  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+                                            (__v16hi)_mm256_setzero_si256(), \
+                                            (__mmask16)(W)); })
+
+#define _mm512_cvt_roundph_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
+                                           (__v16sf)_mm512_undefined_ps(), \
+                                           (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundph_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
+                                           (__v16sf)(__m512)(W), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundph_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)(U), (int)(R)); })
+
+
+static  __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtph_ps(__m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                (__v16sf)
+                _mm512_setzero_ps (),
+                (__mmask16) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                 (__v16sf) _mm512_setzero_ps (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)(__m256i)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epi32(__m512d __a)
+{
+  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
+                                                   (__v8si)_mm256_setzero_si256(),
+                                                   (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                  (__v8si) _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)(__m512i)(W), \
+                                            (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)(U), (int)(R)); })
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_cvttps_epi32(__m512 __a)
+{
+  return (__m512i)
+    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
+                                     (__v16si) _mm512_setzero_si512 (),
+                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                  (__v16si) _mm512_setzero_si512 (),
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+                                           (__v16si)(__m512i)(W), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epi32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+                                           (__v16si)_mm512_setzero_si512(), \
+                                           (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epi32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                 (__v16si) _mm512_undefined_epi32 (),
+                 (__mmask16) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                 (__v16si)
+                 _mm512_setzero_si512 (),
+                 (__mmask16) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8si)_mm256_setzero_si256(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8si)(__m256i)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+                                           (__v8si)_mm256_setzero_si256(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epi32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                 (__v8si)
+                 _mm256_undefined_si256 (),
+                 (__mmask8) -1,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)(__m512i)(W), \
+                                            (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_epu32(U, A, R) __extension__ ({ \
+  (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+                                            (__v16si)_mm512_setzero_si512(), \
+                                            (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtps_epu32 ( __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\
+                  (__v16si)\
+                  _mm512_undefined_epi32 (),\
+                  (__mmask16) -1,\
+                  _MM_FROUND_CUR_DIRECTION);\
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                  (__v16si) 
+                  _mm512_setzero_si512 (),
+                  (__mmask16) __U ,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+                                            (__v8si)_mm256_setzero_si256(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_undefined_si256 (),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+/* Unpack and Interleave */
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_unpackhi_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
+                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
+                                           (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
+                                           (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_unpacklo_pd(__m512d __a, __m512d __b)
+{
+  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
+                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
+                                           (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
+                                           (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_unpackhi_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
+                                         2,    18,    3,    19,
+                                         2+4,  18+4,  3+4,  19+4,
+                                         2+8,  18+8,  3+8,  19+8,
+                                         2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
+                                          (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
+                                          (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_unpacklo_ps(__m512 __a, __m512 __b)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
+                                         0,    16,    1,    17,
+                                         0+4,  16+4,  1+4,  17+4,
+                                         0+8,  16+8,  1+8,  17+8,
+                                         0+12, 16+12, 1+12, 17+12);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
+                                          (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
+                                          (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
+                                          2,    18,    3,    19,
+                                          2+4,  18+4,  3+4,  19+4,
+                                          2+8,  18+8,  3+8,  19+8,
+                                          2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
+                                       (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
+                                       (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
+                                          0,    16,    1,    17,
+                                          0+4,  16+4,  1+4,  17+4,
+                                          0+8,  16+8,  1+8,  17+8,
+                                          0+12, 16+12, 1+12, 17+12);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
+                                       (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
+                                       (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
+                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
+                                        (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
+                                        (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
+                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
+                                        (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
+                                        (__v8di)_mm512_setzero_si512());
+}
+
+/* Bit Test */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS
+_mm512_test_epi32_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+            (__v16si) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+                 (__v16si) __B, __U);
+}
+
+static __inline __mmask8 __DEFAULT_FN_ATTRS
+_mm512_test_epi64_mask(__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
+                 (__v8di) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U);
+}
+
+
+/* SIMD load ops */
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_loadu_si512 (void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) -1);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask16) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m512d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m512 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_load_ps(float const *__p)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_load_pd(double const *__p)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+                          (__v8df) __W,
+                          (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_load_si512 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_load_epi32 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_load_epi64 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+/* SIMD store ops */
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
+                                     (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_si512 (void *__P, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
+            (__mmask16) -1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
+                                     (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_pd(void *__P, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask((double *)__P, (__v8df)__A, (__mmask8)-1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_storeu_ps(void *__P, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask((float *)__P, (__v16sf)__A, (__mmask16)-1);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_pd(void *__P, __m512d __A)
+{
+  *(__m512d*)__P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
+                                   (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_ps(void *__P, __m512 __A)
+{
+  *(__m512*)__P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_si512 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_epi32 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS
+_mm512_store_epi64 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+/* Mask ops */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS
+_mm512_knot(__mmask16 __M)
+{
+  return __builtin_ia32_knothi(__M);
+}
+
+/* Integer compare */
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4,
+                                                __u);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi8_epi32(__m128i __A)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepi8_epi32(__A),
+                                             (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepi8_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi8_epi64(__m128i __A)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi8_epi64(__A),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi8_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi64(__m256i __X)
+{
+  return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi32_epi64(__X),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi32_epi64(__X),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi32(__m256i __A)
+{
+  return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepi16_epi32(__A),
+                                            (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepi16_epi32(__A),
+                                            (__v16si)_mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepi16_epi64(__m128i __A)
+{
+  return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi16_epi64(__A),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepi16_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu8_epi32(__m128i __A)
+{
+  return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepu8_epi32(__A),
+                                             (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_cvtepu8_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu8_epi64(__m128i __A)
+{
+  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu8_epi64(__A),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu8_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu32_epi64(__m256i __X)
+{
+  return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu32_epi64(__X),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu32_epi64(__X),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu16_epi32(__m256i __A)
+{
+  return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepu16_epi32(__A),
+                                            (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                            (__v16si)_mm512_cvtepu16_epi32(__A),
+                                            (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_cvtepu16_epi64(__m128i __A)
+{
+  return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu16_epi64(__A),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_cvtepu16_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rorv_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rorv_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __U);
+}
+
+
+
+#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (int)(p), \
+                                         (__mmask16)-1); })
+
+#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (int)(p), \
+                                          (__mmask16)-1); })
+
+#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+                                         (__v16si)(__m512i)(b), (int)(p), \
+                                         (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+                                          (__v16si)(__m512i)(b), (int)(p), \
+                                          (__mmask16)(m)); })
+
+#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+                                        (__v8di)(__m512i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+                                         (__v8di)(__m512i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm512_rol_epi32(a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
+                                        (__v16si)_mm512_setzero_si512(), \
+                                        (__mmask16)-1); })
+
+#define _mm512_mask_rol_epi32(W, U, a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
+                                        (__v16si)(__m512i)(W), \
+                                        (__mmask16)(U)); })
+
+#define _mm512_maskz_rol_epi32(U, a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
+                                        (__v16si)_mm512_setzero_si512(), \
+                                        (__mmask16)(U)); })
+
+#define _mm512_rol_epi64(a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
+                                        (__v8di)_mm512_setzero_si512(), \
+                                        (__mmask8)-1); })
+
+#define _mm512_mask_rol_epi64(W, U, a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
+                                        (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+#define _mm512_maskz_rol_epi64(U, a, b) __extension__ ({ \
+  (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
+                                        (__v8di)_mm512_setzero_si512(), \
+                                        (__mmask8)(U)); })
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rolv_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+              (__v16si) __B,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_rolv_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+              (__v8di) __B,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __U);
+}
+
+#define _mm512_ror_epi32(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
+                                        (__v16si)_mm512_setzero_si512(), \
+                                        (__mmask16)-1); })
+
+#define _mm512_mask_ror_epi32(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
+                                        (__v16si)(__m512i)(W), \
+                                        (__mmask16)(U)); })
+
+#define _mm512_maskz_ror_epi32(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
+                                        (__v16si)_mm512_setzero_si512(), \
+                                        (__mmask16)(U)); })
+
+#define _mm512_ror_epi64(A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
+                                        (__v8di)_mm512_setzero_si512(), \
+                                        (__mmask8)-1); })
+
+#define _mm512_mask_ror_epi64(W, U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
+                                        (__v8di)(__m512i)(W), (__mmask8)(U)); })
+
+#define _mm512_maskz_ror_epi64(U, A, B) __extension__ ({ \
+  (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
+                                        (__v8di)_mm512_setzero_si512(), \
+                                        (__mmask8)(U)); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi32(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_slli_epi32(__A, __B),
+                                         (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B) {
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_slli_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_slli_epi64(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_slli_epi64(__A, __B),
+                                          (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_slli_epi64(__A, __B),
+                                          (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi32(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_srli_epi32(__A, __B),
+                                         (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) {
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                         (__v16si)_mm512_srli_epi32(__A, __B),
+                                         (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srli_epi64(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_srli_epi64(__A, __B),
+                                          (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                          (__v8di)_mm512_srli_epi64(__A, __B),
+                                          (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+              (__v16si)
+              _mm512_setzero_si512 (),
+              (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
+          (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
+                 (__v16si) __A,
+                 (__v16si) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
+                 (__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
+                 (__v8di) __A,
+                 (__v8di) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
+                 (__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+              (__v8di)
+              _mm512_setzero_si512 (),
+              (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_movedup_pd (__m512d __A)
+{
+  return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
+                                          0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_movedup_pd(__A),
+                                              (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                              (__v8df)_mm512_movedup_pd(__A),
+                                              (__v8df)_mm512_setzero_pd());
+}
+
+#define _mm512_fixupimm_round_pd(A, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm512_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)-1, \
+                                             _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+                                             (__v8df)(__m512d)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)(U), \
+                                             _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8di)(__m512i)(C), \
+                                              (int)(imm), (__mmask8)(U), \
+                                              (int)(R)); })
+
+#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
+                                              (__v8df)(__m512d)(B), \
+                                              (__v8di)(__m512i)(C), \
+                                              (int)(imm), (__mmask8)(U), \
+                                              _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_fixupimm_round_ps(A, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)(U), (int)(R)); })
+
+#define _mm512_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)-1, \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+                                            (__v16sf)(__m512)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)(U), \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16si)(__m512i)(C), \
+                                             (int)(imm), (__mmask16)(U), \
+                                             (int)(R)); })
+
+#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
+                                             (__v16sf)(__m512)(B), \
+                                             (__v16si)(__m512i)(C), \
+                                             (int)(imm), (__mmask16)(U), \
+                                             _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_fixupimm_round_sd(A, B, C, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2di)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2di)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)(U), (int)(R)); })
+
+#define _mm_fixupimm_sd(A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2di)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)-1, \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_fixupimm_sd(A, U, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2di)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2di)(__m128i)(C), (int)(imm), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
+                                           (__v2df)(__m128d)(B), \
+                                           (__v2di)(__m128i)(C), (int)(imm), \
+                                           (__mmask8)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_fixupimm_round_ss(A, B, C, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4si)(__m128i)(C), (int)(imm), \
+                                         (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4si)(__m128i)(C), (int)(imm), \
+                                         (__mmask8)(U), (int)(R)); })
+
+#define _mm_fixupimm_ss(A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4si)(__m128i)(C), (int)(imm), \
+                                         (__mmask8)-1, \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_fixupimm_ss(A, U, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4si)(__m128i)(C), (int)(imm), \
+                                         (__mmask8)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4si)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
+                                          (__v4sf)(__m128)(B), \
+                                          (__v4si)(__m128i)(C), (int)(imm), \
+                                          (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_getexp_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
+                                                 (__v2df)(__m128d)(B), \
+                                                 (__v2df)_mm_setzero_pd(), \
+                                                 (__mmask8)-1, (int)(R)); })
+
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_getexp_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
+                 (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
+                                                 (__v2df)(__m128d)(B), \
+                                                 (__v2df)(__m128d)(W), \
+                                                 (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) _mm_setzero_pd (),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_sd(U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
+                                                 (__v2df)(__m128d)(B), \
+                                                 (__v2df)_mm_setzero_pd(), \
+                                                 (__mmask8)(U), (int)(R)); })
+
+#define _mm_getexp_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
+                                                (__v4sf)(__m128)(B), \
+                                                (__v4sf)_mm_setzero_ps(), \
+                                                (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_getexp_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+                (__v4sf) __B, (__v4sf)  _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __W,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
+                                                (__v4sf)(__m128)(B), \
+                                                (__v4sf)(__m128)(W), \
+                                                (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) _mm_setzero_pd (),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_ss(U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
+                                                (__v4sf)(__m128)(B), \
+                                                (__v4sf)_mm_setzero_ps(), \
+                                                (__mmask8)(U), (int)(R)); })
+
+#define _mm_getmant_round_sd(A, B, C, D, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)-1, (int)(R)); })
+
+#define _mm_getmant_sd(A, B, C, D)  __extension__ ({ \
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)-1, \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_getmant_sd(W, U, A, B, C, D) __extension__ ({\
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)(__m128d)(W), \
+                                               (__mmask8)(U), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R)({\
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)(__m128d)(W), \
+                                               (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_getmant_sd(U, A, B, C, D) __extension__ ({\
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)(U), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) __extension__ ({\
+  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (int)(((D)<<2) | (C)), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)(U), (int)(R)); })
+
+#define _mm_getmant_round_ss(A, B, C, D, R) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+#define _mm_getmant_ss(A, B, C, D) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)-1, \
+                                              _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_getmant_ss(W, U, A, B, C, D) __extension__ ({\
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)(__m128)(W), \
+                                              (__mmask8)(U), \
+                                              _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R)({\
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)(__m128)(W), \
+                                              (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_getmant_ss(U, A, B, C, D) __extension__ ({\
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)_mm_setzero_pd(), \
+                                              (__mmask8)(U), \
+                                              _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) __extension__ ({\
+  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (int)(((D)<<2) | (C)), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)(U), (int)(R)); })
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kmov (__mmask16 __A)
+{
+  return  __A;
+}
+
+#define _mm_comi_round_sd(A, B, P, R) __extension__ ({\
+  (int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+                              (int)(P), (int)(R)); })
+
+#define _mm_comi_round_ss(A, B, P, R) __extension__ ({\
+  (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+                              (int)(P), (int)(R)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
+         __mmask16 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A,
+                   (__v16si) __I
+                   /* idx */ ,
+                   (__v16si) __B,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi32(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sll_epi32(__A, __B),
+                                          (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sll_epi32(__A, __B),
+                                          (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sll_epi64(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_sll_epi64(__A, __B),
+                                             (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_sll_epi64(__A, __B),
+                                           (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
+                                           (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
+                                           (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sllv_epi64(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
+                                            (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
+                                            (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi32(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sra_epi32(__A, __B),
+                                          (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_sra_epi32(__A, __B),
+                                          (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_sra_epi64(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_sra_epi64(__A, __B),
+                                           (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_sra_epi64(__A, __B),
+                                           (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srav_epi32(__X, __Y),
+                                           (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srav_epi32(__X, __Y),
+                                           (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srav_epi64(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srav_epi64(__X, __Y),
+                                            (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srav_epi64(__X, __Y),
+                                            (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi32(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_srl_epi32(__A, __B),
+                                          (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                          (__v16si)_mm512_srl_epi32(__A, __B),
+                                          (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srl_epi64(__m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_srl_epi64(__A, __B),
+                                           (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                           (__v8di)_mm512_srl_epi64(__A, __B),
+                                           (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi32(__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
+                                           (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
+                                           (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
+                                            (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
+                                            (__v8di)_mm512_setzero_si512());
+}
+
+#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
+                                            (__v16si)(__m512i)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)-1); })
+
+#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
+                                            (__v16si)(__m512i)(B), \
+                                            (__v16si)(__m512i)(C), (int)(imm), \
+                                            (__mmask16)(U)); })
+
+#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
+                                             (__v16si)(__m512i)(B), \
+                                             (__v16si)(__m512i)(C), \
+                                             (int)(imm), (__mmask16)(U)); })
+
+#define _mm512_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
+                                            (__v8di)(__m512i)(B), \
+                                            (__v8di)(__m512i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
+                                            (__v8di)(__m512i)(B), \
+                                            (__v8di)(__m512i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
+                                             (__v8di)(__m512i)(B), \
+                                             (__v8di)(__m512i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+#endif
+
+#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvt_roundsd_i32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvt_roundsd_u32(A, R) __extension__ ({ \
+  (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvtsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \
+  (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
+                                                  (int)(R)); })
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvtsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
+                 __A,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvt_roundss_si32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvt_roundss_i32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundss_si64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvt_roundss_i64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
+#endif
+
+#define _mm_cvt_roundss_u32(A, R) __extension__ ({ \
+  (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); })
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvtss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundss_u64(A, R) __extension__ ({ \
+  (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
+                                                  (int)(R)); })
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvtss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
+                 __A,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvtt_roundsd_si32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttsd_i32 (__m128d __A)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+
+#define _mm_cvtt_roundsd_i64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_i64 (__m128d __A)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \
+  (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); })
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvttsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \
+  (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
+                                                   (int)(R)); })
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
+                  __A,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvtt_roundss_si32(A, R) __extension__ ({ \
+  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttss_i32 (__m128 __A)
+{
+  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
+
+#define _mm_cvtt_roundss_si64(A, R) __extension__ ({ \
+  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttss_i64 (__m128 __A)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \
+  (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); })
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS
+_mm_cvttss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \
+  (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
+                                                   (int)(R)); })
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_cvttss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
+                  __A,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
+            __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A,
+              (__v8di) __I
+              /* idx */ ,
+              (__v8df) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
+            __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A,
+                   (__v16si) __I
+                   /* idx */ ,
+                   (__v16sf) __B,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
+         __mmask8 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A,
+                   (__v8di) __I
+                   /* idx */ ,
+                   (__v8di) __B,
+                   (__mmask8) __U);
+}
+
+#define _mm512_permute_pd(X, C) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x1), \
+                                   0 + (((C) >> 1) & 0x1), \
+                                   2 + (((C) >> 2) & 0x1), \
+                                   2 + (((C) >> 3) & 0x1), \
+                                   4 + (((C) >> 4) & 0x1), \
+                                   4 + (((C) >> 5) & 0x1), \
+                                   6 + (((C) >> 6) & 0x1), \
+                                   6 + (((C) >> 7) & 0x1)); })
+
+#define _mm512_mask_permute_pd(W, U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permute_pd((X), (C)), \
+                                       (__v8df)(__m512d)(W)); })
+
+#define _mm512_maskz_permute_pd(U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permute_pd((X), (C)), \
+                                       (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_permute_ps(X, C) __extension__ ({ \
+  (__m512)__builtin_shufflevector((__v16sf)(__m512)(X), \
+                                  (__v16sf)_mm512_undefined_ps(), \
+                                   0  + (((C) >> 0) & 0x3), \
+                                   0  + (((C) >> 2) & 0x3), \
+                                   0  + (((C) >> 4) & 0x3), \
+                                   0  + (((C) >> 6) & 0x3), \
+                                   4  + (((C) >> 0) & 0x3), \
+                                   4  + (((C) >> 2) & 0x3), \
+                                   4  + (((C) >> 4) & 0x3), \
+                                   4  + (((C) >> 6) & 0x3), \
+                                   8  + (((C) >> 0) & 0x3), \
+                                   8  + (((C) >> 2) & 0x3), \
+                                   8  + (((C) >> 4) & 0x3), \
+                                   8  + (((C) >> 6) & 0x3), \
+                                   12 + (((C) >> 0) & 0x3), \
+                                   12 + (((C) >> 2) & 0x3), \
+                                   12 + (((C) >> 4) & 0x3), \
+                                   12 + (((C) >> 6) & 0x3)); })
+
+#define _mm512_mask_permute_ps(W, U, X, C) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_permute_ps((X), (C)), \
+                                      (__v16sf)(__m512)(W)); })
+
+#define _mm512_maskz_permute_ps(U, X, C) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_permute_ps((X), (C)), \
+                                      (__v16sf)_mm512_setzero_ps()); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_permutevar_pd(__m512d __A, __m512i __C)
+{
+  return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                         (__v8df)_mm512_permutevar_pd(__A, __C),
+                                         (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+                                         (__v8df)_mm512_permutevar_pd(__A, __C),
+                                         (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_permutevar_ps(__m512 __A, __m512i __C)
+{
+  return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
+                                        (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
+                                        (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                    /* idx */ ,
+                    (__v8df) __A,
+                    (__v8df) __B,
+                    (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                    /* idx */ ,
+                    (__v8df) __A,
+                    (__v8df) __B,
+                    (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
+            __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I
+                                                         /* idx */ ,
+                                                         (__v8df) __A,
+                                                         (__v8df) __B,
+                                                         (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                         /* idx */ ,
+                                                         (__v16sf) __A,
+                                                         (__v16sf) __B,
+                                                         (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                         /* idx */ ,
+                                                         (__v16sf) __A,
+                                                         (__v16sf) __B,
+                                                         (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
+            __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I
+                                                        /* idx */ ,
+                                                        (__v16sf) __A,
+                                                        (__v16sf) __B,
+                                                        (__mmask16) __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+             (__v16si) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+             (__v16si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+            (__v8di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+            (__v8di) __B, __U);
+}
+
+#define _mm512_cvtt_roundpd_epu32(A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8si)_mm256_undefined_si256(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8si)(__m256i)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) __extension__ ({ \
+  (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+                                             (__v8si)_mm256_setzero_si256(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvttpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_undefined_si256 (),
+                  (__mmask8) -1,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U,
+                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_roundscale_round_sd(A, B, imm, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)-1, (int)(imm), \
+                                                (int)(R)); })
+
+#define _mm_roundscale_sd(A, B, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)-1, (int)(imm), \
+                                                _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_sd(W, U, A, B, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)(__m128d)(W), \
+                                                (__mmask8)(U), (int)(imm), \
+                                                _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)(__m128d)(W), \
+                                                (__mmask8)(U), (int)(I), \
+                                                (int)(R)); })
+
+#define _mm_maskz_roundscale_sd(U, A, B, I) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U), (int)(I), \
+                                                _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+                                                (__v2df)(__m128d)(B), \
+                                                (__v2df)_mm_setzero_pd(), \
+                                                (__mmask8)(U), (int)(I), \
+                                                (int)(R)); })
+
+#define _mm_roundscale_round_ss(A, B, imm, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)-1, (int)(imm), \
+                                               (int)(R)); })
+
+#define _mm_roundscale_ss(A, B, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)-1, (int)(imm), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_ss(W, U, A, B, I) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)(__m128)(W), \
+                                               (__mmask8)(U), (int)(I), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)(__m128)(W), \
+                                               (__mmask8)(U), (int)(I), \
+                                               (int)(R)); })
+
+#define _mm_maskz_roundscale_ss(U, A, B, I) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)(U), (int)(I), \
+                                               _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+                                               (__v4sf)(__m128)(B), \
+                                               (__v4sf)_mm_setzero_ps(), \
+                                               (__mmask8)(U), (int)(I), \
+                                               (int)(R)); })
+
+#define _mm512_scalef_round_pd(A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(B), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_scalef_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                (__v8df) __B,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_scalef_round_ps(A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)_mm512_undefined_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, R) __extension__ ({ \
+  (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(B), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_scalef_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_undefined_ps (),
+               (__mmask16) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf) __W,
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+               (__v16sf) __B,
+               (__v16sf)
+               _mm512_setzero_ps (),
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_scalef_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_scalef_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
+              (__v2df)( __B), (__v2df) _mm_setzero_pd(),
+              (__mmask8) -1,
+              _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2df)(__m128d)(W), \
+                                              (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)(U), (int)(R)); })
+
+#define _mm_scalef_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_scalef_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
+             (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
+             (__mmask8) -1,
+             _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+                (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4sf)(__m128)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+                 (__v4sf) __B,
+                (__v4sf) _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)(U), \
+                                             _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi32(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+                                         (__v16si)_mm512_srai_epi32(__A, __B), \
+                                         (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) {
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \
+                                         (__v16si)_mm512_srai_epi32(__A, __B), \
+                                         (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_srai_epi64(__m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+                                          (__v8di)_mm512_srai_epi64(__A, __B), \
+                                          (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \
+                                          (__v8di)_mm512_srai_epi64(__A, __B), \
+                                          (__v8di)_mm512_setzero_si512());
+}
+
+#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_undefined_ps(), \
+                                         (__mmask16)-1); })
+
+#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)(__m512)(W), \
+                                         (__mmask16)(U)); })
+
+#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
+                                         (__v16sf)(__m512)(B), (int)(imm), \
+                                         (__v16sf)_mm512_setzero_ps(), \
+                                         (__mmask16)(U)); })
+
+#define _mm512_shuffle_f64x2(A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_undefined_pd(), \
+                                          (__mmask8)-1); })
+
+#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)(__m512d)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
+                                          (__v8df)(__m512d)(B), (int)(imm), \
+                                          (__v8df)_mm512_setzero_pd(), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_shuffle_i32x4(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), (int)(imm), \
+                                          (__v16si)_mm512_setzero_si512(), \
+                                          (__mmask16)-1); })
+
+#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), (int)(imm), \
+                                          (__v16si)(__m512i)(W), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), (int)(imm), \
+                                          (__v16si)_mm512_setzero_si512(), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_shuffle_i64x2(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), (int)(imm), \
+                                          (__v8di)_mm512_setzero_si512(), \
+                                          (__mmask8)-1); })
+
+#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), (int)(imm), \
+                                          (__v8di)(__m512i)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), (int)(imm), \
+                                          (__v8di)_mm512_setzero_si512(), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_shuffle_pd(A, B, M) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+                                   (__v8df)(__m512d)(B), \
+                                   0  + (((M) >> 0) & 0x1), \
+                                   8  + (((M) >> 1) & 0x1), \
+                                   2  + (((M) >> 2) & 0x1), \
+                                   10 + (((M) >> 3) & 0x1), \
+                                   4  + (((M) >> 4) & 0x1), \
+                                   12 + (((M) >> 5) & 0x1), \
+                                   6  + (((M) >> 6) & 0x1), \
+                                   14 + (((M) >> 7) & 0x1)); })
+
+#define _mm512_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
+                                       (__v8df)(__m512d)(W)); })
+
+#define _mm512_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
+                                       (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_shuffle_ps(A, B, M) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v16sf)(__m512)(A), \
+                                   (__v16sf)(__m512)(B), \
+                                   0  + (((M) >> 0) & 0x3), \
+                                   0  + (((M) >> 2) & 0x3), \
+                                   16 + (((M) >> 4) & 0x3), \
+                                   16 + (((M) >> 6) & 0x3), \
+                                   4  + (((M) >> 0) & 0x3), \
+                                   4  + (((M) >> 2) & 0x3), \
+                                   20 + (((M) >> 4) & 0x3), \
+                                   20 + (((M) >> 6) & 0x3), \
+                                   8  + (((M) >> 0) & 0x3), \
+                                   8  + (((M) >> 2) & 0x3), \
+                                   24 + (((M) >> 4) & 0x3), \
+                                   24 + (((M) >> 6) & 0x3), \
+                                   12 + (((M) >> 0) & 0x3), \
+                                   12 + (((M) >> 2) & 0x3), \
+                                   28 + (((M) >> 4) & 0x3), \
+                                   28 + (((M) >> 6) & 0x3)); })
+
+#define _mm512_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
+                                      (__v16sf)(__m512)(W)); })
+
+#define _mm512_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
+                                      (__v16sf)_mm512_setzero_ps()); })
+
+#define _mm_sqrt_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
+                                            (__v2df)(__m128d)(B), \
+                                            (__v2df)_mm_setzero_pd(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_sqrt_round_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
+                                            (__v2df)(__m128d)(B), \
+                                            (__v2df)(__m128d)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
+                 (__v2df) __B,
+                (__v2df) _mm_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_sqrt_round_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
+                                            (__v2df)(__m128d)(B), \
+                                            (__v2df)_mm_setzero_pd(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm_sqrt_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
+                                           (__v4sf)(__m128)(B), \
+                                           (__v4sf)_mm_setzero_ps(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
+                 (__v4sf) __B,
+                (__v4sf) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_sqrt_round_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
+                                           (__v4sf)(__m128)(B), \
+                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
+                                           (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
+                 (__v4sf) __B,
+                (__v4sf) _mm_setzero_ps (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_sqrt_round_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
+                                           (__v4sf)(__m128)(B), \
+                                           (__v4sf)_mm_setzero_ps(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_broadcast_f32x4 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                 (__v16sf)
+                 _mm512_undefined_ps (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                 (__v16sf) __O,
+                 __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_broadcast_f64x4 (__m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                  (__v8df)
+                  _mm512_undefined_pd (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                  (__v8df) __O,
+                  __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i32x4 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                  (__v16si)
+                  _mm512_undefined_epi32 (),
+                  (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                  (__v16si) __O,
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcast_i64x4 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                  (__v8di)
+                  _mm512_undefined_epi32 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                  (__v8di) __O,
+                  __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  __M);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512(__M,
+                                              (__v8df) _mm512_broadcastsd_pd(__A),
+                                              (__v8df) __O);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+  return (__m512d)__builtin_ia32_selectpd_512(__M,
+                                              (__v8df) _mm512_broadcastsd_pd(__A),
+                                              (__v8df) _mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512(__M,
+                                             (__v16sf) _mm512_broadcastss_ps(__A),
+                                             (__v16sf) __O);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512(__M,
+                                             (__v16sf) _mm512_broadcastss_ps(__A),
+                                             (__v16sf) _mm512_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+               (__v16qi) _mm_undefined_si128 (),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+               (__v16hi) _mm256_undefined_si256 (),
+               (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+               (__v16hi) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+               (__v16hi) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+               (__v16qi) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+               (__v8si) _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+               (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+               (__v8si) _mm256_setzero_si256 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtsepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+               (__v8hi) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                (__v16qi) _mm_undefined_si128 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                (__v16hi) _mm256_undefined_si256 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                (__v16hi) __O,
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                (__v16hi) _mm256_setzero_si256 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                (__v16qi) _mm_undefined_si128 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                (__v8si) _mm256_undefined_si256 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                (__v8si) _mm256_setzero_si256 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtusepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                (__v8hi) _mm_undefined_si128 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+              (__v16qi) _mm_undefined_si128 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+              (__v16hi) _mm256_undefined_si256 (),
+              (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+              (__v16hi) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+              (__v16hi) _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+              (__v16qi) _mm_undefined_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+              (__v8si) _mm256_undefined_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+              (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+              (__v8si) _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_cvtepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+              (__v8hi) _mm_undefined_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({            \
+  (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A),             \
+                                   (__v16si)_mm512_undefined_epi32(), \
+                                   0 + ((imm) & 0x3) * 4,             \
+                                   1 + ((imm) & 0x3) * 4,             \
+                                   2 + ((imm) & 0x3) * 4,             \
+                                   3 + ((imm) & 0x3) * 4); })
+
+#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+                                (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+                                (__v4si)__W); })
+
+#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+                                (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
+                                (__v4si)_mm_setzero_si128()); })
+
+#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({           \
+  (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A),             \
+                                   (__v8di)_mm512_undefined_epi32(), \
+                                   ((imm) & 1) ? 4 : 0,              \
+                                   ((imm) & 1) ? 5 : 1,              \
+                                   ((imm) & 1) ? 6 : 2,              \
+                                   ((imm) & 1) ? 7 : 3); })
+
+#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,      \
+                                (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+                                (__v4di)__W); })
+
+#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,      \
+                                (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
+                                (__v4di)_mm256_setzero_si256()); })
+
+#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
+                                 (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \
+                                 ((imm) & 0x1) ?  0 :  8, \
+                                 ((imm) & 0x1) ?  1 :  9, \
+                                 ((imm) & 0x1) ?  2 : 10, \
+                                 ((imm) & 0x1) ?  3 : 11, \
+                                 ((imm) & 0x1) ?  8 :  4, \
+                                 ((imm) & 0x1) ?  9 :  5, \
+                                 ((imm) & 0x1) ? 10 :  6, \
+                                 ((imm) & 0x1) ? 11 :  7); })
+
+#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+                                  (__v8df)(W)); })
+
+#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                  (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+                                  (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
+                                 (__v8di)_mm512_castsi256_si512((__m256i)(B)), \
+                                 ((imm) & 0x1) ?  0 :  8, \
+                                 ((imm) & 0x1) ?  1 :  9, \
+                                 ((imm) & 0x1) ?  2 : 10, \
+                                 ((imm) & 0x1) ?  3 : 11, \
+                                 ((imm) & 0x1) ?  8 :  4, \
+                                 ((imm) & 0x1) ?  9 :  5, \
+                                 ((imm) & 0x1) ? 10 :  6, \
+                                 ((imm) & 0x1) ? 11 :  7); })
+
+#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+                                  (__v8di)(W)); })
+
+#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                  (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+                                  (__v8di)_mm512_setzero_si512()); })
+
+#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \
+  (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
+                                  (__v16sf)_mm512_castps128_ps512((__m128)(B)),\
+                                  (((imm) & 0x3) == 0) ? 16 :  0, \
+                                  (((imm) & 0x3) == 0) ? 17 :  1, \
+                                  (((imm) & 0x3) == 0) ? 18 :  2, \
+                                  (((imm) & 0x3) == 0) ? 19 :  3, \
+                                  (((imm) & 0x3) == 1) ? 16 :  4, \
+                                  (((imm) & 0x3) == 1) ? 17 :  5, \
+                                  (((imm) & 0x3) == 1) ? 18 :  6, \
+                                  (((imm) & 0x3) == 1) ? 19 :  7, \
+                                  (((imm) & 0x3) == 2) ? 16 :  8, \
+                                  (((imm) & 0x3) == 2) ? 17 :  9, \
+                                  (((imm) & 0x3) == 2) ? 18 : 10, \
+                                  (((imm) & 0x3) == 2) ? 19 : 11, \
+                                  (((imm) & 0x3) == 3) ? 16 : 12, \
+                                  (((imm) & 0x3) == 3) ? 17 : 13, \
+                                  (((imm) & 0x3) == 3) ? 18 : 14, \
+                                  (((imm) & 0x3) == 3) ? 19 : 15); })
+
+#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+                                 (__v16sf)(W)); })
+
+#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
+  (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+                                 (__v16sf)_mm512_setzero_ps()); })
+
+#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+                                 (__v16si)_mm512_castsi128_si512((__m128i)(B)),\
+                                 (((imm) & 0x3) == 0) ? 16 :  0, \
+                                 (((imm) & 0x3) == 0) ? 17 :  1, \
+                                 (((imm) & 0x3) == 0) ? 18 :  2, \
+                                 (((imm) & 0x3) == 0) ? 19 :  3, \
+                                 (((imm) & 0x3) == 1) ? 16 :  4, \
+                                 (((imm) & 0x3) == 1) ? 17 :  5, \
+                                 (((imm) & 0x3) == 1) ? 18 :  6, \
+                                 (((imm) & 0x3) == 1) ? 19 :  7, \
+                                 (((imm) & 0x3) == 2) ? 16 :  8, \
+                                 (((imm) & 0x3) == 2) ? 17 :  9, \
+                                 (((imm) & 0x3) == 2) ? 18 : 10, \
+                                 (((imm) & 0x3) == 2) ? 19 : 11, \
+                                 (((imm) & 0x3) == 3) ? 16 : 12, \
+                                 (((imm) & 0x3) == 3) ? 17 : 13, \
+                                 (((imm) & 0x3) == 3) ? 18 : 14, \
+                                 (((imm) & 0x3) == 3) ? 19 : 15); })
+
+#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+                                 (__v16si)(W)); })
+
+#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+                                 (__v16si)_mm512_setzero_si512()); })
+
+#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)_mm512_undefined_pd(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)(__m512d)(W), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)(U), (int)(R)); })
+
+#define _mm512_getmant_pd(A, B, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)-1, \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)(__m512d)(W), \
+                                            (__mmask8)(U), \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v8df)_mm512_setzero_pd(), \
+                                            (__mmask8)(U), \
+                                            _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_getmant_round_ps(A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v16sf)_mm512_undefined_ps(), \
+                                           (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v16sf)(__m512)(W), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)(U), (int)(R)); })
+
+#define _mm512_getmant_ps(A, B, C) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2)|(B)), \
+                                           (__v16sf)_mm512_undefined_ps(), \
+                                           (__mmask16)-1, \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2)|(B)), \
+                                           (__v16sf)(__m512)(W), \
+                                           (__mmask16)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+                                           (int)(((C)<<2)|(B)), \
+                                           (__v16sf)_mm512_setzero_ps(), \
+                                           (__mmask16)(U), \
+                                           _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm512_getexp_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_getexp_round_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_getexp_round_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_getexp_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                (__v8df) _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_getexp_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)_mm512_undefined_ps(), \
+                                          (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_getexp_round_ps(W, U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)(__m512)(W), \
+                                          (__mmask16)(U), (int)(R)); })
+
+#define _mm512_maskz_getexp_round_ps(U, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(U), (int)(R)); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_getexp_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_undefined_ps (),
+               (__mmask16) -1,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_setzero_ps (),
+               (__mmask16) __U,
+               _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_i64gather_ps(index, addr, scale) __extension__ ({ \
+  (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
+                                       (float const *)(addr), \
+                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i64gather_ps( __v1_old, __mask, __index,\
+                                  __addr, __scale) __extension__({\
+__builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
+                              __addr,(__v8di) __index, __mask, __scale);\
+})
+
+#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \
+                                        (int const *)(addr), \
+                                        (__v8di)(__m512i)(index), \
+                                        (__mmask8)-1, (int)(scale)); })
+
+#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v8di)(__m512i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64gather_pd(index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
+                                       (double const *)(addr), \
+                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
+                                       (double const *)(addr), \
+                                       (__v8di)(__m512i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64gather_epi64(index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_pd(), \
+                                       (long long const *)(addr), \
+                                       (__v8di)(__m512i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
+                                       (long long const *)(addr), \
+                                       (__v8di)(__m512i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_ps(index, addr, scale) __extension__ ({\
+  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
+                                       (float const *)(addr), \
+                                       (__v16sf)(__m512)(index), \
+                                       (__mmask16)-1, (int)(scale)); })
+
+#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v16sf)(__m512)(index), \
+                                       (__mmask16)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_epi32(index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
+                                        (int const *)(addr), \
+                                        (__v16si)(__m512i)(index), \
+                                        (__mmask16)-1, (int)(scale)); })
+
+#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v16si)(__m512i)(index), \
+                                        (__mmask16)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_pd(index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
+                                       (double const *)(addr), \
+                                       (__v8si)(__m256i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
+                                       (double const *)(addr), \
+                                       (__v8si)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i32gather_epi64(index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
+                                       (long long const *)(addr), \
+                                       (__v8si)(__m256i)(index), (__mmask8)-1, \
+                                       (int)(scale)); })
+
+#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
+                                       (long long const *)(addr), \
+                                       (__v8si)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm512_i64scatter_ps(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)-1, \
+                                (__v8di)(__m512i)(index), \
+                                (__v8sf)(__m256)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)(mask), \
+                                (__v8di)(__m512i)(index), \
+                                (__v8sf)(__m256)(v1), (int)(scale)); })
+
+#define _mm512_i64scatter_epi32(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)-1, \
+                                (__v8di)(__m512i)(index), \
+                                (__v8si)(__m256i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)(mask), \
+                                (__v8di)(__m512i)(index), \
+                                (__v8si)(__m256i)(v1), (int)(scale)); })
+
+#define _mm512_i64scatter_pd(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)-1, \
+                               (__v8di)(__m512i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)(mask), \
+                               (__v8di)(__m512i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_i64scatter_epi64(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)-1, \
+                               (__v8di)(__m512i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)(mask), \
+                               (__v8di)(__m512i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_ps(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)-1, \
+                                (__v16si)(__m512i)(index), \
+                                (__v16sf)(__m512)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)(mask), \
+                                (__v16si)(__m512i)(index), \
+                                (__v16sf)(__m512)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_epi32(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)-1, \
+                                (__v16si)(__m512i)(index), \
+                                (__v16si)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)(mask), \
+                                (__v16si)(__m512i)(index), \
+                                (__v16si)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_pd(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8df)(__m512d)(v1), (int)(scale)); })
+
+#define _mm512_i32scatter_epi64(addr, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
+  __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8di)(__m512i)(v1), (int)(scale)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+          (__v4sf) __A,
+          (__v4sf) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        (__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+          (__v4sf) __A,
+          -(__v4sf) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        (__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
+          (__v4sf) __B,
+          -(__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         -(__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+          -(__v4sf) __A,
+          (__v4sf) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        -(__v4sf)(__m128)(A), \
+                                        (__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
+          (__v4sf) __B,
+          (__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         (__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) __extension__({\
+  (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+          -(__v4sf) __A,
+          -(__v4sf) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+                                        -(__v4sf)(__m128)(A), \
+                                        -(__v4sf)(__m128)(B), (__mmask8)(U), \
+                                        (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A,
+          (__v4sf) __B,
+          -(__v4sf) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) __extension__ ({\
+  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), \
+                                         -(__v4sf)(__m128)(C), (__mmask8)(U), \
+                                         _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W,
+          (__v4sf) __X,
+          (__v4sf) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\
+  (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \
+                                         (__v4sf)(__m128)(X), \
+                                         (__v4sf)(__m128)(Y), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+          (__v2df) __A,
+          (__v2df) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         (__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2df)(__m128d)(C), (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+          (__v2df) __A,
+          -(__v2df) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         (__v2df)(__m128d)(A), \
+                                         -(__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
+          (__v2df) __B,
+          -(__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          -(__v2df)(__m128d)(C), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), \
+                                          (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+          -(__v2df) __A,
+          (__v2df) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         -(__v2df)(__m128d)(A), \
+                                         (__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
+          (__v2df) __B,
+          (__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          (__v2df)(__m128d)(C), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) __W,
+          (__v2df) __X,
+          (__v2df) __Y,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) __extension__({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), (__mmask8)(U), \
+                                          (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
+          -(__v2df) __A,
+          -(__v2df) __B,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+                                         -(__v2df)(__m128d)(A), \
+                                         -(__v2df)(__m128d)(B), (__mmask8)(U), \
+                                         (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A,
+          (__v2df) __B,
+          -(__v2df) __C,
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) __extension__ ({\
+  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), \
+                                          -(__v2df)(__m128d)(C), \
+                                          (__mmask8)(U), \
+                                          _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W),
+          (__v2df) __X,
+          (__v2df) (__Y),
+          (__mmask8) __U,
+          _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\
+  (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \
+                                          (__v2df)(__m128d)(X), \
+                                          (__v2df)(__m128d)(Y), \
+                                          (__mmask8)(U), (int)(R)); })
+
+#define _mm512_permutex_pd(X, C) __extension__ ({ \
+  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
+                                   (__v8df)_mm512_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x3), \
+                                   0 + (((C) >> 2) & 0x3), \
+                                   0 + (((C) >> 4) & 0x3), \
+                                   0 + (((C) >> 6) & 0x3), \
+                                   4 + (((C) >> 0) & 0x3), \
+                                   4 + (((C) >> 2) & 0x3), \
+                                   4 + (((C) >> 4) & 0x3), \
+                                   4 + (((C) >> 6) & 0x3)); })
+
+#define _mm512_mask_permutex_pd(W, U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permutex_pd((X), (C)), \
+                                       (__v8df)(__m512d)(W)); })
+
+#define _mm512_maskz_permutex_pd(U, X, C) __extension__ ({ \
+  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+                                       (__v8df)_mm512_permutex_pd((X), (C)), \
+                                       (__v8df)_mm512_setzero_pd()); })
+
+#define _mm512_permutex_epi64(X, C) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(X), \
+                                   (__v8di)_mm512_undefined_epi32(), \
+                                   0 + (((C) >> 0) & 0x3), \
+                                   0 + (((C) >> 2) & 0x3), \
+                                   0 + (((C) >> 4) & 0x3), \
+                                   0 + (((C) >> 6) & 0x3), \
+                                   4 + (((C) >> 0) & 0x3), \
+                                   4 + (((C) >> 2) & 0x3), \
+                                   4 + (((C) >> 4) & 0x3), \
+                                   4 + (((C) >> 6) & 0x3)); })
+
+#define _mm512_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_permutex_epi64((X), (C)), \
+                                      (__v8di)(__m512i)(W)); })
+
+#define _mm512_maskz_permutex_epi64(U, X, C) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                      (__v8di)_mm512_permutex_epi64((X), (C)), \
+                                      (__v8di)_mm512_setzero_si512()); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) _mm512_undefined_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                 (__v8di) __X,
+                 (__v8df) _mm512_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) _mm512_undefined_epi32 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
+             __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                 (__v8di) __X,
+                 (__v8di) __W,
+                 __M);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) _mm512_undefined_ps (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                (__v16si) __X,
+                (__v16sf) _mm512_setzero_ps (),
+                (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) _mm512_setzero_si512 (),
+                 __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) _mm512_undefined_epi32 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
+             __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                 (__v16si) __X,
+                 (__v16si) __W,
+                 __M);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kand (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kandn (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestc (__mmask16 __A, __mmask16 __B)
+{
+  return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestz (__mmask16 __A, __mmask16 __B)
+{
+  return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxnor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_si512 (__m512i * __P, __m512i __A)
+{
+  __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_stream_load_si512 (void *__P)
+{
+  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_pd (double *__P, __m512d __A)
+{
+  __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_stream_ps (float *__P, __m512 __A)
+{
+  __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                  (__v8df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                  (__v8df)
+                  _mm512_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                  (__v8di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                  (__v8di)
+                  _mm512_setzero_si512 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                 (__v16sf) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                 (__v16sf)
+                 _mm512_setzero_ps (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                  (__v16si) __W,
+                  (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                  (__v16si)
+                  _mm512_setzero_si512 (),
+                  (__mmask16) __U);
+}
+
+#define _mm_cmp_round_ss_mask(X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm_cmp_ss_mask(X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)-1, \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_cmp_ss_mask(M, X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                      (__v4sf)(__m128)(Y), (int)(P), \
+                                      (__mmask8)(M), \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_cmp_round_sd_mask(X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm_cmp_sd_mask(X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)-1, \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+#define _mm_mask_cmp_sd_mask(M, X, Y, P) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+                                      (__v2df)(__m128d)(Y), (int)(P), \
+                                      (__mmask8)(M), \
+                                      _MM_FROUND_CUR_DIRECTION); })
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_movehdup_ps (__m512 __A)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
+                         1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_movehdup_ps(__A),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_movehdup_ps(__A),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_moveldup_ps (__m512 __A)
+{
+  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
+                         0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_moveldup_ps(__A),
+                                             (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+                                             (__v16sf)_mm512_moveldup_ps(__A),
+                                             (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  __m128 res = __A; 
+  res[0] = (__U & 1) ? __B[0] : __W[0];
+  return res; 
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  __m128 res = __A; 
+  res[0] = (__U & 1) ? __B[0] : 0; 
+  return res; 
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  __m128d res = __A; 
+  res[0] = (__U & 1) ? __B[0] : __W[0];
+  return res; 
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  __m128d res = __A; 
+  res[0] = (__U & 1) ? __B[0] : 0; 
+  return res; 
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storess128_mask ((__v16sf *)__W, 
+                (__v16sf) _mm512_castps128_ps512(__A),
+                (__mmask16) __U & (__mmask16)1);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storesd128_mask ((__v8df *)__W, 
+                (__v8df) _mm512_castpd128_pd512(__A),
+                (__mmask8) __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
+{
+  __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
+                                                (__v4sf) {0.0, 0.0, 0.0, 0.0},
+                                                0, 4, 4, 4);
+
+  return (__m128) __builtin_shufflevector(
+                           __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+                                      (__v16sf) _mm512_castps128_ps512(src),
+                                      (__mmask16) __U & 1),
+                           _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_load_ss (__mmask8 __U, const float* __A)
+{
+  return (__m128) __builtin_shufflevector(
+                           __builtin_ia32_loadss128_mask ((__v16sf *) __A,
+                                      (__v16sf) _mm512_setzero_ps(),
+                                      (__mmask16) __U & 1),
+                           _mm512_undefined_ps(), 0, 1, 2, 3);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
+{
+  __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
+                                                 (__v2df) {0.0, 0.0}, 0, 2);
+
+  return (__m128d) __builtin_shufflevector(
+                            __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+                                      (__v8df) _mm512_castpd128_pd512(src),
+                                      (__mmask8) __U & 1),
+                            _mm512_undefined_pd(), 0, 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_load_sd (__mmask8 __U, const double* __A)
+{
+  return (__m128d) __builtin_shufflevector(
+                            __builtin_ia32_loadsd128_mask ((__v8df *) __A,
+                                      (__v8df) _mm512_setzero_pd(),
+                                      (__mmask8) __U & 1),
+                            _mm512_undefined_pd(), 0, 1);
+}
+
+#define _mm512_shuffle_epi32(A, I) __extension__ ({ \
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+                                   (__v16si)_mm512_undefined_epi32(), \
+                                   0  + (((I) >> 0) & 0x3), \
+                                   0  + (((I) >> 2) & 0x3), \
+                                   0  + (((I) >> 4) & 0x3), \
+                                   0  + (((I) >> 6) & 0x3), \
+                                   4  + (((I) >> 0) & 0x3), \
+                                   4  + (((I) >> 2) & 0x3), \
+                                   4  + (((I) >> 4) & 0x3), \
+                                   4  + (((I) >> 6) & 0x3), \
+                                   8  + (((I) >> 0) & 0x3), \
+                                   8  + (((I) >> 2) & 0x3), \
+                                   8  + (((I) >> 4) & 0x3), \
+                                   8  + (((I) >> 6) & 0x3), \
+                                   12 + (((I) >> 0) & 0x3), \
+                                   12 + (((I) >> 2) & 0x3), \
+                                   12 + (((I) >> 4) & 0x3), \
+                                   12 + (((I) >> 6) & 0x3)); })
+
+#define _mm512_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_epi32((A), (I)), \
+                                      (__v16si)(__m512i)(W)); })
+
+#define _mm512_maskz_shuffle_epi32(U, A, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_epi32((A), (I)), \
+                                      (__v16si)_mm512_setzero_si512()); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+                (__v8df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+                (__v8df) _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+                (__v8di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+                (__v8di) _mm512_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+              (__v8df) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+              (__v8df) _mm512_setzero_pd(),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+              (__v8di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+              (__v8di) _mm512_setzero_pd(),
+              (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+                   (__v16sf) __W,
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+                   (__v16sf) _mm512_setzero_ps(),
+                   (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+              (__v16si) __W,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+              (__v16si) _mm512_setzero_ps(),
+              (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+               (__v16sf) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+               (__v16sf) _mm512_setzero_ps(),
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+                (__v16si) __W,
+                (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+                (__v16si) _mm512_setzero_ps(),
+                (__mmask16) __U);
+}
+
+#define _mm512_cvt_roundps_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_cvt_roundps_pd(W, U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+                                           (__v8df)(__m512d)(W), \
+                                           (__mmask8)(U), (int)(R)); })
+
+#define _mm512_maskz_cvt_roundps_pd(U, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_cvtps_pd (__m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df)
+                _mm512_undefined_pd (),
+                (__mmask8) -1,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df) __W,
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                (__v8df)
+                _mm512_setzero_pd (),
+                (__mmask8) __U,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_cvtpslo_pd (__m512 __A)
+{
+  return (__m512) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
+{
+  return (__m512) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+              (__v8df) __A,
+              (__v8df) __W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+              (__v8df) __A,
+              (__v8df) _mm512_setzero_pd ());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+             (__v16sf) __A,
+             (__v16sf) __W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+             (__v16sf) __A,
+             (__v16sf) _mm512_setzero_ps ());
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
+            (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
+            (__mmask16) __U);
+}
+
+#define _mm_cvt_roundsd_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v4sf)_mm_undefined_ps(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v4sf)(__m128)(W), \
+                                             (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
+{
+  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
+                                             (__v2df)(__B),
+                                             (__v4sf)(__W), 
+                                             (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
+{
+  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
+                                             (__v2df)(__B),
+                                             (__v4sf)_mm_setzero_ps(), 
+                                             (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtss_i32 _mm_cvtss_si32
+#define _mm_cvtsd_i32 _mm_cvtsd_si32
+#define _mm_cvti32_sd _mm_cvtsi32_sd
+#define _mm_cvti32_ss _mm_cvtsi32_ss
+#ifdef __x86_64__
+#define _mm_cvtss_i64 _mm_cvtss_si64
+#define _mm_cvtsd_i64 _mm_cvtsd_si64
+#define _mm_cvti64_sd _mm_cvtsi64_sd
+#define _mm_cvti64_ss _mm_cvtsi64_ss
+#endif
+
+#ifdef __x86_64__
+#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
+                                     (int)(R)); })
+
+#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
+                                     (int)(R)); })
+#endif
+
+#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+
+#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
+                                    (int)(R)); })
+
+#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
+                                    (int)(R)); })
+#endif
+
+#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v2df)_mm_undefined_pd(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v2df)(__m128d)(W), \
+                                              (__mmask8)(U), (int)(R)); })
+
+#define _mm_maskz_cvt_roundss_sd(U, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)(U), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
+{
+  return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
+                                              (__v4sf)(__B),
+                                              (__v2df)(__W),
+                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
+{
+  return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
+                                              (__v4sf)(__B),
+                                              (__v2df)_mm_setzero_pd(), 
+                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtu32_sd (__m128d __A, unsigned __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
+                                      (unsigned long long)(B), (int)(R)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
+                 _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
+                                     (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtu32_ss (__m128 __A, unsigned __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
+                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
+                                     (unsigned long long)(B), (int)(R)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
+                _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
+                 __M);
+}
+
+#ifdef __x86_64__
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
+                 __M);
+}
+#endif
+
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi32 (int __A, int __B, int __C, int __D,
+     int __E, int __F, int __G, int __H,
+     int __I, int __J, int __K, int __L,
+     int __M, int __N, int __O, int __P)
+{
+  return __extension__ (__m512i)(__v16si)
+  { __P, __O, __N, __M, __L, __K, __J, __I,
+    __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,           \
+       e8,e9,e10,e11,e12,e13,e14,e15)          \
+  _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
+                   (e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi64 (long long __A, long long __B, long long __C,
+     long long __D, long long __E, long long __F,
+     long long __G, long long __H)
+{
+  return __extension__ (__m512i) (__v8di)
+  { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)           \
+  _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_set_pd (double __A, double __B, double __C, double __D,
+        double __E, double __F, double __G, double __H)
+{
+  return __extension__ (__m512d)
+  { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)              \
+  _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_set_ps (float __A, float __B, float __C, float __D,
+        float __E, float __F, float __G, float __H,
+        float __I, float __J, float __K, float __L,
+        float __M, float __N, float __O, float __P)
+{
+  return __extension__ (__m512)
+  { __P, __O, __N, __M, __L, __K, __J, __I,
+    __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
+  _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
+                (e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_abs_ps(__m512 __A)
+{
+  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS
+_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
+{
+  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_abs_pd(__m512d __A)
+{
+  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS
+_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
+{
+  return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
+}
+
+// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
+// outputs. This class of vector operation forms the basis of many scientific
+// computations. In vector-reduction arithmetic, the evaluation off is
+// independent of the order of the input elements of V.
+
+// Used bisection method. At each step, we partition the vector with previous
+// step in half, and the operation is performed on its two halves.
+// This takes log2(n) steps where n is the number of elements in the vector.
+
+// Vec512 - Vector with size 512.
+// Operator - Can be one of following: +,*,&,|
+// T2  - Can get 'i' for int and 'f' for float.
+// T1 - Can get 'i' for int and 'd' for double.
+
+#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1)         \
+  __extension__({                                                      \
+    __m256##T1 Vec256 = __builtin_shufflevector(                       \
+                            (__v8d##T2)Vec512,                         \
+                            (__v8d##T2)Vec512,                         \
+                            0, 1, 2, 3)                                \
+                        Operator                                       \
+                        __builtin_shufflevector(                       \
+                            (__v8d##T2)Vec512,                         \
+                            (__v8d##T2)Vec512,                         \
+                            4, 5, 6, 7);                               \
+    __m128##T1 Vec128 = __builtin_shufflevector(                       \
+                            (__v4d##T2)Vec256,                         \
+                            (__v4d##T2)Vec256,                         \
+                            0, 1)                                      \
+                        Operator                                       \
+                        __builtin_shufflevector(                       \
+                            (__v4d##T2)Vec256,                         \
+                            (__v4d##T2)Vec256,                         \
+                            2, 3);                                     \
+    Vec128 = __builtin_shufflevector((__v2d##T2)Vec128,                \
+                                     (__v2d##T2)Vec128, 0, -1)         \
+             Operator                                                  \
+             __builtin_shufflevector((__v2d##T2)Vec128,                \
+                                     (__v2d##T2)Vec128, 1, -1);        \
+    return Vec128[0];                                                  \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) {
+  _mm512_reduce_operator_64bit(__W, +, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) {
+  _mm512_reduce_operator_64bit(__W, *, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) {
+  _mm512_reduce_operator_64bit(__W, &, i, i);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) {
+  _mm512_reduce_operator_64bit(__W, |, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) {
+  _mm512_reduce_operator_64bit(__W, +, f, d);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
+  _mm512_reduce_operator_64bit(__W, *, f, d);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - All vector elements set to the identity element. 
+// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
+// Operator - Can be one of following: +,*,&,|
+// Mask - Intrinsic Mask
+// T2  - Can get 'i' for int and 'f' for float.
+// T1 - Can get 'i' for int and 'd' for packed double-precision.
+// T3 - Can be Pd for packed double or q for q-word.
+
+#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator,     \
+                                          Mask, T2, T1, T3)                    \
+  __extension__({                                                              \
+    Vec512 = __builtin_ia32_select##T3##_512(                                  \
+                 (__mmask8)Mask,                                               \
+                 (__v8d##T2)Vec512,                                            \
+                 (__v8d##T2)Vec512Neutral);                                    \
+    _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1);                    \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), 
+                                    &, __M,  i, i, q);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M, 
+                                    i, i, q);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M, 
+                                    f, d, pd);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M,
+                                    f, d, pd);
+}
+
+// Vec512 - Vector with size 512.
+// Operator - Can be one of following: +,*,&,|
+// T2 - Can get 'i' for int and ' ' for packed single.
+// T1 - Can get 'i' for int and 'f' for float.
+
+#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \
+    __m256##T1 Vec256 =                                                        \
+            (__m256##T1)(__builtin_shufflevector(                              \
+                                    (__v16s##T2)Vec512,                        \
+                                    (__v16s##T2)Vec512,                        \
+                                    0, 1, 2, 3, 4, 5, 6, 7)                    \
+                                Operator                                       \
+                         __builtin_shufflevector(                              \
+                                    (__v16s##T2)Vec512,                        \
+                                    (__v16s##T2)Vec512,                        \
+                                    8, 9, 10, 11, 12, 13, 14, 15));            \
+    __m128##T1 Vec128 =                                                        \
+             (__m128##T1)(__builtin_shufflevector(                             \
+                                    (__v8s##T2)Vec256,                         \
+                                    (__v8s##T2)Vec256,                         \
+                                    0, 1, 2, 3)                                \
+                                Operator                                       \
+                          __builtin_shufflevector(                             \
+                                    (__v8s##T2)Vec256,                         \
+                                    (__v8s##T2)Vec256,                         \
+                                    4, 5, 6, 7));                              \
+    Vec128 = (__m128##T1)(__builtin_shufflevector(                             \
+                                    (__v4s##T2)Vec128,                         \
+                                    (__v4s##T2)Vec128,                         \
+                                    0, 1, -1, -1)                              \
+                                Operator                                       \
+                          __builtin_shufflevector(                             \
+                                    (__v4s##T2)Vec128,                         \
+                                    (__v4s##T2)Vec128,                         \
+                                    2, 3, -1, -1));                            \
+    Vec128 = (__m128##T1)(__builtin_shufflevector(                             \
+                                    (__v4s##T2)Vec128,                         \
+                                    (__v4s##T2)Vec128,                         \
+                                    0, -1, -1, -1)                             \
+                                Operator                                       \
+                          __builtin_shufflevector(                             \
+                                    (__v4s##T2)Vec128,                         \
+                                    (__v4s##T2)Vec128,                         \
+                                    1, -1, -1, -1));                           \
+    return Vec128[0];                                                          \
+  })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_reduce_add_epi32(__m512i __W) {
+  _mm512_reduce_operator_32bit(__W, +, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS 
+_mm512_reduce_mul_epi32(__m512i __W) {
+  _mm512_reduce_operator_32bit(__W, *, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS 
+_mm512_reduce_and_epi32(__m512i __W) {
+  _mm512_reduce_operator_32bit(__W, &, i, i);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS 
+_mm512_reduce_or_epi32(__m512i __W) {
+  _mm512_reduce_operator_32bit(__W, |, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_add_ps(__m512 __W) {
+  _mm512_reduce_operator_32bit(__W, +, f, );
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_reduce_mul_ps(__m512 __W) {
+  _mm512_reduce_operator_32bit(__W, *, f, );
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - All vector elements set to the identity element. 
+// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
+// Operator - Can be one of following: +,*,&,|
+// Mask - Intrinsic Mask
+// T2  - Can get 'i' for int and 'f' for float.
+// T1 - Can get 'i' for int and 'd' for double.
+// T3 - Can be Ps for packed single or d for d-word.
+
+#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator,     \
+                                          Mask, T2, T1, T3)                    \
+  __extension__({                                                              \
+    Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512(                      \
+                             (__mmask16)Mask,                                  \
+                             (__v16s##T2)Vec512,                               \
+                             (__v16s##T2)Vec512Neutral);                       \
+    _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1);                    \
+  })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M, 
+                                    i, i, d);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
+}
+
+// Used bisection method. At each step, we partition the vector with previous
+// step in half, and the operation is performed on its two halves.
+// This takes log2(n) steps where n is the number of elements in the vector.
+// This macro uses only intrinsics from the AVX512F feature.
+
+// Vec512 - Vector with size of 512.
+// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
+//              __mm512_max_epi64
+// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
+// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
+
+#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
+        Vec512 = _mm512_##IntrinName(                                          \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 0, 1, 2, 3, -1, -1, -1, -1),  \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 4, 5, 6, 7, -1, -1, -1, -1)); \
+        Vec512 = _mm512_##IntrinName(                                          \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 0, 1, -1, -1, -1, -1, -1, -1),\
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                 2, 3, -1, -1, -1, -1, -1,     \
+                                                 -1));                         \
+        Vec512 = _mm512_##IntrinName(                                          \
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                0, -1, -1, -1, -1, -1, -1, -1),\
+                                (__m512##T1)__builtin_shufflevector(           \
+                                                (__v8d##T2)Vec512,             \
+                                                (__v8d##T2)Vec512,             \
+                                                1, -1, -1, -1, -1, -1, -1, -1))\
+                                                ;                              \
+    return Vec512[0];                                                          \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS 
+_mm512_reduce_max_epi64(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu64(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS 
+_mm512_reduce_max_pd(__m512d __V) {
+  _mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64
+(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu64(__m512i __V) {
+  _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS 
+_mm512_reduce_min_pd(__m512d __V) {
+  _mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512 length vector with elements set to the identity element
+// Identity element: {max_epi,0x8000000000000000}
+//                   {max_epu,0x0000000000000000}
+//                   {max_pd, 0xFFF0000000000000}
+//                   {min_epi,0x7FFFFFFFFFFFFFFF}
+//                   {min_epu,0xFFFFFFFFFFFFFFFF}
+//                   {min_pd, 0x7FF0000000000000}
+//
+// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
+//              __mm512_max_epi64
+// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
+// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
+// T3 - Can get 'q' q word and 'pd' for packed double.
+//      [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
+                                        T2, T3, Mask)                          \
+  __extension__({                                                              \
+    Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512(                      \
+                             (__mmask8)Mask,                                   \
+                             (__v8d##T2)Vec512,                                \
+                             (__v8d##T2)Vec512Neutral);                        \
+    _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2);                    \
+  })
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
+                                  max_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
+                                  max_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()),
+                                  max_pd, d, f, pd, __M);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+                                  min_epi64, i, i, q, __M);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+                                  min_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
+  _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
+                                  min_pd, d, f, pd, __M);
+}
+
+// Vec512 - Vector with size 512.
+// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
+//              __mm512_max_epi32
+// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
+// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
+
+#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0, 1, 2, 3, 4, 5, 6, 7,                      \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  8, 9, 10, 11, 12, 13, 14, 15,                \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0, 1, 2, 3, -1, -1, -1, -1,                  \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  4, 5, 6, 7, -1, -1, -1, -1,                  \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0, 1, -1, -1, -1, -1, -1, -1,                \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  2, 3, -1, -1, -1, -1, -1, -1,                \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    Vec512 = _mm512_##IntrinName(                                              \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  0,  -1, -1, -1, -1, -1, -1, -1,              \
+                                  -1, -1, -1, -1, -1, -1, -1, -1),             \
+                  (__m512##T1)__builtin_shufflevector(                         \
+                                  (__v16s##T2)Vec512,                          \
+                                  (__v16s##T2)Vec512,                          \
+                                  1, -1, -1, -1, -1, -1, -1, -1,               \
+                                  -1, -1, -1, -1, -1, -1, -1, -1));            \
+    return Vec512[0];                                                          \
+  })
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, max_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_max_epu32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, max_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) {
+  _mm512_reduce_maxMin_32bit(a, max_ps, , f);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, min_epi32, i, i);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu32(__m512i a) {
+  _mm512_reduce_maxMin_32bit(a, min_epu32, i, i);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) {
+  _mm512_reduce_maxMin_32bit(a, min_ps, , f);
+}
+
+// Vec512 - Vector with size 512.
+// Vec512Neutral - A 512 length vector with elements set to the identity element
+// Identity element: {max_epi,0x80000000}
+//                   {max_epu,0x00000000}
+//                   {max_ps, 0xFF800000}
+//                   {min_epi,0x7FFFFFFF}
+//                   {min_epu,0xFFFFFFFF}
+//                   {min_ps, 0x7F800000}
+//
+// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
+//              __mm512_max_epi32
+// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
+// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
+// T3 - Can get 'q' q word and 'pd' for packed double.
+//      [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic Mask
+
+#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
+                                        T2, T3, Mask)                          \
+  __extension__({                                                              \
+    Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512(                      \
+                                        (__mmask16)Mask,                       \
+                                        (__v16s##T2)Vec512,                    \
+                                        (__v16s##T2)Vec512Neutral);            \
+   _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2);                     \
+   })
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
+                                  i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32,
+                                  i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V,-_mm512_set1_ps(__builtin_inff()), max_ps, , f,
+                                  ps, __M);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32,
+                                  i, i, d, __M);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32,
+                                  i, i, d, __M);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
+  _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f,
+                                  ps, __M);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __AVX512FINTRIN_H
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512ifmaintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512ifmaintrin.h
new file mode 100644
index 000000000..5defbaea8
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512ifmaintrin.h
@@ -0,0 +1,92 @@
+/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAINTRIN_H
+#define __IFMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512ifmavlintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512ifmavlintrin.h
new file mode 100644
index 000000000..131ee5cb4
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512ifmavlintrin.h
@@ -0,0 +1,149 @@
+/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAVLINTRIN_H
+#define __IFMAVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
+
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512pfintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512pfintrin.h
new file mode 100644
index 000000000..c7fa3cf31
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512pfintrin.h
@@ -0,0 +1,111 @@
+/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512PFINTRIN_H
+#define __AVX512PFINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
+
+#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfdps((__mmask16)(mask), \
+                             (__v16si)(__m512i)(index), (int const *)(addr), \
+                             (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfdps((__mmask16) -1, \
+                             (__v16si)(__m512i)(index), (int const *)(addr), \
+                             (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)(mask), \
+                              (__v16si)(__m512i)(index), (int *)(addr), \
+                              (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vbmiintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vbmiintrin.h
new file mode 100644
index 000000000..837238eda
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vbmiintrin.h
@@ -0,0 +1,137 @@
+/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIINTRIN_H
+#define __VBMIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
+         __mmask64 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
+              (__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
+               /* idx */ ,
+               (__v64qi) __A,
+               (__v64qi) __B,
+               (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_undefined_epi32 (),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_setzero_si512(),
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) __W,
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) __W,
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_setzero_si512 (),
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_undefined_epi32 (),
+                (__mmask64) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vbmivlintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vbmivlintrin.h
new file mode 100644
index 000000000..105c6d142
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vbmivlintrin.h
@@ -0,0 +1,247 @@
+/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIVLINTRIN_H
+#define __VBMIVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
+              (__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
+         __mmask32 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
+              (__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __A,
+              (__v16qi) __B,
+              (__mmask16) -
+              1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __A,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
+               /* idx */ ,
+               (__v16qi) __A,
+               (__v16qi) __B,
+               (__mmask16)
+               __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __A,
+              (__v32qi) __B,
+              (__mmask32) -
+              1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __A,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
+               /* idx */ ,
+               (__v32qi) __A,
+               (__v32qi) __B,
+               (__mmask32)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_undefined_si128 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_setzero_si128 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) __W,
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_undefined_si256 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_setzero_si256 (),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) __W,
+                 (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi) __W,
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_setzero_si128 (),
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_undefined_si128 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi) __W,
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_setzero_si256 (),
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_undefined_si256 (),
+                (__mmask32) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vlbwintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vlbwintrin.h
new file mode 100644
index 000000000..3b58d0433
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vlbwintrin.h
@@ -0,0 +1,3172 @@
+/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBWINTRIN_H
+#define __AVX512VLBWINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw")))
+
+static  __inline __m128i __DEFAULT_FN_ATTRS
+_mm_setzero_hi(void){
+    return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/* Integer compare */
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b,
+                                                   __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1,
+                                                 __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
+  return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 (__mmask32)-1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+  return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 (__mmask16)-1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
+  return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4,
+                                                 __u);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_add_epi8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_add_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_add_epi16(__A, __B),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_add_epi16(__A, __B),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_sub_epi8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_sub_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_sub_epi16(__A, __B),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_sub_epi16(__A, __B),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_add_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_add_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_add_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_add_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_sub_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_sub_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sub_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sub_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_mullo_epi16(__A, __B),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_mullo_epi16(__A, __B),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mullo_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mullo_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+              (__v16qi) __W,
+              (__v16qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+               (__v32qi) __W,
+               (__v32qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+               (__v8hi) __W,
+               (__v8hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+               (__v16hi) __W,
+               (__v16hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_abs_epi8(__A),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_abs_epi8(__A),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_abs_epi8(__A),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_abs_epi8(__A),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_abs_epi16(__A),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_abs_epi16(__A),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_abs_epi16(__A),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_abs_epi16(__A),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packs_epi32(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packs_epi32(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                          (__v16hi)_mm256_packs_epi32(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                          (__v16hi)_mm256_packs_epi32(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_packs_epi16(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_packs_epi16(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                          (__v32qi)_mm256_packs_epi16(__A, __B),
+                                          (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                          (__v32qi)_mm256_packs_epi16(__A, __B),
+                                          (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packus_epi32(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_packus_epi32(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                         (__v16hi)_mm256_packus_epi32(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                         (__v16hi)_mm256_packus_epi32(__A, __B),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                            (__v16qi)_mm_packus_epi16(__A, __B),
+                                            (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                            (__v16qi)_mm_packus_epi16(__A, __B),
+                                            (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                         (__v32qi)_mm256_packus_epi16(__A, __B),
+                                         (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                         (__v32qi)_mm256_packus_epi16(__A, __B),
+                                         (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epi8(__A, __B),
+                                            (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epi8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_adds_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epu8(__A, __B),
+                                            (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_adds_epu8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_adds_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epu16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_adds_epu16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_avg_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_avg_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_avg_epu8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                             (__v32qi)_mm256_avg_epu8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_avg_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_avg_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                            (__v16hi)_mm256_avg_epu16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                            (__v16hi)_mm256_avg_epu16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epi8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epi16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epi16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_max_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epu8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_max_epu8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_max_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epu16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_max_epu16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epi8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epi8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epi16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epi16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                             (__v16qi)_mm_min_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epu8(__A, __B),
+                                             (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                             (__v32qi)_mm256_min_epu8(__A, __B),
+                                             (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+                                             (__v8hi)_mm_min_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epu16(__A, __B),
+                                            (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+                                            (__v16hi)_mm256_min_epu16(__A, __B),
+                                            (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                            (__v16qi)_mm_shuffle_epi8(__A, __B),
+                                            (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                            (__v16qi)_mm_shuffle_epi8(__A, __B),
+                                            (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                         (__v32qi)_mm256_shuffle_epi8(__A, __B),
+                                         (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                         (__v32qi)_mm256_shuffle_epi8(__A, __B),
+                                         (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epi8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epi8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epi8(__A, __B),
+                                            (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epi8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epu8(__A, __B),
+                                             (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                             (__v16qi)_mm_subs_epu8(__A, __B),
+                                             (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epu8(__A, __B),
+                                            (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                            (__v32qi)_mm256_subs_epu8(__A, __B),
+                                            (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_subs_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
+      __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epu16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_subs_epu16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+               (__v8hi) __I /* idx */ ,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I,
+         __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+               (__v16hi) __I /* idx */ ,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I/* idx */,
+               (__v8hi) __A,
+               (__v8hi) __B,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I/* idx */,
+               (__v16hi) __A,
+               (__v16hi) __B,
+               (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                            (__v8hi)_mm_maddubs_epi16(__X, __Y),
+                                            (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                            (__v8hi)_mm_maddubs_epi16(__X, __Y),
+                                            (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
+                          __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                        (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+                                        (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                        (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+                                        (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_madd_epi16(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_madd_epi16(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_madd_epi16(__A, __B),
+                                            (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_madd_epi16(__A, __B),
+                                            (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi16_epi8 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                (__v16qi) _mm_setzero_si128(),
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi8 (__m128i __A) {
+
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi16_epi8 (__m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) __O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+               (__v16qi) _mm_setzero_si128(),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epu16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epu16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epu16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epu16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_mulhi_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epi16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_mulhi_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpackhi_epi8(__A, __B),
+                                           (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpackhi_epi8(__A, __B),
+                                           (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpackhi_epi8(__A, __B),
+                                        (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpackhi_epi8(__A, __B),
+                                        (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpackhi_epi16(__A, __B),
+                                           (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpackhi_epi16(__A, __B),
+                                           (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpackhi_epi16(__A, __B),
+                                       (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpackhi_epi16(__A, __B),
+                                       (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpacklo_epi8(__A, __B),
+                                           (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+                                           (__v16qi)_mm_unpacklo_epi8(__A, __B),
+                                           (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpacklo_epi8(__A, __B),
+                                        (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+                                        (__v32qi)_mm256_unpacklo_epi8(__A, __B),
+                                        (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpacklo_epi16(__A, __B),
+                                           (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                           (__v8hi)_mm_unpacklo_epi16(__A, __B),
+                                           (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpacklo_epi16(__A, __B),
+                                       (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                       (__v16hi)_mm256_unpacklo_epi16(__A, __B),
+                                       (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepi8_epi16(__A),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepi8_epi16(__A),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepi8_epi16(__A),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepu8_epi16(__A),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_cvtepu8_epi16(__A),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
+                                             (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                             (__v16hi)_mm256_cvtepu8_epi16(__A),
+                                             (__v16hi)_mm256_setzero_si256());
+}
+
+
+#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), (int)(p), \
+                                         (__mmask16)-1); })
+
+#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+                                         (__v16qi)(__m128i)(b), (int)(p), \
+                                         (__mmask16)(m)); })
+
+#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), (int)(p), \
+                                          (__mmask16)-1); })
+
+#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+                                          (__v16qi)(__m128i)(b), (int)(p), \
+                                          (__mmask16)(m)); })
+
+#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), (int)(p), \
+                                         (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+                                         (__v32qi)(__m256i)(b), (int)(p), \
+                                         (__mmask32)(m)); })
+
+#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), (int)(p), \
+                                          (__mmask32)-1); })
+
+#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
+  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+                                          (__v32qi)(__m256i)(b), (int)(p), \
+                                          (__mmask32)(m)); })
+
+#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+                                        (__v8hi)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+                                         (__v8hi)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), (int)(p), \
+                                         (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+                                         (__v16hi)(__m256i)(b), (int)(p), \
+                                         (__mmask16)(m)); })
+
+#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), (int)(p), \
+                                          (__mmask16)-1); })
+
+#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
+  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+                                          (__v16hi)(__m256i)(b), (int)(p), \
+                                          (__mmask16)(m)); })
+
+#define _mm_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
+                                      (__v8hi)(__m128i)(W)); })
+
+#define _mm_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
+                                      (__v8hi)_mm_setzero_hi()); })
+
+#define _mm256_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
+                                      (__v16hi)(__m256i)(W)); })
+
+#define _mm256_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
+                                      (__v16hi)_mm256_setzero_si256()); })
+
+#define _mm_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
+                                      (__v8hi)(__m128i)(W)); })
+
+#define _mm_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
+                                      (__v8hi)_mm_setzero_hi()); })
+
+#define _mm256_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v16hi)(__m256i)(W)); })
+
+#define _mm256_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                      (__v16hi)_mm256_shufflelo_epi16((A), \
+                                                                      (imm)), \
+                                      (__v16hi)_mm256_setzero_si256()); })
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sllv_epi16(__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_sllv_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_sllv_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sllv_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sllv_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sllv_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sll_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sll_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sll_epi16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sll_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_slli_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_slli_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_slli_epi16(__A, __B),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_slli_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srlv_epi16(__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srlv_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srlv_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srlv_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srlv_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srlv_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi16(__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srav_epi16(__A, __B),
+                                           (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                           (__v16hi)_mm256_srav_epi16(__A, __B),
+                                           (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srav_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srav_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sra_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_sra_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sra_epi16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_sra_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srai_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srai_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srai_epi16(__A, __B),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srai_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srl_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srl_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_srl_epi16(__A, __B),
+                                          (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                          (__v16hi)_mm256_srl_epi16(__A, __B),
+                                          (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srli_epi16(__A, __B),
+                                             (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+                                             (__v8hi)_mm_srli_epi16(__A, __B),
+                                             (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srli_epi16(__A, __B),
+                                         (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+                                         (__v16hi)_mm256_srli_epi16(__A, __B),
+                                         (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+                (__v8hi) __A,
+                (__v8hi) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+                (__v8hi) __A,
+                (__v8hi) _mm_setzero_hi ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+                (__v16hi) __A,
+                (__v16hi) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+                (__v16hi) __A,
+                (__v16hi) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+                (__v16qi) __A,
+                (__v16qi) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+                (__v16qi) __A,
+                (__v16qi) _mm_setzero_hi ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+                (__v32qi) __A,
+                (__v32qi) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+                (__v32qi) __A,
+                (__v32qi) _mm256_setzero_si256 ());
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+                 (__v16qi) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+                 (__v16qi)
+                 _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+                 (__v32qi) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+                 (__v32qi)
+                 _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
+                 (__v8hi) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
+                 (__v8hi)
+                 _mm_setzero_hi (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
+                 (__v16hi) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
+                 (__v16hi)
+                 _mm256_setzero_si256 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
+                 (__v16qi) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
+                 (__v16qi)
+                 _mm_setzero_si128 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
+                 (__v32qi) __W,
+                 (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
+                 (__v32qi)
+                 _mm256_setzero_si256 (),
+                 (__mmask32) __U);
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedquhi128_mask ((__v8hi *) __P,
+             (__v8hi) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
+{
+  __builtin_ia32_storedquhi256_mask ((__v16hi *) __P,
+             (__v16hi) __A,
+             (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
+{
+  __builtin_ia32_storedquqi128_mask ((__v16qi *) __P,
+             (__v16qi) __A,
+             (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
+{
+  __builtin_ia32_storedquqi256_mask ((__v32qi *) __P,
+             (__v32qi) __A,
+             (__mmask32) __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_test_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+            (__v16qi) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+            (__v16qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_test_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+            (__v32qi) __B,
+            (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+            (__v32qi) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                 (__v8hi) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                 (__v8hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_test_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+            (__v16hi) __B,
+            (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+            (__v16hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_testn_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+             (__v16qi) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+             (__v16qi) __B, __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+             (__v32qi) __B,
+             (__mmask32) -1);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+             (__v32qi) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+            (__v8hi) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+            (__v8hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+             (__v16hi) __B,
+             (__mmask16) -1);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+             (__v16hi) __B, __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm_movepi8_mask (__m128i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_movepi8_mask (__m256i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi16_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm256_movepi16_mask (__m256i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi8 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi8 (__mmask32 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi16 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi16 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128(__M,
+                                             (__v16qi) _mm_broadcastb_epi8(__A),
+                                             (__v16qi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectb_128(__M,
+                                             (__v16qi) _mm_broadcastb_epi8(__A),
+                                             (__v16qi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256(__M,
+                                             (__v32qi) _mm256_broadcastb_epi8(__A),
+                                             (__v32qi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectb_256(__M,
+                                             (__v32qi) _mm256_broadcastb_epi8(__A),
+                                             (__v32qi) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128(__M,
+                                             (__v8hi) _mm_broadcastw_epi16(__A),
+                                             (__v8hi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectw_128(__M,
+                                             (__v8hi) _mm_broadcastw_epi16(__A),
+                                             (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256(__M,
+                                             (__v16hi) _mm256_broadcastw_epi16(__A),
+                                             (__v16hi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectw_256(__M,
+                                             (__v16hi) _mm256_broadcastw_epi16(__A),
+                                             (__v16hi) _mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                 (__v16hi) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                 (__v16hi) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                 (__v8hi) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                 (__v8hi) _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) _mm_undefined_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) _mm_setzero_si128 (),
+                 (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                 (__v8hi) __A,
+                 (__v8hi) __W,
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) _mm256_undefined_si256 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) _mm256_setzero_si256 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                 (__v16hi) __A,
+                 (__v16hi) __W,
+                 (__mmask16) __M);
+}
+
+#define _mm_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+                                 (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+                                 (__v16qi)(__m128i)(W)); })
+
+#define _mm_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+                                 (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+                                 (__v16qi)_mm_setzero_si128()); })
+
+#define _mm256_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+                              (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+                              (__v32qi)(__m256i)(W)); })
+
+#define _mm256_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+                              (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+                              (__v32qi)_mm256_setzero_si256()); })
+
+#define _mm_dbsad_epu8(A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
+                                           (__v16qi)(__m128i)(B), (int)(imm), \
+                                           (__v8hi)_mm_setzero_hi(), \
+                                           (__mmask8)-1); })
+
+#define _mm_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
+                                           (__v16qi)(__m128i)(B), (int)(imm), \
+                                           (__v8hi)(__m128i)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \
+                                           (__v16qi)(__m128i)(B), (int)(imm), \
+                                           (__v8hi)_mm_setzero_si128(), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_dbsad_epu8(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
+                                           (__v32qi)(__m256i)(B), (int)(imm), \
+                                           (__v16hi)_mm256_setzero_si256(), \
+                                           (__mmask16)-1); })
+
+#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
+                                           (__v32qi)(__m256i)(B), (int)(imm), \
+                                           (__v16hi)(__m256i)(W), \
+                                           (__mmask16)(U)); })
+
+#define _mm256_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \
+                                           (__v32qi)(__m256i)(B), (int)(imm), \
+                                           (__v16hi)_mm256_setzero_si256(), \
+                                           (__mmask16)(U)); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLBWINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vlcdintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vlcdintrin.h
new file mode 100644
index 000000000..7b02e2e1f
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vlcdintrin.h
@@ -0,0 +1,263 @@
+/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLCDINTRIN_H
+#define __AVX512VLCDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di)
+               _mm_setzero_di (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di)  _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLCDINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vldqintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vldqintrin.h
new file mode 100644
index 000000000..cd9da4370
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vldqintrin.h
@@ -0,0 +1,1200 @@
+/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLDQINTRIN_H
+#define __AVX512VLDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq")))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) ((__v4du) __A * (__v4du) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_mullo_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_mullo_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) ((__v2du) __A * (__v2du) __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_mullo_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_mullo_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_andnot_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_andnot_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_andnot_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_andnot_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_andnot_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_andnot_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_andnot_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_andnot_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_and_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_and_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_and_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_and_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_and_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_and_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_and_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_and_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_xor_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_xor_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_xor_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_xor_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_xor_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_xor_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_xor_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_xor_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_or_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_or_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_or_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_or_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_or_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_or_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_or_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_or_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu64 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu64 (__m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu64 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                (__v2di) _mm_setzero_si128(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu64 (__m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                (__v4di) _mm256_setzero_si256(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu64_pd (__m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                (__v2df) _mm_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_pd (__m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                (__v4df) _mm256_setzero_pd(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu64_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtepu64_ps (__m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                (__v4sf) _mm_setzero_ps(),
+                (__mmask8) __U);
+}
+
+#define _mm_range_pd(A, B, C) __extension__ ({                         \
+  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(C), \
+                                          (__v2df)_mm_setzero_pd(), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_range_pd(W, U, A, B, C) __extension__ ({          \
+  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(C), \
+                                          (__v2df)(__m128d)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm_maskz_range_pd(U, A, B, C) __extension__ ({              \
+  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(C), \
+                                          (__v2df)_mm_setzero_pd(), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_range_pd(A, B, C) __extension__ ({                      \
+  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+                                          (__v4df)(__m256d)(B), (int)(C), \
+                                          (__v4df)_mm256_setzero_pd(), \
+                                          (__mmask8)-1); })
+
+#define _mm256_mask_range_pd(W, U, A, B, C) __extension__ ({       \
+  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+                                          (__v4df)(__m256d)(B), (int)(C), \
+                                          (__v4df)(__m256d)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_range_pd(U, A, B, C) __extension__ ({           \
+  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+                                          (__v4df)(__m256d)(B), (int)(C), \
+                                          (__v4df)_mm256_setzero_pd(), \
+                                          (__mmask8)(U)); })
+
+#define _mm_range_ps(A, B, C) __extension__ ({                         \
+  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), (int)(C), \
+                                         (__v4sf)_mm_setzero_ps(), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_range_ps(W, U, A, B, C) __extension__ ({          \
+  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), (int)(C), \
+                                         (__v4sf)(__m128)(W), (__mmask8)(U)); })
+
+#define _mm_maskz_range_ps(U, A, B, C) __extension__ ({              \
+  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+                                         (__v4sf)(__m128)(B), (int)(C), \
+                                         (__v4sf)_mm_setzero_ps(), \
+                                         (__mmask8)(U)); })
+
+#define _mm256_range_ps(A, B, C) __extension__ ({                      \
+  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+                                         (__v8sf)(__m256)(B), (int)(C), \
+                                         (__v8sf)_mm256_setzero_ps(), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_range_ps(W, U, A, B, C) __extension__ ({       \
+  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+                                         (__v8sf)(__m256)(B), (int)(C), \
+                                         (__v8sf)(__m256)(W), (__mmask8)(U)); })
+
+#define _mm256_maskz_range_ps(U, A, B, C) __extension__ ({           \
+  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+                                         (__v8sf)(__m256)(B), (int)(C), \
+                                         (__v8sf)_mm256_setzero_ps(), \
+                                         (__mmask8)(U)); })
+
+#define _mm_reduce_pd(A, B) __extension__ ({                \
+  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)-1); })
+
+#define _mm_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+                                           (__v2df)(__m128d)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm_maskz_reduce_pd(U, A, B) __extension__ ({     \
+  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+                                           (__v2df)_mm_setzero_pd(), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_reduce_pd(A, B) __extension__ ({                \
+  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+                                           (__v4df)_mm256_setzero_pd(), \
+                                           (__mmask8)-1); })
+
+#define _mm256_mask_reduce_pd(W, U, A, B) __extension__ ({ \
+  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+                                           (__v4df)(__m256d)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_maskz_reduce_pd(U, A, B) __extension__ ({     \
+  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+                                           (__v4df)_mm256_setzero_pd(), \
+                                           (__mmask8)(U)); })
+
+#define _mm_reduce_ps(A, B) __extension__ ({                   \
+  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)-1); })
+
+#define _mm_mask_reduce_ps(W, U, A, B) __extension__ ({    \
+  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+                                          (__v4sf)(__m128)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm_maskz_reduce_ps(U, A, B) __extension__ ({        \
+  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+                                          (__v4sf)_mm_setzero_ps(), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_reduce_ps(A, B) __extension__ ({                \
+  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)-1); })
+
+#define _mm256_mask_reduce_ps(W, U, A, B) __extension__ ({ \
+  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+                                          (__v8sf)(__m256)(W), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_reduce_ps(U, A, B) __extension__ ({     \
+  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+                                          (__v8sf)_mm256_setzero_ps(), \
+                                          (__mmask8)(U)); })
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi32_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_movepi32_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi32 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi32 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movm_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_movm_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_movepi64_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_movepi64_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_f32x2 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                (__v8sf)_mm256_undefined_ps(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                (__v8sf) __O,
+                __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                (__v8sf) _mm256_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_f64x2 (__m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
+                 (__v4df)_mm256_undefined_pd(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
+                 (__v4df) __O,
+                 __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
+                 (__v4df) _mm256_setzero_ps (),
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcast_i32x2 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
+                 (__v4si)_mm_undefined_si128(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
+                 (__v4si) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
+                 (__v4si) _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i32x2 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
+                 (__v8si)_mm256_undefined_si256(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
+                 (__v8si) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i64x2 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
+                 (__v4di)_mm256_undefined_si256(),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
+                 (__v4di) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
+                 (__v4di) _mm256_setzero_si256 (),
+                 __M);
+}
+
+#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A),           \
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   ((imm) & 1) ? 2 : 0,           \
+                                   ((imm) & 1) ? 3 : 1); })
+
+#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                   (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
+                                   (__v2df)(W)); })
+
+#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                   (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
+                                   (__v2df)_mm_setzero_pd()); })
+
+#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A),             \
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   ((imm) & 1) ? 2 : 0,              \
+                                   ((imm) & 1) ? 3 : 1); })
+
+#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
+                                (__v2di)(W)); })
+
+#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
+                                (__v2di)_mm_setzero_di()); })
+
+#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(A), \
+                                 (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
+                                 ((imm) & 0x1) ? 0 : 4, \
+                                 ((imm) & 0x1) ? 1 : 5, \
+                                 ((imm) & 0x1) ? 4 : 2, \
+                                 ((imm) & 0x1) ? 5 : 3); })
+
+#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                  (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+                                  (__v4df)(W)); })
+
+#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                  (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+                                  (__v4df)_mm256_setzero_pd()); })
+
+#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(A), \
+                                 (__v4di)_mm256_castsi128_si256((__m128i)(B)), \
+                                 ((imm) & 0x1) ? 0 : 4, \
+                                 ((imm) & 0x1) ? 1 : 5, \
+                                 ((imm) & 0x1) ? 4 : 2, \
+                                 ((imm) & 0x1) ? 5 : 3); })
+
+#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                  (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+                                  (__v4di)(W)); })
+
+#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                  (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+                                  (__v4di)_mm256_setzero_si256()); })
+
+#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm_fpclass_pd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_fpclass_pd_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm_fpclass_ps_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_fpclass_ps_mask(A, imm) __extension__ ({ \
+  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vlintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vlintrin.h
new file mode 100644
index 000000000..f3744da6a
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avx512vlintrin.h
@@ -0,0 +1,8932 @@
+/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLINTRIN_H
+#define __AVX512VLINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl")))
+
+/* Doesn't require avx512vl, used in avx512dqintrin.h */
+static  __inline __m128i __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+_mm_setzero_di(void) {
+  return (__m128i)(__v2di){ 0LL, 0LL};
+}
+
+/* Integer compare */
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0,
+                                                __u);
+}
+
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b,
+                                                  __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4,
+                                                __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                               __u);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                (__mmask8)-1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4,
+                                                __u);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_add_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_add_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_add_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_add_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sub_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sub_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sub_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sub_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_add_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_add_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_add_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_add_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sub_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sub_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sub_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sub_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epi32(__X, __Y),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epi32(__X, __Y),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epi32(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epi32(__X, __Y),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epu32(__X, __Y),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                             (__v4di)_mm256_mul_epu32(__X, __Y),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epu32(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+                                             (__v2di)_mm_mul_epu32(__X, __Y),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_mullo_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_mullo_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_mullo_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_mullo_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_and_si256(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_and_si128(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                          (__v8si)_mm256_andnot_si256(__A, __B),
+                                          (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(),
+                                           __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_andnot_si128(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_or_si256(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_or_si128(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_xor_si256(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_xor_si128(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_and_si256(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_and_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                          (__v4di)_mm256_andnot_si256(__A, __B),
+                                          (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(),
+                                           __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_andnot_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_or_si256(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_or_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_xor_si256(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A,
+        __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_xor_si128(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+#define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+                                        (__v4si)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+                                         (__v4si)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+                                        (__v8si)(__m256i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+                                         (__v8si)(__m256i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+                                        (__v2di)(__m128i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+                                         (__v2di)(__m128i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), (int)(p), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+                                        (__v4di)(__m256i)(b), (int)(p), \
+                                        (__mmask8)(m)); })
+
+#define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
+  (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+                                         (__v4di)(__m256i)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_ps_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+                                         (__v8sf)(__m256)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+                                         (__v8sf)(__m256)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm256_cmp_pd_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm256_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+                                         (__v4df)(__m256d)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm_cmp_ps_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+                                         (__v4sf)(__m128)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_ps_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+                                         (__v4sf)(__m128)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+#define _mm_cmp_pd_mask(a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), (int)(p), \
+                                         (__mmask8)-1); })
+
+#define _mm_mask_cmp_pd_mask(m, a, b, p)  __extension__ ({ \
+  (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+                                         (__v2df)(__m128d)(b), (int)(p), \
+                                         (__mmask8)(m)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    -(__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A,
+                                                     (__v2df) __B,
+                                                     -(__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    -(__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A,
+                                                     (__v4df) __B,
+                                                     -(__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   -(__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    -(__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   -(__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    -(__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       -(__v2df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        -(__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       -(__v4df) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        -(__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      -(__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       -(__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
+                         __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      -(__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       -(__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__v2df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__v4df) __C,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__v4sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__v8sf) __C,
+                                                       (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      (__v4df) __C,
+                                                      (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4sf) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     (__v8sf) __C,
+                                                     (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_add_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_add_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_add_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_add_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_add_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_add_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_add_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_add_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+                (__v4si) __W,
+                (__v4si) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+                (__v8si) __W,
+                (__v8si) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
+  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
+                 (__v2df) __W,
+                 (__v2df) __A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
+  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
+                 (__v4df) __W,
+                 (__v4df) __A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
+  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
+                (__v4sf) __W,
+                (__v4sf) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
+  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
+                (__v8sf) __W,
+                (__v8sf) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+                (__v2di) __W,
+                (__v2di) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+                (__v4di) __W,
+                (__v4di) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                  (__v2df)
+                  _mm_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                  (__v4df)
+                  _mm256_setzero_pd (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                  (__v2di)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                  (__v4di)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                 (__v4sf)
+                 _mm_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                 (__v8sf)
+                 _mm256_setzero_ps (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) {
+  __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
+            (__v2df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) {
+  __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
+            (__v4df) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
+            (__v2di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
+            (__v4di) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) {
+  __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
+            (__v4sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) {
+  __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
+            (__v8sf) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
+            (__v4si) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
+            (__v8si) __A,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepi32_pd(__A),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepi32_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepi32_pd(__A),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepi32_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf) __W,
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+            (__v4sf)
+            _mm_setzero_ps (),
+            (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                (__v4si)
+                _mm_setzero_si128 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                (__v8si)
+                _mm256_setzero_si256 (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epu32 (__m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epu32 (__m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epu32 (__m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) {
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                  (__v4si)
+                  _mm_setzero_si128 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epu32 (__m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si) __W,
+                  (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                  (__v8si)
+                  _mm256_setzero_si256 (),
+                  (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepu32_pd (__m128i __A) {
+  return (__m128d) __builtin_convertvector(
+      __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepu32_pd(__A),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+                                              (__v2df)_mm_cvtepu32_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_pd (__m128i __A) {
+  return (__m256d)__builtin_convertvector((__v4su)__A, __v4df);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepu32_pd(__A),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+                                              (__v4df)_mm256_cvtepu32_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepu32_ps (__m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepu32_ps (__m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_div_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_div_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_div_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_div_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_div_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_div_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_div_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_div_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                (__v2di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                (__v4di) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+              (__v2df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+              (__v4df) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+              (__v2di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+               (__v2di)
+               _mm_setzero_si128 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+              (__v4di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+               (__v4di)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+                   (__v4sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+                   (__v8sf) __W,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+              (__v4si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+               (__v4si)
+               _mm_setzero_si128 (),
+               (__mmask8)     __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
+             void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+              (__v8si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                (__v4si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                (__v8si) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_getexp_pd (__m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) {
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_getexp_pd (__m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) {
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_getexp_ps (__m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) {
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_getexp_ps (__m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_max_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_max_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_max_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_max_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_max_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_max_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_max_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_max_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_min_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_min_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_min_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_min_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_min_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_min_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_min_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_min_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_mul_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_mul_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_mul_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_mul_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_mul_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_mul_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_mul_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_mul_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_abs_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_abs_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask16)__U,
+                                             (__v8si)_mm256_abs_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask16)__U,
+                                             (__v8si)_mm256_abs_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi64 (__m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+             (__v2di)
+             _mm_setzero_si128 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_abs_epi64 (__m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+             (__v4di)
+             _mm256_setzero_si256 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epu32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_max_epu32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epu32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_max_epu32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_max_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epi64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epu32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+                                             (__v4si)_mm_min_epu32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epu32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                             (__v8si)_mm256_min_epu32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu64 (__m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+        __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_min_epu64 (__m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+           __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              __M);
+}
+
+#define _mm_roundscale_pd(A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+                                              (int)(imm), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)-1); })
+
+
+#define _mm_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+                                              (int)(imm), \
+                                              (__v2df)(__m128d)(W), \
+                                              (__mmask8)(U)); })
+
+
+#define _mm_maskz_roundscale_pd(U, A, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+                                              (int)(imm), \
+                                              (__v2df)_mm_setzero_pd(), \
+                                              (__mmask8)(U)); })
+
+
+#define _mm256_roundscale_pd(A, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)-1); })
+
+
+#define _mm256_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+                                              (int)(imm), \
+                                              (__v4df)(__m256d)(W), \
+                                              (__mmask8)(U)); })
+
+
+#define _mm256_maskz_roundscale_pd(U, A, imm)  __extension__ ({ \
+  (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)(U)); })
+
+#define _mm_roundscale_ps(A, imm)  __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)-1); })
+
+
+#define _mm_mask_roundscale_ps(W, U, A, imm)  __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__v4sf)(__m128)(W), \
+                                             (__mmask8)(U)); })
+
+
+#define _mm_maskz_roundscale_ps(U, A, imm)  __extension__ ({ \
+  (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+                                             (__v4sf)_mm_setzero_ps(), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_roundscale_ps(A, imm)  __extension__ ({ \
+  (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_roundscale_ps(W, U, A, imm)  __extension__ ({ \
+  (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__v8sf)(__m256)(W), \
+                                             (__mmask8)(U)); })
+
+
+#define _mm256_maskz_roundscale_ps(U, A, imm)  __extension__ ({ \
+  (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)(U)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_scalef_pd (__m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                (__v2df) __B,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_scalef_pd (__m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                (__v4df) __B,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_scalef_ps (__m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+               (__v4sf) __B,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_scalef_ps (__m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+               (__v8sf) __B,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+#define _mm_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
+
+#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
+
+#define _mm_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
+
+#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
+
+#define _mm256_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
+
+#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
+
+#define _mm_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)-1, \
+                               (__v2di)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)(mask), \
+                               (__v2di)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm256_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)-1, \
+                               (__v4di)(__m256i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({  \
+  __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)(mask), \
+                               (__v4di)(__m256i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_i32scatter_pd(addr, index, v1, scale) __extension__ ({      \
+  __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
+
+#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({        \
+  __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v2df)(__m128d)(v1), (int)(scale)); })
+
+#define _mm_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v2di)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_i32scatter_pd(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
+
+#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4df)(__m256d)(v1), (int)(scale)); })
+
+#define _mm256_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
+
+#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4di)(__m256i)(v1), (int)(scale)); })
+
+#define _mm_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+                               (int)(scale)); })
+
+#define _mm_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)-1, \
+                               (__v4si)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)(mask), \
+                               (__v4si)(__m128i)(index), \
+                               (__v4si)(__m128i)(v1), (int)(scale)); })
+
+#define _mm256_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+                               (int)(scale)); })
+
+#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+                               (int)(scale)); })
+
+#define _mm256_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)-1, \
+                               (__v8si)(__m256i)(index), \
+                               (__v8si)(__m256i)(v1), (int)(scale)); })
+
+#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
+  __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)(mask), \
+                               (__v8si)(__m256i)(index), \
+                               (__v8si)(__m256i)(v1), (int)(scale)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sqrt_pd(__A),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sqrt_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sqrt_pd(__A),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sqrt_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sqrt_ps(__A),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sqrt_ps(__A),
+                                             (__v4sf)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sqrt_ps(__A),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sqrt_ps(__A),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sub_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_sub_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sub_pd(__A, __B),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_sub_pd(__A, __B),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sub_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_sub_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sub_ps(__A, __B),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_sub_ps(__A, __B),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A,
+              (__v2di) __I
+              /* idx */ ,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A,
+              (__v4di) __I
+              /* idx */ ,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A,
+                   (__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A,
+                   (__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A,
+                   (__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I,
+         __mmask8 __U, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A,
+                   (__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4si) __A,
+                   (__v4si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8si) __A,
+                   (__v8si) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8) -
+              1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I,
+        __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+              /* idx */ ,
+              (__v2df) __A,
+              (__v2df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I,
+         __m128d __B) {
+  return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I
+               /* idx */ ,
+               (__v2df) __A,
+               (__v2df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8) -
+              1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I,
+           __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+              /* idx */ ,
+              (__v4df) __A,
+              (__v4df) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I,
+            __m256d __B) {
+  return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I
+               /* idx */ ,
+               (__v4df) __A,
+               (__v4df) __B,
+               (__mmask8)
+               __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I,
+        __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                   /* idx */ ,
+                   (__v4sf) __A,
+                   (__v4sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I,
+         __m128 __B) {
+  return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I
+              /* idx */ ,
+              (__v4sf) __A,
+              (__v4sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I,
+           __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                   /* idx */ ,
+                   (__v8sf) __A,
+                   (__v8sf) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I,
+            __m256 __B) {
+  return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I
+              /* idx */ ,
+              (__v8sf) __A,
+              (__v8sf) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I,
+           __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                   /* idx */ ,
+                   (__v2di) __A,
+                   (__v2di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I,
+            __m128i __B) {
+  return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I
+              /* idx */ ,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8)
+              __U);
+}
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I,
+        __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                   /* idx */ ,
+                   (__v4di) __A,
+                   (__v4di) __B,
+                   (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
+         __m256i __I, __m256i __B) {
+  return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I
+              /* idx */ ,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi8_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi8_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi8_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi8_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi8_epi64(__A),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi8_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi8_epi64(__A),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi8_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi32_epi64(__X),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi32_epi64(__X),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi32_epi64(__X),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi32_epi64(__X),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi16_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepi16_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi16_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepi16_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi16_epi64(__A),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepi16_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi16_epi64(__A),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepi16_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu8_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu8_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu8_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu8_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu8_epi64(__A),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu8_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu8_epi64(__A),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu8_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu32_epi64(__X),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu32_epi64(__X),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu32_epi64(__X),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu32_epi64(__X),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu16_epi32(__A),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_cvtepu16_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu16_epi32(__A),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_cvtepu16_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu16_epi64(__A),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_cvtepu16_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu16_epi64(__A),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_cvtepu16_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+
+#define _mm_rol_epi32(a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_rol_epi32(w, u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
+                                        (__v4si)(__m128i)(w), (__mmask8)(u)); })
+
+#define _mm_maskz_rol_epi32(u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)(u)); })
+
+#define _mm256_rol_epi32(a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_rol_epi32(w, u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
+                                        (__v8si)(__m256i)(w), (__mmask8)(u)); })
+
+#define _mm256_maskz_rol_epi32(u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)(u)); })
+
+#define _mm_rol_epi64(a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_rol_epi64(w, u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
+                                        (__v2di)(__m128i)(w), (__mmask8)(u)); })
+
+#define _mm_maskz_rol_epi64(u, a, b) __extension__ ({\
+  (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)(u)); })
+
+#define _mm256_rol_epi64(a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_rol_epi64(w, u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
+                                        (__v4di)(__m256i)(w), (__mmask8)(u)); })
+
+#define _mm256_maskz_rol_epi64(u, a, b) __extension__ ({\
+  (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)(u)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rolv_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rolv_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rolv_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rolv_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+#define _mm_ror_epi32(A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_ror_epi32(W, U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                        (__v4si)(__m128i)(W), (__mmask8)(U)); })
+
+#define _mm_maskz_ror_epi32(U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                        (__v4si)_mm_setzero_si128(), \
+                                        (__mmask8)(U)); })
+
+#define _mm256_ror_epi32(A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_ror_epi32(W, U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                        (__v8si)(__m256i)(W), (__mmask8)(U)); })
+
+#define _mm256_maskz_ror_epi32(U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                        (__v8si)_mm256_setzero_si256(), \
+                                        (__mmask8)(U)); })
+
+#define _mm_ror_epi64(A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)-1); })
+
+#define _mm_mask_ror_epi64(W, U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                        (__v2di)(__m128i)(W), (__mmask8)(U)); })
+
+#define _mm_maskz_ror_epi64(U, A, B) __extension__ ({ \
+  (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \
+                                        (__v2di)_mm_setzero_di(), \
+                                        (__mmask8)(U)); })
+
+#define _mm256_ror_epi64(A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)-1); })
+
+#define _mm256_mask_ror_epi64(W, U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                        (__v4di)(__m256i)(W), (__mmask8)(U)); })
+
+#define _mm256_maskz_ror_epi64(U, A, B) __extension__ ({ \
+  (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                        (__v4di)_mm256_setzero_si256(), \
+                                        (__mmask8)(U)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sll_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sll_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sll_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sll_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_slli_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_slli_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_slli_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_slli_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sll_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sll_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sll_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_sll_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_slli_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_slli_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_slli_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_slli_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rorv_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+              (__v4si) __B,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rorv_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+              (__v8si) __B,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rorv_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+         __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+              (__v2di) __B,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_rorv_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+              (__v4di) __B,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sllv_epi64(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_sllv_epi64(__X, __Y),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_sllv_epi64(__X, __Y),
+                                            (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_sllv_epi64(__X, __Y),
+                                            (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sllv_epi32(__X, __Y),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sllv_epi32(__X, __Y),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_sllv_epi32(__X, __Y),
+                                            (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_sllv_epi32(__X, __Y),
+                                            (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srlv_epi64(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srlv_epi64(__X, __Y),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_srlv_epi64(__X, __Y),
+                                            (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                            (__v4di)_mm256_srlv_epi64(__X, __Y),
+                                            (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srlv_epi32(__X, __Y),
+                                            (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srlv_epi32(__X, __Y),
+                                            (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srlv_epi32(__X, __Y),
+                                            (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srlv_epi32(__X, __Y),
+                                            (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srl_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srl_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srl_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srl_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srli_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srli_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srli_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srli_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srl_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srl_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srl_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srl_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srli_epi64(__A, __B),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srli_epi64(__A, __B),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srli_epi64(__A, __B),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srli_epi64(__A, __B),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srav_epi32(__X, __Y),
+                                            (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                            (__v4si)_mm_srav_epi32(__X, __Y),
+                                            (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srav_epi32(__X, __Y),
+                                            (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                            (__v8si)_mm256_srav_epi32(__X, __Y),
+                                            (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srav_epi64(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srav_epi64(__X, __Y),
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_srav_epi64(__X, __Y),
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srav_epi64(__m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di) __Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srav_epi64(__X, __Y),
+                                             (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_srav_epi64(__X, __Y),
+                                             (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+                 (__v4si) __A,
+                 (__v4si) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+                 (__v4si) __A,
+                 (__v4si) _mm_setzero_si128 ());
+}
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+                 (__v8si) __A,
+                 (__v8si) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+                 (__v8si) __A,
+                 (__v8si) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+              (__v4si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+              (__v4si)
+              _mm_setzero_si128 (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+              (__v8si) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+              (__v8si)
+              _mm256_setzero_si256 (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_movdqa32store128_mask ((__v4si *) __P,
+          (__v4si) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_movdqa32store256_mask ((__v8si *) __P,
+          (__v8si) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+                 (__v2di) __A,
+                 (__v2di) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+                 (__v2di) __A,
+                 (__v2di) _mm_setzero_di ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+                 (__v4di) __A,
+                 (__v4di) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+                 (__v4di) __A,
+                 (__v4di) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+              (__v2di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+              (__v2di)
+              _mm_setzero_di (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+              (__v4di) __W,
+              (__mmask8)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+              (__v4di)
+              _mm256_setzero_si256 (),
+              (__mmask8)
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_movdqa64store128_mask ((__v2di *) __P,
+          (__v2di) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_movdqa64store256_mask ((__v4di *) __P,
+          (__v4di) __A,
+          (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_movedup_pd(__A),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_movedup_pd(__A),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_movedup_pd(__A),
+                                              (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                              (__v4df)_mm256_movedup_pd(__A),
+                                              (__v4df)_mm256_setzero_pd());
+}
+
+
+#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \
+  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
+                                                  (__v4si)(__m128i)(O), \
+                                                  (__mmask8)(M)); })
+
+#define _mm_maskz_set1_epi32(M, A) __extension__ ({ \
+  (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
+                                                  (__v4si)_mm_setzero_si128(), \
+                                                  (__mmask8)(M)); })
+
+#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \
+  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
+                                                  (__v8si)(__m256i)(O), \
+                                                  (__mmask8)(M)); })
+
+#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \
+  (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
+                                                  (__v8si)_mm256_setzero_si256(), \
+                                                  (__mmask8)(M)); })
+
+#ifdef __x86_64__
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O,
+                 __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O,
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 __M);
+}
+#endif
+
+#define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2di)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2di)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+  (__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \
+                                              (__v2df)(__m128d)(B), \
+                                              (__v2di)(__m128i)(C), \
+                                              (int)(imm), (__mmask8)(U)); })
+
+#define _mm256_fixupimm_pd(A, B, C, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
+                                             (__v4df)(__m256d)(B), \
+                                             (__v4di)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
+                                             (__v4df)(__m256d)(B), \
+                                             (__v4di)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \
+                                              (__v4df)(__m256d)(B), \
+                                              (__v4di)(__m256i)(C), \
+                                              (int)(imm), (__mmask8)(U)); })
+
+#define _mm_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \
+                                             (__v4sf)(__m128)(B), \
+                                             (__v4si)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_fixupimm_ps(A, B, C, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
+                                            (__v8sf)(__m256)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
+                                            (__v8sf)(__m256)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \
+                                             (__v8sf)(__m256)(B), \
+                                             (__v8si)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+               (__v2df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+               (__v4df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+              (__v4sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+              (__v8sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P,
+                 (__v2di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P,
+                 (__v2di)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
+                 (__v4di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P,
+               (__v2df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P,
+               (__v2df)
+               _mm_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
+               (__v4df) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
+               (__v4df)
+               _mm256_setzero_pd (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P,
+              (__v4sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P,
+              (__v4sf)
+              _mm_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
+              (__v8sf) __W,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
+              (__v8sf)
+              _mm256_setzero_ps (),
+              (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storeapd128_mask ((__v2df *) __P,
+           (__v2df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+  __builtin_ia32_storeapd256_mask ((__v4df *) __P,
+           (__v4df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
+           (__v4sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+  __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
+           (__v8sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedqudi128_mask ((__v2di *) __P,
+             (__v2di) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_storedqudi256_mask ((__v4di *) __P,
+             (__v4di) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedqusi128_mask ((__v4si *) __P,
+             (__v4si) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_storedqusi256_mask ((__v8si *) __P,
+             (__v8si) __A,
+             (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storeupd128_mask ((__v2df *) __P,
+           (__v2df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+  __builtin_ia32_storeupd256_mask ((__v4df *) __P,
+           (__v4df) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storeups128_mask ((__v4sf *) __P,
+           (__v4sf) __A,
+           (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+  __builtin_ia32_storeups256_mask ((__v8sf *) __P,
+           (__v8sf) __A,
+           (__mmask8) __U);
+}
+
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpackhi_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpackhi_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpackhi_pd(__A, __B),
+                                           (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpackhi_pd(__A, __B),
+                                           (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpackhi_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpackhi_ps(__A, __B),
+                                           (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpackhi_ps(__A, __B),
+                                           (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpacklo_pd(__A, __B),
+                                              (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                              (__v2df)_mm_unpacklo_pd(__A, __B),
+                                              (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpacklo_pd(__A, __B),
+                                           (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                           (__v4df)_mm256_unpacklo_pd(__A, __B),
+                                           (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_unpacklo_ps(__A, __B),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpacklo_ps(__A, __B),
+                                           (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                           (__v8sf)_mm256_unpacklo_ps(__A, __B),
+                                           (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rcp14_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                (__v2df)
+                _mm_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_rcp14_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                (__v4df)
+                _mm256_setzero_pd (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp14_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+               (__v4sf)
+               _mm_setzero_ps (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_rcp14_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+               (__v8sf)
+               _mm256_setzero_ps (),
+               (__mmask8) __U);
+}
+
+#define _mm_mask_permute_pd(W, U, X, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_permute_pd((X), (C)), \
+                                       (__v2df)(__m128d)(W)); })
+
+#define _mm_maskz_permute_pd(U, X, C) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_permute_pd((X), (C)), \
+                                       (__v2df)_mm_setzero_pd()); })
+
+#define _mm256_mask_permute_pd(W, U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permute_pd((X), (C)), \
+                                       (__v4df)(__m256d)(W)); })
+
+#define _mm256_maskz_permute_pd(U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permute_pd((X), (C)), \
+                                       (__v4df)_mm256_setzero_pd()); })
+
+#define _mm_mask_permute_ps(W, U, X, C) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_permute_ps((X), (C)), \
+                                      (__v4sf)(__m128)(W)); })
+
+#define _mm_maskz_permute_ps(U, X, C) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_permute_ps((X), (C)), \
+                                      (__v4sf)_mm_setzero_ps()); })
+
+#define _mm256_mask_permute_ps(W, U, X, C) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_permute_ps((X), (C)), \
+                                      (__v8sf)(__m256)(W)); })
+
+#define _mm256_maskz_permute_ps(U, X, C) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_permute_ps((X), (C)), \
+                                      (__v8sf)_mm256_setzero_ps()); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                            (__v2df)_mm_permutevar_pd(__A, __C),
+                                            (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C)
+{
+  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+                                            (__v2df)_mm_permutevar_pd(__A, __C),
+                                            (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                         (__v4df)_mm256_permutevar_pd(__A, __C),
+                                         (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
+{
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+                                         (__v4df)_mm256_permutevar_pd(__A, __C),
+                                         (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                            (__v4sf)_mm_permutevar_ps(__A, __C),
+                                            (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                            (__v4sf)_mm_permutevar_ps(__A, __C),
+                                            (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                          (__v8sf)_mm256_permutevar_ps(__A, __C),
+                                          (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                          (__v8sf)_mm256_permutevar_ps(__A, __C),
+                                          (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+                 (__v4si) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+                 (__v4si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_test_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+                 (__v8si) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+                 (__v8si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_test_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+                 (__v2di) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+                 (__v2di) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_test_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+                 (__v4di) __B,
+                 (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+                 (__v4di) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+            (__v4si) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+            (__v4si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_testn_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+            (__v8si) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+            (__v8si) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_testn_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+            (__v2di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+            (__v2di) __B, __U);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_testn_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+            (__v4di) __B,
+            (__mmask8) -1);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+            (__v4di) __B, __U);
+}
+
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                           (__v4si)_mm_unpackhi_epi32(__A, __B),
+                                           (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                           (__v4si)_mm_unpackhi_epi32(__A, __B),
+                                           (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                        (__v8si)_mm256_unpackhi_epi32(__A, __B),
+                                        (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                        (__v8si)_mm256_unpackhi_epi32(__A, __B),
+                                        (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_unpackhi_epi64(__A, __B),
+                                           (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_unpackhi_epi64(__A, __B),
+                                           (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_unpackhi_epi64(__A, __B),
+                                        (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_unpackhi_epi64(__A, __B),
+                                        (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                           (__v4si)_mm_unpacklo_epi32(__A, __B),
+                                           (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                           (__v4si)_mm_unpacklo_epi32(__A, __B),
+                                           (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                        (__v8si)_mm256_unpacklo_epi32(__A, __B),
+                                        (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                        (__v8si)_mm256_unpacklo_epi32(__A, __B),
+                                        (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_unpacklo_epi64(__A, __B),
+                                           (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                           (__v2di)_mm_unpacklo_epi64(__A, __B),
+                                           (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_unpacklo_epi64(__A, __B),
+                                        (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                        (__v4di)_mm256_unpacklo_epi64(__A, __B),
+                                        (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sra_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_sra_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sra_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_sra_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srai_epi32(__A, __B),
+                                             (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_srai_epi32(__A, __B),
+                                             (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srai_epi32(__A, __B),
+                                             (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_srai_epi32(__A, __B),
+                                             (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
+                                             (__v2di)_mm_sra_epi64(__A, __B), \
+                                             (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
+                                             (__v2di)_mm_sra_epi64(__A, __B), \
+                                             (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_sra_epi64(__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+                                           (__v4di)_mm256_sra_epi64(__A, __B), \
+                                           (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+                                           (__v4di)_mm256_sra_epi64(__A, __B), \
+                                           (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi64(__m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
+                                           (__v2di)_mm_srai_epi64(__A, __imm), \
+                                           (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm)
+{
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
+                                           (__v2di)_mm_srai_epi64(__A, __imm), \
+                                           (__v2di)_mm_setzero_di());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_srai_epi64(__m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+                                        (__v4di)_mm256_srai_epi64(__A, __imm), \
+                                        (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm)
+{
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
+                                        (__v4di)_mm256_srai_epi64(__A, __imm), \
+                                        (__v4di)_mm256_setzero_si256());
+}
+
+#define _mm_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+                                            (__v4si)(__m128i)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+                                            (__v4si)(__m128i)(B), \
+                                            (__v4si)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \
+                                             (__v4si)(__m128i)(B), \
+                                             (__v4si)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+                                            (__v8si)(__m256i)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+                                            (__v8si)(__m256i)(B), \
+                                            (__v8si)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \
+                                             (__v8si)(__m256i)(B), \
+                                             (__v8si)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+                                            (__v2di)(__m128i)(B), \
+                                            (__v2di)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+                                            (__v2di)(__m128i)(B), \
+                                            (__v2di)(__m128i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \
+                                             (__v2di)(__m128i)(B), \
+                                             (__v2di)(__m128i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+                                            (__v4di)(__m256i)(B), \
+                                            (__v4di)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)-1); })
+
+#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+                                            (__v4di)(__m256i)(B), \
+                                            (__v4di)(__m256i)(C), (int)(imm), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \
+                                             (__v4di)(__m256i)(B), \
+                                             (__v4di)(__m256i)(C), (int)(imm), \
+                                             (__mmask8)(U)); })
+
+
+
+#define _mm256_shuffle_f32x4(A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
+                                             (__v8sf)(__m256)(B), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)-1); })
+
+#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
+                                             (__v8sf)(__m256)(B), (int)(imm), \
+                                             (__v8sf)(__m256)(W), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \
+                                             (__v8sf)(__m256)(B), (int)(imm), \
+                                             (__v8sf)_mm256_setzero_ps(), \
+                                             (__mmask8)(U)); })
+
+#define _mm256_shuffle_f64x2(A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
+                                              (__v4df)(__m256d)(B), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)-1); })
+
+#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
+                                              (__v4df)(__m256d)(B), \
+                                              (int)(imm), \
+                                              (__v4df)(__m256d)(W), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
+  (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \
+                                              (__v4df)(__m256d)(B), \
+                                              (int)(imm), \
+                                              (__v4df)_mm256_setzero_pd(), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_shuffle_i32x4(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
+                                              (__v8si)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v8si)_mm256_setzero_si256(), \
+                                              (__mmask8)-1); })
+
+#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
+                                              (__v8si)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v8si)(__m256i)(W), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \
+                                              (__v8si)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v8si)_mm256_setzero_si256(), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_shuffle_i64x2(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
+                                              (__v4di)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v4di)_mm256_setzero_si256(), \
+                                              (__mmask8)-1); })
+
+#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
+                                              (__v4di)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v4di)(__m256i)(W), \
+                                              (__mmask8)(U)); })
+
+#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \
+                                              (__v4di)(__m256i)(B), \
+                                              (int)(imm), \
+                                              (__v4di)_mm256_setzero_si256(), \
+                                              (__mmask8)(U)); })
+
+#define _mm_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_shuffle_pd((A), (B), (M)), \
+                                       (__v2df)(__m128d)(W)); })
+
+#define _mm_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+                                       (__v2df)_mm_shuffle_pd((A), (B), (M)), \
+                                       (__v2df)_mm_setzero_pd()); })
+
+#define _mm256_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
+                                       (__v4df)(__m256d)(W)); })
+
+#define _mm256_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
+                                       (__v4df)_mm256_setzero_pd()); })
+
+#define _mm_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
+                                      (__v4sf)(__m128)(W)); })
+
+#define _mm_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                      (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
+                                      (__v4sf)_mm_setzero_ps()); })
+
+#define _mm256_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
+                                      (__v8sf)(__m256)(W)); })
+
+#define _mm256_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                      (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
+                                      (__v8sf)_mm256_setzero_ps()); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_rsqrt14_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                 (__v2df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                 (__v2df)
+                 _mm_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_rsqrt14_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                 (__v4df)
+                 _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt14_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                (__v4sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                (__v4sf)
+                _mm_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_rsqrt14_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_f32x4 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                (__v8sf)_mm256_undefined_pd (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                (__v8sf) __O,
+                __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                (__v8sf) _mm256_setzero_ps (),
+                __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcast_i32x4 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
+                 (__v8si)_mm256_undefined_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
+                 (__v8si)
+                 __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
+                 __A,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m256d)__builtin_ia32_selectpd_256(__M,
+                                              (__v4df) _mm256_broadcastsd_pd(__A),
+                                              (__v4df) __O);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+  return (__m256d)__builtin_ia32_selectpd_256(__M,
+                                              (__v4df) _mm256_broadcastsd_pd(__A),
+                                              (__v4df) _mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128(__M,
+                                             (__v4sf) _mm_broadcastss_ps(__A),
+                                             (__v4sf) __O);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128(__M,
+                                             (__v4sf) _mm_broadcastss_ps(__A),
+                                             (__v4sf) _mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256(__M,
+                                             (__v8sf) _mm256_broadcastss_ps(__A),
+                                             (__v8sf) __O);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256(__M,
+                                             (__v8sf) _mm256_broadcastss_ps(__A),
+                                             (__v8sf) _mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__M,
+                                             (__v4si) _mm_broadcastd_epi32(__A),
+                                             (__v4si) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__M,
+                                             (__v4si) _mm_broadcastd_epi32(__A),
+                                             (__v4si) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__M,
+                                             (__v8si) _mm256_broadcastd_epi32(__A),
+                                             (__v8si) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectd_256(__M,
+                                             (__v8si) _mm256_broadcastd_epi32(__A),
+                                             (__v8si) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                             (__v2di) _mm_broadcastq_epi64(__A),
+                                             (__v2di) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                             (__v2di) _mm_broadcastq_epi64(__A),
+                                             (__v2di) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                             (__v4di) _mm256_broadcastq_epi64(__A),
+                                             (__v4di) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                             (__v4di) _mm256_broadcastq_epi64(__A),
+                                             (__v4di) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+               (__v8hi)_mm_setzero_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+               (__v8hi)__O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+               (__v8hi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+               (__v16qi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+               (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+               (__v16qi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+               (__v4si)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+               (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+               (__v4si) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+               (__v4si)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+               (__v4si)__O,
+               __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+               (__v4si) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+               (__v8hi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtsepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+               (__v8hi)_mm_undefined_si128(),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+               (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+               (__v8hi) _mm_setzero_si128 (),
+               __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                (__v8hi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                (__v8hi) _mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                (__v16qi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                (__v16qi) __O,
+                __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                (__v16qi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                (__v4si)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                (__v4si) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                (__v4si)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                (__v4si) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtusepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                (__v8hi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtusepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                (__v8hi)_mm_undefined_si128(),
+                (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                (__v8hi) _mm_setzero_si128 (),
+                __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  return __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+              (__v16qi)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+              (__v16qi)
+              _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+              (__v16qi)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+              (__v8hi)_mm_setzero_si128 (),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi32_storeu_epi16 (void *  __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+              (__v16qi) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+              (__v16qi) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+              (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+              (__v16qi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+              (__v4si)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+              (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+              (__v4si) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+              (__v4si) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+              (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+              (__v4si) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+              (__v8hi) _mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+              (__v8hi)__O,
+              __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+              (__v8hi)_mm_undefined_si128(),
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+              (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+              (__v8hi) _mm_setzero_si128 (),
+              __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
+#define _mm256_extractf32x4_ps(A, imm) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v8sf)(__m256)(A),           \
+                                  (__v8sf)_mm256_undefined_ps(), \
+                                  ((imm) & 1) ? 4 : 0,           \
+                                  ((imm) & 1) ? 5 : 1,           \
+                                  ((imm) & 1) ? 6 : 2,           \
+                                  ((imm) & 1) ? 7 : 3); })
+
+#define _mm256_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                   (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
+                                   (__v4sf)(W)); })
+
+#define _mm256_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \
+  (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+                                   (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
+                                   (__v4sf)_mm_setzero_ps()); })
+
+#define _mm256_extracti32x4_epi32(A, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8si)(__m256)(A),              \
+                                   (__v8si)_mm256_undefined_si256(), \
+                                   ((imm) & 1) ? 4 : 0,              \
+                                   ((imm) & 1) ? 5 : 1,              \
+                                   ((imm) & 1) ? 6 : 2,              \
+                                   ((imm) & 1) ? 7 : 3); })
+
+#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
+                                (__v4si)(W)); })
+
+#define _mm256_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
+                                (__v4si)_mm_setzero_si128()); })
+
+#define _mm256_insertf32x4(A, B, imm) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(A), \
+                                  (__v8sf)_mm256_castps128_ps256((__m128)(B)), \
+                                  ((imm) & 0x1) ?  0 :  8, \
+                                  ((imm) & 0x1) ?  1 :  9, \
+                                  ((imm) & 0x1) ?  2 : 10, \
+                                  ((imm) & 0x1) ?  3 : 11, \
+                                  ((imm) & 0x1) ?  8 :  4, \
+                                  ((imm) & 0x1) ?  9 :  5, \
+                                  ((imm) & 0x1) ? 10 :  6, \
+                                  ((imm) & 0x1) ? 11 :  7); })
+
+#define _mm256_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                  (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+                                  (__v8sf)(W)); })
+
+#define _mm256_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
+  (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+                                  (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+                                  (__v8sf)_mm256_setzero_ps()); })
+
+#define _mm256_inserti32x4(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(A), \
+                                 (__v8si)_mm256_castsi128_si256((__m128i)(B)), \
+                                 ((imm) & 0x1) ?  0 :  8, \
+                                 ((imm) & 0x1) ?  1 :  9, \
+                                 ((imm) & 0x1) ?  2 : 10, \
+                                 ((imm) & 0x1) ?  3 : 11, \
+                                 ((imm) & 0x1) ?  8 :  4, \
+                                 ((imm) & 0x1) ?  9 :  5, \
+                                 ((imm) & 0x1) ? 10 :  6, \
+                                 ((imm) & 0x1) ? 11 :  7); })
+
+#define _mm256_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                  (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+                                  (__v8si)(W)); })
+
+#define _mm256_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                  (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+                                  (__v8si)_mm256_setzero_si256()); })
+
+#define _mm_getmant_pd(A, B, C) __extension__({\
+  (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v2df)_mm_setzero_pd(), \
+                                            (__mmask8)-1); })
+
+#define _mm_mask_getmant_pd(W, U, A, B, C) __extension__({\
+  (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v2df)(__m128d)(W), \
+                                            (__mmask8)(U)); })
+
+#define _mm_maskz_getmant_pd(U, A, B, C) __extension__({\
+  (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v2df)_mm_setzero_pd(), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_getmant_pd(A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v4df)_mm256_setzero_pd(), \
+                                            (__mmask8)-1); })
+
+#define _mm256_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v4df)(__m256d)(W), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
+                                            (int)(((C)<<2) | (B)), \
+                                            (__v4df)_mm256_setzero_pd(), \
+                                            (__mmask8)(U)); })
+
+#define _mm_getmant_ps(A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v4sf)_mm_setzero_ps(), \
+                                           (__mmask8)-1); })
+
+#define _mm_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v4sf)(__m128)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+  (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v4sf)_mm_setzero_ps(), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_getmant_ps(A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)-1); })
+
+#define _mm256_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v8sf)(__m256)(W), \
+                                           (__mmask8)(U)); })
+
+#define _mm256_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
+  (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
+                                           (int)(((C)<<2) | (B)), \
+                                           (__v8sf)_mm256_setzero_ps(), \
+                                           (__mmask8)(U)); })
+
+#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \
+                                        (double const *)(addr), \
+                                        (__v2di)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \
+                                        (long long const *)(addr), \
+                                        (__v2di)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \
+                                        (double const *)(addr), \
+                                        (__v4di)(__m256i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \
+                                        (long long const *)(addr), \
+                                        (__v4di)(__m256i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v2di)(__m128i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v2di)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v4di)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v4di)(__m256i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \
+                                        (double const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \
+                                        (long long const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \
+                                        (double const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \
+                                        (long long const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v4si)(__m128i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v4si)(__m128i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \
+                                       (float const *)(addr), \
+                                       (__v8si)(__m256i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
+  (__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \
+                                        (int const *)(addr), \
+                                        (__v8si)(__m256i)(index), \
+                                        (__mmask8)(mask), (int)(scale)); })
+
+#define _mm256_permutex_pd(X, C) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(X), \
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+
+#define _mm256_mask_permutex_pd(W, U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permutex_pd((X), (C)), \
+                                       (__v4df)(__m256d)(W)); })
+
+#define _mm256_maskz_permutex_pd(U, X, C) __extension__ ({ \
+  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+                                       (__v4df)_mm256_permutex_pd((X), (C)), \
+                                       (__v4df)_mm256_setzero_pd()); })
+
+#define _mm256_permutex_epi64(X, C) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(X), \
+                                   (__v4di)_mm256_undefined_si256(), \
+                                   ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                   ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+
+#define _mm256_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                      (__v4di)_mm256_permutex_epi64((X), (C)), \
+                                      (__v4di)(__m256i)(W)); })
+
+#define _mm256_maskz_permutex_epi64(U, X, C) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                      (__v4di)_mm256_permutex_epi64((X), (C)), \
+                                      (__v4di)_mm256_setzero_si256()); })
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                 (__v4di) __X,
+                 (__v4df) _mm256_undefined_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
+          __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                 (__v4di) __X,
+                 (__v4df) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                 (__v4di) __X,
+                 (__v4df) _mm256_setzero_pd (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                 (__v4di) __X,
+                 (__v4di) _mm256_setzero_si256 (),
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                 (__v4di) __X,
+                 (__v4di) _mm256_undefined_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
+             __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                 (__v4di) __X,
+                 (__v4di) __W,
+                 __M);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
+          __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                (__v8si) __X,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                (__v8si) __X,
+                (__v8sf) _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_permutexvar_ps (__m256i __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                (__v8si) __X,
+                (__v8sf) _mm256_undefined_si256 (),
+                (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                 (__v8si) __X,
+                 (__v8si) _mm256_setzero_si256 (),
+                 __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+             __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                 (__v8si) __X,
+                 (__v8si) __W,
+                 (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                 (__v8si) __X,
+                 (__v8si) _mm256_undefined_si256(),
+                 (__mmask8) -1);
+}
+
+#define _mm_alignr_epi32(A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(B), \
+                                   (__v4si)(__m128i)(A), \
+                                   ((int)(imm) & 0x3) + 0, \
+                                   ((int)(imm) & 0x3) + 1, \
+                                   ((int)(imm) & 0x3) + 2, \
+                                   ((int)(imm) & 0x3) + 3); })
+
+#define _mm_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                    (__v4si)(__m128i)(W)); })
+
+#define _mm_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                    (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+                                    (__v4si)_mm_setzero_si128()); })
+
+#define _mm256_alignr_epi32(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(B), \
+                                   (__v8si)(__m256i)(A), \
+                                   ((int)(imm) & 0x7) + 0, \
+                                   ((int)(imm) & 0x7) + 1, \
+                                   ((int)(imm) & 0x7) + 2, \
+                                   ((int)(imm) & 0x7) + 3, \
+                                   ((int)(imm) & 0x7) + 4, \
+                                   ((int)(imm) & 0x7) + 5, \
+                                   ((int)(imm) & 0x7) + 6, \
+                                   ((int)(imm) & 0x7) + 7); })
+
+#define _mm256_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+                                 (__v8si)(__m256i)(W)); })
+
+#define _mm256_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                 (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+                                 (__v8si)_mm256_setzero_si256()); })
+
+#define _mm_alignr_epi64(A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v2di)(__m128i)(B), \
+                                   (__v2di)(__m128i)(A), \
+                                   ((int)(imm) & 0x1) + 0, \
+                                   ((int)(imm) & 0x1) + 1); })
+
+#define _mm_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+                                    (__v2di)(__m128i)(W)); })
+
+#define _mm_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                    (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+                                    (__v2di)_mm_setzero_di()); })
+
+#define _mm256_alignr_epi64(A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(B), \
+                                   (__v4di)(__m256i)(A), \
+                                   ((int)(imm) & 0x3) + 0, \
+                                   ((int)(imm) & 0x3) + 1, \
+                                   ((int)(imm) & 0x3) + 2, \
+                                   ((int)(imm) & 0x3) + 3); })
+
+#define _mm256_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+                                 (__v4di)(__m256i)(W)); })
+
+#define _mm256_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                 (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+                                 (__v4di)_mm256_setzero_si256()); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_movehdup_ps(__A),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_movehdup_ps(__A),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_movehdup_ps(__A),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_movehdup_ps(__A),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_moveldup_ps(__A),
+                                             (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+                                             (__v4sf)_mm_moveldup_ps(__A),
+                                             (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_moveldup_ps(__A),
+                                             (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+                                             (__v8sf)_mm256_moveldup_ps(__A),
+                                             (__v8sf)_mm256_setzero_ps());
+}
+
+#define _mm256_mask_shuffle_epi32(W, U, A, I) __extension__({\
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                      (__v8si)_mm256_shuffle_epi32((A), (I)), \
+                                      (__v8si)(__m256i)(W)); })
+
+#define _mm256_maskz_shuffle_epi32(U, A, I) __extension__({\
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                      (__v8si)_mm256_shuffle_epi32((A), (I)), \
+                                      (__v8si)_mm256_setzero_si256()); })
+
+#define _mm_mask_shuffle_epi32(W, U, A, I) __extension__({\
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shuffle_epi32((A), (I)), \
+                                      (__v4si)(__m128i)(W)); })
+
+#define _mm_maskz_shuffle_epi32(U, A, I) __extension__({\
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shuffle_epi32((A), (I)), \
+                                      (__v4si)_mm_setzero_si128()); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
+              (__v2df) __A,
+              (__v2df) __W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maskz_mov_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
+              (__v2df) __A,
+              (__v2df) _mm_setzero_pd ());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
+              (__v4df) __A,
+              (__v4df) __W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
+              (__v4df) __A,
+              (__v4df) _mm256_setzero_pd ());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
+             (__v4sf) __A,
+             (__v4sf) __W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_mov_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
+             (__v4sf) __A,
+             (__v4sf) _mm_setzero_ps ());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
+             (__v8sf) __A,
+             (__v8sf) __W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
+             (__v8sf) __A,
+             (__v8sf) _mm256_setzero_ps ());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+             (__v4sf) __W,
+             (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+             (__v4sf)
+             _mm_setzero_ps (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+                (__v8sf) __W,
+                (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+                (__v8sf)
+                _mm256_setzero_ps (),
+                (__mmask8) __U);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION,
+                                                  (__v8hi) __W,
+                                                  (__mmask8) __U);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION,
+                                                  (__v8hi) _mm_setzero_si128 (),
+                                                  (__mmask8) __U);
+}
+
+#define _mm_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
+                                         (__v8hi)(__m128i)(W), \
+                                         (__mmask8)(U)); })
+
+#define _mm_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
+                                         (__v8hi)_mm_setzero_si128(), \
+                                         (__mmask8)(U)); })
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION,
+                                                      (__v8hi) __W,
+                                                      (__mmask8) __U);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION,
+                                                      (__v8hi) _mm_setzero_si128(),
+                                                      (__mmask8) __U);
+}
+#define _mm256_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
+                                            (__v8hi)(__m128i)(W), \
+                                            (__mmask8)(U)); })
+
+#define _mm256_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
+                                            (__v8hi)_mm_setzero_si128(), \
+                                            (__mmask8)(U)); })
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avxintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avxintrin.h
new file mode 100644
index 000000000..be03ba346
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/avxintrin.h
@@ -0,0 +1,4894 @@
+/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXINTRIN_H
+#define __AVXINTRIN_H
+
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+
+/* Unsigned types */
+typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
+typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
+typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
+typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v32qs __attribute__((__vector_size__(32)));
+
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+typedef double __m256d __attribute__((__vector_size__(32)));
+typedef long long __m256i __attribute__((__vector_size__(32)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
+
+/* Arithmetic */
+/// \brief Adds two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the sums of both
+///    operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_add_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4df)__a+(__v4df)__b);
+}
+
+/// \brief Adds two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the sums of both
+///    operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_add_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8sf)__a+(__v8sf)__b);
+}
+
+/// \brief Subtracts two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the minuend.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the differences between
+///    both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4df)__a-(__v4df)__b);
+}
+
+/// \brief Subtracts two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the minuend.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the differences between
+///    both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8sf)__a-(__v8sf)__b);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the left source operand.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the right source operand.
+/// \returns A 256-bit vector of [4 x double] containing the alternating sums
+///    and differences between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_addsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the left source operand.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the right source operand.
+/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
+///    differences between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_addsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Divides two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the dividend.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the divisor.
+/// \returns A 256-bit vector of [4 x double] containing the quotients of both
+///    operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_div_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4df)__a/(__v4df)__b);
+}
+
+/// \brief Divides two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the dividend.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the divisor.
+/// \returns A 256-bit vector of [8 x float] containing the quotients of both
+///    operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_div_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8sf)__a/(__v8sf)__b);
+}
+
+/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the maximum values
+///    between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_max_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the maximum values
+///    between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_max_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the minimum values
+///    between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_min_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the minimum values
+///    between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_min_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Multiplies two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the products of both
+///    operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_mul_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4df)__a * (__v4df)__b);
+}
+
+/// \brief Multiplies two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the products of both
+///    operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_mul_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8sf)__a * (__v8sf)__b);
+}
+
+/// \brief Calculates the square roots of the values in a 256-bit vector of
+///    [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the square roots of the
+///    values in the operand.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sqrt_pd(__m256d __a)
+{
+  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+}
+
+/// \brief Calculates the square roots of the values in a 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the square roots of the
+///    values in the operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+}
+
+/// \brief Calculates the reciprocal square roots of the values in a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
+///    roots of the values in the operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rsqrt_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
+}
+
+/// \brief Calculates the reciprocals of the values in a 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
+///    values in the operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rcp_ps(__m256 __a)
+{
+  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
+}
+
+/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
+///    by the byte operand. The source values are rounded to integer values and
+///    returned as 64-bit double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_round_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used. \n
+///      1: The PE field is not updated. \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M. \n
+///      1: Use the current MXCSR setting. \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest. \n
+///      01: Downward (toward negative infinity). \n
+///      10: Upward (toward positive infinity). \n
+///      11: Truncated.
+/// \returns A 256-bit vector of [4 x double] containing the rounded values.
+#define _mm256_round_pd(V, M) __extension__ ({ \
+    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
+
+/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
+///    specified by the byte operand. The source values are rounded to integer
+///    values and returned as floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_round_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used. \n
+///      1: The PE field is not updated. \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M. \n
+///      1: Use the current MXCSR setting. \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest. \n
+///      01: Downward (toward negative infinity). \n
+///      10: Upward (toward positive infinity). \n
+///      11: Truncated.
+/// \returns A 256-bit vector of [8 x float] containing the rounded values.
+#define _mm256_round_ps(V, M) __extension__ ({ \
+  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
+
+/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
+///    source values are rounded up to integer values and returned as 64-bit
+///    double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_ceil_pd(__m256d V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
+#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
+
+/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
+///    The source values are rounded down to integer values and returned as
+///    64-bit double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_floor_pd(__m256d V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the rounded down
+///    values.
+#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
+
+/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
+///    source values are rounded up to integer values and returned as
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_ceil_ps(__m256 V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
+#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
+
+/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
+///    source values are rounded down to integer values and returned as
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_floor_ps(__m256 V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
+#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
+
+/* Logical */
+/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+///    values between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_and_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4du)__a & (__v4du)__b);
+}
+
+/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+///    values between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_and_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8su)__a & (__v8su)__b);
+}
+
+/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
+///    the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the left source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the right source operand.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+///    values of the second operand and the one's complement of the first
+///    operand.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_andnot_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)(~(__v4du)__a & (__v4du)__b);
+}
+
+/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
+///    the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the left source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the right source operand.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+///    values of the second operand and the one's complement of the first
+///    operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_andnot_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)(~(__v8su)__a & (__v8su)__b);
+}
+
+/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VORPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
+///    values between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_or_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4du)__a | (__v4du)__b);
+}
+
+/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VORPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
+///    values between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_or_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8su)__a | (__v8su)__b);
+}
+
+/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
+///    values between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_xor_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)((__v4du)__a ^ (__v4du)__b);
+}
+
+/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
+///    values between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_xor_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)((__v8su)__a ^ (__v8su)__b);
+}
+
+/* Horizontal arithmetic */
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal sums of the values are returned in the even-indexed
+///    elements of a vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal sums of the values are returned in the odd-indexed
+///    elements of a vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
+///    both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hadd_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal sums of the values are returned in the elements with
+///    index 0, 1, 4, 5 of a vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal sums of the values are returned in the elements with
+///    index 2, 3, 6, 7 of a vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
+///    both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hadd_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    even-indexed elements of a vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    odd-indexed elements of a vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the horizontal
+///    differences of both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hsub_pd(__m256d __a, __m256d __b)
+{
+  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+///    The horizontal differences between the values are returned in the
+///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the horizontal
+///    differences of both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hsub_ps(__m256 __a, __m256 __b)
+{
+  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/* Vector permutations */
+/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
+///    by the 128-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __c
+///    A 128-bit integer vector operand specifying how the values are to be
+///    copied. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [65]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm_permutevar_pd(__m128d __a, __m128i __c)
+{
+  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
+}
+
+/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
+///    by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __c
+///    A 256-bit integer vector operand specifying how the values are to be
+///    copied. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [65]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///    Bit [129]: \n
+///      0: Bits [191:128] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///    Bit [193]: \n
+///      0: Bits [191:128] of the source are copied to bits [255:192] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [255:192] of the
+///    returned vector.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_permutevar_pd(__m256d __a, __m256i __c)
+{
+  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
+}
+
+/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
+///    specified by the 128-bit integer vector operand.
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __c
+///    A 128-bit integer vector operand specifying how the values are to be
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [33:32]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [65:64]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [97:96]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_permutevar_ps(__m128 __a, __m128i __c)
+{
+  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
+}
+
+/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
+///    specified by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __c
+///    A 256-bit integer vector operand specifying how the values are to be
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [33:32]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [65:64]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [97:96]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///    Bits [129:128]: \n
+///      00: Bits [159:128] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///    Bits [161:160]: \n
+///      00: Bits [159:128] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///    Bits [193:192]: \n
+///      00: Bits [159:128] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///    Bits [225:224]: \n
+///      00: Bits [159:128] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [255:224] of the
+///          returned vector.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar_ps(__m256 __a, __m256i __c)
+{
+  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
+}
+
+/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
+///    by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_permute_pd(__m128d A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param A
+///    A 128-bit vector of [2 x double].
+/// \param C
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bit [0]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
+#define _mm_permute_pd(A, C) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
+                                   (__v2df)_mm_undefined_pd(), \
+                                   ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); })
+
+/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
+///    the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_permute_pd(__m256d A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param A
+///    A 256-bit vector of [4 x double].
+/// \param C
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bit [0]: \n
+///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+///         vector. \n
+///      1: Bits [127:64] of the source are copied to bits [63:0] of the
+///         returned vector. \n
+///    Bit [1]: \n
+///      0: Bits [63:0] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///      1: Bits [127:64] of the source are copied to bits [127:64] of the
+///         returned vector. \n
+///    Bit [2]: \n
+///      0: Bits [191:128] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [191:128] of the
+///         returned vector. \n
+///    Bit [3]: \n
+///      0: Bits [191:128] of the source are copied to bits [255:192] of the
+///         returned vector. \n
+///      1: Bits [255:192] of the source are copied to bits [255:192] of the
+///         returned vector.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+#define _mm256_permute_pd(A, C) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
+                                   (__v4df)_mm256_undefined_pd(), \
+                                   0 + (((C) >> 0) & 0x1), \
+                                   0 + (((C) >> 1) & 0x1), \
+                                   2 + (((C) >> 2) & 0x1), \
+                                   2 + (((C) >> 3) & 0x1)); })
+
+/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
+///    the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_permute_ps(__m128 A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param A
+///    A 128-bit vector of [4 x float].
+/// \param C
+///    An immediate integer operand specifying how the values are to be
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [3:2]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [5:4]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [7:6]: \n
+///      00: Bits [31:0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
+#define _mm_permute_ps(A, C) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
+                                  (__v4sf)_mm_undefined_ps(), \
+                                  ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \
+                                  ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); })
+
+/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
+///    the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_permute_ps(__m256 A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param A
+///    A 256-bit vector of [8 x float].
+/// \param C
+///    An immediate integer operand specifying how the values are to be \n
+///    copied. \n
+///    Bits [1:0]: \n
+///      00: Bits [31:0] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [31:0] of the
+///          returned vector. \n
+///    Bits [3:2]: \n
+///      00: Bits [31:0] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [63:32] of the
+///          returned vector. \n
+///    Bits [5:4]: \n
+///      00: Bits [31:0] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [95:64] of the
+///          returned vector. \n
+///    Bits [7:6]: \n
+///      00: Bits [31:qq0] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      01: Bits [63:32] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      10: Bits [95:64] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///      11: Bits [127:96] of the source are copied to bits [127:96] of the
+///          returned vector. \n
+///    Bits [1:0]: \n
+///      00: Bits [159:128] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [159:128] of the
+///          returned vector. \n
+///    Bits [3:2]: \n
+///      00: Bits [159:128] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [191:160] of the
+///          returned vector. \n
+///    Bits [5:4]: \n
+///      00: Bits [159:128] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [223:192] of the
+///          returned vector. \n
+///    Bits [7:6]: \n
+///      00: Bits [159:128] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      01: Bits [191:160] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      10: Bits [223:192] of the source are copied to bits [255:224] of the
+///          returned vector. \n
+///      11: Bits [255:224] of the source are copied to bits [255:224] of the
+///          returned vector.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+#define _mm256_permute_ps(A, C) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
+                                  (__v8sf)_mm256_undefined_ps(), \
+                                  0 + (((C) >> 0) & 0x3), \
+                                  0 + (((C) >> 2) & 0x3), \
+                                  0 + (((C) >> 4) & 0x3), \
+                                  0 + (((C) >> 6) & 0x3), \
+                                  4 + (((C) >> 0) & 0x3), \
+                                  4 + (((C) >> 2) & 0x3), \
+                                  4 + (((C) >> 4) & 0x3), \
+                                  4 + (((C) >> 6) & 0x3)); })
+
+/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
+///    [4 x double], as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double].
+/// \param V2
+///    A 256-bit vector of [4 x double.
+/// \param M
+///    An immediate integer operand specifying how the values are to be
+///    permuted. \n
+///    Bits [1:0]: \n
+///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+///          destination. \n
+///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+///          destination. \n
+///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+///          destination. \n
+///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+///          destination. \n
+///    Bits [5:4]: \n
+///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+///          destination. \n
+///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+///          destination. \n
+///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+///          destination. \n
+///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+///          destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+                                           (__v4df)(__m256d)(V2), (M)); })
+
+/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
+///    [8 x float], as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float].
+/// \param V2
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An immediate integer operand specifying how the values are to be
+///    permuted. \n
+///    Bits [1:0]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    Bits [5:4]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+///    destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+                                          (__v8sf)(__m256)(V2), (M)); })
+
+/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
+///    as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit integer vector.
+/// \param V2
+///    A 256-bit integer vector.
+/// \param M
+///    An immediate integer operand specifying how the values are to be copied.
+///    Bits [1:0]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+///    destination. \n
+///    Bits [5:4]: \n
+///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+///    destination. \n
+///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+///    destination. \n
+///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+///    destination.
+/// \returns A 256-bit integer vector containing the copied values.
+#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+                                           (__v8si)(__m256i)(V2), (M)); })
+
+/* Vector Blend */
+/// \brief Merges 64-bit double-precision data values stored in either of the
+///    two 256-bit vectors of [4 x double], as specified by the immediate
+///    integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double].
+/// \param V2
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An immediate integer operand, with mask bits [3:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
+///    element in operand \a V1 is copied to the same position in the
+///    destination. When a mask bit is 1, the corresponding 64-bit element in
+///    operand \a V2 is copied to the same position in the destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
+                                   (__v4df)(__m256d)(V2), \
+                                   (((M) & 0x01) ? 4 : 0), \
+                                   (((M) & 0x02) ? 5 : 1), \
+                                   (((M) & 0x04) ? 6 : 2), \
+                                   (((M) & 0x08) ? 7 : 3)); })
+
+/// \brief Merges 32-bit single-precision data values stored in either of the
+///    two 256-bit vectors of [8 x float], as specified by the immediate
+///    integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float].
+/// \param V2
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An immediate integer operand, with mask bits [7:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
+///    element in operand \a V1 is copied to the same position in the
+///    destination. When a mask bit is 1, the corresponding 32-bit element in
+///    operand \a V2 is copied to the same position in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
+                                  (__v8sf)(__m256)(V2), \
+                                  (((M) & 0x01) ?  8 : 0), \
+                                  (((M) & 0x02) ?  9 : 1), \
+                                  (((M) & 0x04) ? 10 : 2), \
+                                  (((M) & 0x08) ? 11 : 3), \
+                                  (((M) & 0x10) ? 12 : 4), \
+                                  (((M) & 0x20) ? 13 : 5), \
+                                  (((M) & 0x40) ? 14 : 6), \
+                                  (((M) & 0x80) ? 15 : 7)); })
+
+/// \brief Merges 64-bit double-precision data values stored in either of the
+///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \param __c
+///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
+///    how the values are to be copied. The position of the mask bit corresponds
+///    to the most significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 64-bit element in operand \a __a is copied to the same
+///    position in the destination. When a mask bit is 1, the corresponding
+///    64-bit element in operand \a __b is copied to the same position in the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
+{
+  return (__m256d)__builtin_ia32_blendvpd256(
+    (__v4df)__a, (__v4df)__b, (__v4df)__c);
+}
+
+/// \brief Merges 32-bit single-precision data values stored in either of the
+///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \param __c
+///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
+///    and 31 specifying how the values are to be copied. The position of the
+///    mask bit corresponds to the most significant bit of a copied value. When
+///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
+///    copied to the same position in the destination. When a mask bit is 1, the
+///    corresponding 32-bit element in operand \a __b is copied to the same
+///    position in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+{
+  return (__m256)__builtin_ia32_blendvps256(
+    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
+}
+
+/* Vector Dot Product */
+/// \brief Computes two dot products in parallel, using the lower and upper
+///    halves of two [8 x float] vectors as input to the two computations, and
+///    returning the two dot products in the lower and upper halves of the
+///    [8 x float] result. The immediate integer operand controls which input
+///    elements will contribute to the dot product, and where the final results
+///    are returned. In general, for each dot product, the four corresponding
+///    elements of the input vectors are multiplied; the first two and second
+///    two products are summed, then the two sums are added to form the final
+///    result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
+///
+/// \param V1
+///    A vector of [8 x float] values, treated as two [4 x float] vectors.
+/// \param V2
+///    A vector of [8 x float] values, treated as two [4 x float] vectors.
+/// \param M
+///    An immediate integer argument. Bits [7:4] determine which elements of
+///    the input vectors are used, with bit [4] corresponding to the lowest
+///    element and bit [7] corresponding to the highest element of each [4 x
+///    float] subvector. If a bit is set, the corresponding elements from the
+///    two input vectors are used as an input for dot product; otherwise that
+///    input is treated as zero. Bits [3:0] determine which elements of the
+///    result will receive a copy of the final dot product, with bit [0]
+///    corresponding to the lowest element and bit [3] corresponding to the
+///    highest element of each [4 x float] subvector. If a bit is set, the dot
+///    product is returned in the corresponding element; otherwise that element
+///    is set to zero. The bitmask is applied in the same way to each of the
+///    two parallel dot product computations.
+/// \returns A 256-bit vector of [8 x float] containing the two dot products.
+#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+                                 (__v8sf)(__m256)(V2), (M)); })
+
+/* Vector shuffle */
+/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
+///    specified by the immediate value operand. The four selected elements in
+///    each operand are copied to the destination according to the bits
+///    specified in the immediate operand. The selected elements from the first
+///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
+///    destination, and the selected elements from the second 256-bit operand
+///    are copied to bits [127:64] and bits [255:192] of the destination. For
+///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
+///    the 256-bit destination vector would contain the following values: b[7],
+///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float]. The four selected elements in this
+///    operand are copied to bits [63:0] and bits [191:128] in the destination,
+///    according to the bits specified in the immediate operand.
+/// \param b
+///    A 256-bit vector of [8 x float]. The four selected elements in this
+///    operand are copied to bits [127:64] and bits [255:192] in the
+///    destination, according to the bits specified in the immediate operand.
+/// \param mask
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from \a a and \a b \n.
+///    Bits [3:0] specify the values copied from operand \a a. \n
+///    Bits [7:4] specify the values copied from operand \a b. \n
+///    The destinations within the 256-bit destination are assigned values as
+///    follows, according to the bit value assignments described below: \n
+///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
+///    destination. \n
+///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
+///    destination. \n
+///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
+///    destination. \n
+///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
+///    the destination. \n
+///    Bit value assignments: \n
+///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
+///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
+///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
+///    11: Bits [127:96] and [255:224] are copied from the selected operand.
+/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
+#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
+  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), \
+                                  0  + (((mask) >> 0) & 0x3), \
+                                  0  + (((mask) >> 2) & 0x3), \
+                                  8  + (((mask) >> 4) & 0x3), \
+                                  8  + (((mask) >> 6) & 0x3), \
+                                  4  + (((mask) >> 0) & 0x3), \
+                                  4  + (((mask) >> 2) & 0x3), \
+                                  12 + (((mask) >> 4) & 0x3), \
+                                  12 + (((mask) >> 6) & 0x3)); })
+
+/// \brief Selects four double-precision values from the 256-bit operands of
+///    [4 x double], as specified by the immediate value operand. The selected
+///    elements from the first 256-bit operand are copied to bits [63:0] and
+///    bits [191:128] in the destination, and the selected elements from the
+///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
+///    the destination. For example, if bits [3:0] of the immediate operand
+///    contain a value of 0xF, the 256-bit destination vector would contain the
+///    following values: b[3], a[3], b[1], a[1].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double].
+/// \param b
+///    A 256-bit vector of [4 x double].
+/// \param mask
+///    An immediate value containing 8-bit values specifying which elements to
+///    copy from \a a and \a b: \n
+///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
+///    destination. \n
+///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
+///    destination. \n
+///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
+///    destination. \n
+///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
+///    destination. \n
+///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
+///    destination. \n
+///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
+///    destination. \n
+///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
+///    destination. \n
+///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
+#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
+  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), \
+                                   0 + (((mask) >> 0) & 0x1), \
+                                   4 + (((mask) >> 1) & 0x1), \
+                                   2 + (((mask) >> 2) & 0x1), \
+                                   6 + (((mask) >> 3) & 0x1)); })
+
+/* Compare */
+#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
+#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
+#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
+#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
+#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
+#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
+#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
+#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
+#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
+#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
+#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
+#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
+#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
+#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
+#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
+#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
+#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
+#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
+#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
+#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
+#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
+#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
+#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
+#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
+#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
+#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
+#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
+
+/// \brief Compares each of the corresponding double-precision values of two
+///    128-bit vectors of [2 x double], using the operation specified by the
+///    immediate integer operand. Returns a [2 x double] vector consisting of
+///    two doubles corresponding to the two comparison results: zero if the
+///    comparison is false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
+#define _mm_cmp_pd(a, b, c) __extension__ ({ \
+  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
+
+/// \brief Compares each of the corresponding values of two 128-bit vectors of
+///    [4 x float], using the operation specified by the immediate integer
+///    operand. Returns a [4 x float] vector consisting of four floats
+///    corresponding to the four comparison results: zero if the comparison is
+///    false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ps(a, b, c) __extension__ ({ \
+  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
+
+/// \brief Compares each of the corresponding double-precision values of two
+///    256-bit vectors of [4 x double], using the operation specified by the
+///    immediate integer operand. Returns a [4 x double] vector consisting of
+///    four doubles corresponding to the four comparison results: zero if the
+///    comparison is false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double].
+/// \param b
+///    A 256-bit vector of [4 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 256-bit vector of [4 x double] containing the comparison results.
+#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
+  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+                                   (__v4df)(__m256d)(b), (c)); })
+
+/// \brief Compares each of the corresponding values of two 256-bit vectors of
+///    [8 x float], using the operation specified by the immediate integer
+///    operand. Returns a [8 x float] vector consisting of eight floats
+///    corresponding to the eight comparison results: zero if the comparison is
+///    false, and all 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float].
+/// \param b
+///    A 256-bit vector of [8 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 256-bit vector of [8 x float] containing the comparison results.
+#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
+  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+                                  (__v8sf)(__m256)(b), (c)); })
+
+/// \brief Compares each of the corresponding scalar double-precision values of
+///    two 128-bit vectors of [2 x double], using the operation specified by the
+///    immediate integer operand. If the result is true, all 64 bits of the
+///    destination vector are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
+#define _mm_cmp_sd(a, b, c) __extension__ ({ \
+  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                (__v2df)(__m128d)(b), (c)); })
+
+/// \brief Compares each of the corresponding scalar values of two 128-bit
+///    vectors of [4 x float], using the operation specified by the immediate
+///    integer operand. If the result is true, all 32 bits of the destination
+///    vector are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying which comparison
+///    operation to use: \n
+///    00h, 08h, 10h, 18h: Equal \n
+///    01h, 09h, 11h, 19h: Less than \n
+///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
+///                        (swapped operands) \n
+///    03h, 0Bh, 13h, 1Bh: Unordered \n
+///    04h, 0Ch, 14h, 1Ch: Not equal \n
+///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
+///                        (swapped operands) \n
+///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
+///                        (swapped operands) \n
+///    07h, 0Fh, 17h, 1Fh: Ordered
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ss(a, b, c) __extension__ ({ \
+  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                               (__v4sf)(__m128)(b), (c)); })
+
+/// \brief Takes a [8 x i32] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x i32].
+/// \param __imm
+///    An immediate integer operand with bits [2:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 32 bits of extended
+///    packed data.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi32(__m256i __a, const int __imm)
+{
+  __v8si __b = (__v8si)__a;
+  return __b[__imm & 7];
+}
+
+/// \brief Takes a [16 x i16] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 256-bit integer vector of [16 x i16].
+/// \param __imm
+///    An immediate integer operand with bits [3:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
+///    packed data.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi16(__m256i __a, const int __imm)
+{
+  __v16hi __b = (__v16hi)__a;
+  return (unsigned short)__b[__imm & 15];
+}
+
+/// \brief Takes a [32 x i8] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 256-bit integer vector of [32 x i8].
+/// \param __imm
+///    An immediate integer operand with bits [4:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
+///    packed data.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_extract_epi8(__m256i __a, const int __imm)
+{
+  __v32qi __b = (__v32qi)__a;
+  return (unsigned char)__b[__imm & 31];
+}
+
+#ifdef __x86_64__
+/// \brief Takes a [4 x i64] vector and returns the vector element value
+///    indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 256-bit integer vector of [4 x i64].
+/// \param __imm
+///    An immediate integer operand with bits [1:0] determining which vector
+///    element is extracted and returned.
+/// \returns A 64-bit integer containing the extracted 64 bits of extended
+///    packed data.
+static __inline long long  __DEFAULT_FN_ATTRS
+_mm256_extract_epi64(__m256i __a, const int __imm)
+{
+  __v4di __b = (__v4di)__a;
+  return __b[__imm & 3];
+}
+#endif
+
+/// \brief Takes a [8 x i32] vector and replaces the vector element value
+///    indexed by the immediate constant operand by a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A vector of [8 x i32] to be used by the insert operation.
+/// \param __b
+///    An integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
+{
+  __v8si __c = (__v8si)__a;
+  __c[__imm & 7] = __b;
+  return (__m256i)__c;
+}
+
+
+/// \brief Takes a [16 x i16] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A vector of [16 x i16] to be used by the insert operation.
+/// \param __b
+///    An i16 integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
+{
+  __v16hi __c = (__v16hi)__a;
+  __c[__imm & 15] = __b;
+  return (__m256i)__c;
+}
+
+/// \brief Takes a [32 x i8] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A vector of [32 x i8] to be used by the insert operation.
+/// \param __b
+///    An i8 integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///    \a __imm with \a __b.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
+{
+  __v32qi __c = (__v32qi)__a;
+  __c[__imm & 31] = __b;
+  return (__m256i)__c;
+}
+
+#ifdef __x86_64__
+/// \brief Takes a [4 x i64] vector and replaces the vector element value
+///    indexed by the immediate constant operand with a new value. Returns the
+///    modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A vector of [4 x i64] to be used by the insert operation.
+/// \param __b
+///    A 64-bit integer value. The replacement value for the insert operation.
+/// \param __imm
+///    An immediate integer specifying the index of the vector element to be
+///    replaced.
+/// \returns A copy of vector \a __a, after replacing its element indexed by
+///     \a __imm with \a __b.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
+{
+  __v4di __c = (__v4di)__a;
+  __c[__imm & 3] = __b;
+  return (__m256i)__c;
+}
+#endif
+
+/* Conversion */
+/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_pd(__m128i __a)
+{
+  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
+}
+
+/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit vector of [8 x float] containing the converted values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_ps(__m256i __a)
+{
+  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
+}
+
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtpd_ps(__m256d __a)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
+}
+
+/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
+}
+
+/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
+///    x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtps_pd(__m128 __a)
+{
+  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
+}
+
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+///    x i32], truncating the result by rounding towards zero when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
+}
+
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+///    x i32]. When a conversion is inexact, the value returned is rounded
+///    according to the rounding control bits in the MXCSR register.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi32(__m256d __a)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
+}
+
+/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
+///    truncating the result by rounding towards zero when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi32(__m256 __a)
+{
+  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
+}
+
+static __inline double __DEFAULT_FN_ATTRS
+_mm256_cvtsd_f64(__m256d __a)
+{
+ return __a[0];
+}
+
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_cvtsi256_si32(__m256i __a)
+{
+ __v8si __b = (__v8si)__a;
+ return __b[0];
+}
+
+static __inline float __DEFAULT_FN_ATTRS
+_mm256_cvtss_f32(__m256 __a)
+{
+ return __a[0];
+}
+
+/* Vector replicate */
+/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
+///    vector of [8 x float] to float values in a 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
+///    the return value. \n
+///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
+///    the return value. \n
+///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
+///    return value. \n
+///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
+///    return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+///    values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_movehdup_ps(__m256 __a)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
+}
+
+/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
+///    vector of [8 x float] to float values in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
+///    the return value. \n
+///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
+///    the return value. \n
+///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
+///    return value. \n
+///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
+///    return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+///    values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_moveldup_ps(__m256 __a)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+/// \brief Moves and duplicates double-precision floating point values from a
+///    256-bit vector of [4 x double] to double-precision values in a 256-bit
+///    vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double]. \n
+///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
+///    return value. \n
+///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
+///    the return value.
+/// \returns A 256-bit vector of [4 x double] containing the moved and
+///    duplicated values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_movedup_pd(__m256d __a)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
+}
+
+/* Unpack and Interleave */
+/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
+///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [127:64] are written to bits [63:0] of the return value. \n
+///    Bits [255:192] are written to bits [191:128] of the return value. \n
+/// \param __b
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [127:64] are written to bits [127:64] of the return value. \n
+///    Bits [255:192] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpackhi_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
+}
+
+/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
+///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [63:0] are written to bits [63:0] of the return value. \n
+///    Bits [191:128] are written to bits [191:128] of the return value.
+/// \param __b
+///    A 256-bit floating-point vector of [4 x double]. \n
+///    Bits [63:0] are written to bits [127:64] of the return value. \n
+///    Bits [191:128] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpacklo_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
+}
+
+/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
+///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [95:64] are written to bits [31:0] of the return value. \n
+///    Bits [127:96] are written to bits [95:64] of the return value. \n
+///    Bits [223:192] are written to bits [159:128] of the return value. \n
+///    Bits [255:224] are written to bits [223:192] of the return value.
+/// \param __b
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [95:64] are written to bits [63:32] of the return value. \n
+///    Bits [127:96] are written to bits [127:96] of the return value. \n
+///    Bits [223:192] are written to bits [191:160] of the return value. \n
+///    Bits [255:224] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpackhi_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
+}
+
+/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
+///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [31:0] are written to bits [31:0] of the return value. \n
+///    Bits [63:32] are written to bits [95:64] of the return value. \n
+///    Bits [159:128] are written to bits [159:128] of the return value. \n
+///    Bits [191:160] are written to bits [223:192] of the return value.
+/// \param __b
+///    A 256-bit vector of [8 x float]. \n
+///    Bits [31:0] are written to bits [63:32] of the return value. \n
+///    Bits [63:32] are written to bits [127:96] of the return value. \n
+///    Bits [159:128] are written to bits [191:160] of the return value. \n
+///    Bits [191:160] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpacklo_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
+}
+
+/* Bit Test */
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+///    element-by-element comparison of the double-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns the ZF flag in the EFLAGS register.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testz_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+///    element-by-element comparison of the double-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns the CF flag in the EFLAGS register.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
+///    element-by-element comparison of the double-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testnzc_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testz_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm_testnzc_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+///    element-by-element comparison of the double-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+///    element-by-element comparison of the double-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
+///    element-by-element comparison of the double-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of double-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \param __b
+///    A 256-bit vector of [4 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_pd(__m256d __a, __m256d __b)
+{
+  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+///    element-by-element comparison of the single-precision element in the
+///    first source vector and the corresponding element in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
+///    element-by-element comparison of the single-precision elements in the
+///    first source vector and the corresponding elements in the second source
+///    vector. The EFLAGS register is updated as follows: \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+///    ZF flag is set to 1. \n
+///    If there is at least one pair of single-precision elements where the
+///    sign-bit of the first element is 0 and the sign-bit of the second element
+///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \param __b
+///    A 256-bit vector of [8 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_ps(__m256 __a, __m256 __b)
+{
+  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+///    of the two source vectors and update the EFLAGS register as follows: \n
+///    If there is at least one pair of bits where both bits are 1, the ZF flag
+///    is set to 0. Otherwise the ZF flag is set to 1. \n
+///    If there is at least one pair of bits where the bit from the first source
+///    vector is 0 and the bit from the second source vector is 1, the CF flag
+///    is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __b
+///    A 256-bit integer vector.
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
+}
+
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+///    of the two source vectors and update the EFLAGS register as follows: \n
+///    If there is at least one pair of bits where both bits are 1, the ZF flag
+///    is set to 0. Otherwise the ZF flag is set to 1. \n
+///    If there is at least one pair of bits where the bit from the first source
+///    vector is 0 and the bit from the second source vector is 1, the CF flag
+///    is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __b
+///    A 256-bit integer vector.
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
+}
+
+/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
+///    of the two source vectors and update the EFLAGS register as follows: \n
+///    If there is at least one pair of bits where both bits are 1, the ZF flag
+///    is set to 0. Otherwise the ZF flag is set to 1. \n
+///    If there is at least one pair of bits where the bit from the first source
+///    vector is 0 and the bit from the second source vector is 1, the CF flag
+///    is set to 0. Otherwise the CF flag is set to 1. \n
+///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+///    otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __b
+///    A 256-bit integer vector.
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_si256(__m256i __a, __m256i __b)
+{
+  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
+}
+
+/* Vector extract sign mask */
+/// \brief Extracts the sign bits of double-precision floating point elements
+///    in a 256-bit vector of [4 x double] and writes them to the lower order
+///    bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the double-precision
+///    floating point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [3:0].
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_pd(__m256d __a)
+{
+  return __builtin_ia32_movmskpd256((__v4df)__a);
+}
+
+/// \brief Extracts the sign bits of double-precision floating point elements
+///    in a 256-bit vector of [8 x float] and writes them to the lower order
+///    bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the double-precision floating
+///    point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [7:0].
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_ps(__m256 __a)
+{
+  return __builtin_ia32_movmskps256((__v8sf)__a);
+}
+
+/* Vector __zero */
+/// \brief Zeroes the contents of all XMM or YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_zeroall(void)
+{
+  __builtin_ia32_vzeroall();
+}
+
+/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_zeroupper(void)
+{
+  __builtin_ia32_vzeroupper();
+}
+
+/* Vector load with broadcast */
+/// \brief Loads a scalar single-precision floating point value from the
+///    specified address pointed to by \a __a and broadcasts it to the elements
+///    of a [4 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+///    The single-precision floating point value to be broadcast.
+/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
+///    equal to the broadcast value.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m128)(__v4sf){ __f, __f, __f, __f };
+}
+
+/// \brief Loads a scalar double-precision floating point value from the
+///    specified address pointed to by \a __a and broadcasts it to the elements
+///    of a [4 x double] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
+///
+/// \param __a
+///    The double-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
+///    equal to the broadcast value.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_sd(double const *__a)
+{
+  double __d = *__a;
+  return (__m256d)(__v4df){ __d, __d, __d, __d };
+}
+
+/// \brief Loads a scalar single-precision floating point value from the
+///    specified address pointed to by \a __a and broadcasts it to the elements
+///    of a [8 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+///    The single-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
+///    equal to the broadcast value.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ss(float const *__a)
+{
+  float __f = *__a;
+  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
+/// \brief Loads the data from a 128-bit vector of [2 x double] from the
+///    specified address pointed to by \a __a and broadcasts it to 128-bit
+///    elements in a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+///    The 128-bit vector of [2 x double] to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
+///    equal to the broadcast value.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_pd(__m128d const *__a)
+{
+  return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a);
+}
+
+/// \brief Loads the data from a 128-bit vector of [4 x float] from the
+///    specified address pointed to by \a __a and broadcasts it to 128-bit
+///    elements in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+///    The 128-bit vector of [4 x float] to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
+///    equal to the broadcast value.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ps(__m128 const *__a)
+{
+  return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a);
+}
+
+/* SIMD load ops */
+/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
+///    memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location containing
+///    double-precision floating point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_load_pd(double const *__p)
+{
+  return *(__m256d *)__p;
+}
+
+/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
+///    memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location containing float values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_load_ps(float const *__p)
+{
+  return *(__m256 *)__p;
+}
+
+/// \brief Loads 4 double-precision floating point values from an unaligned
+///    memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing double-precision floating
+///    point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu_pd(double const *__p)
+{
+  struct __loadu_pd {
+    __m256d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__p)->__v;
+}
+
+/// \brief Loads 8 single-precision floating point values from an unaligned
+///    memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing single-precision floating
+///    point values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu_ps(float const *__p)
+{
+  struct __loadu_ps {
+    __m256 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
+///    location pointed to by \a __p into elements of a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
+///    values.
+/// \returns A 256-bit integer vector containing the moved values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_load_si256(__m256i const *__p)
+{
+  return *__p;
+}
+
+/// \brief Loads 256 bits of integer data from an unaligned memory location
+///    pointed to by \a __p into a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu_si256(__m256i const *__p)
+{
+  struct __loadu_si256 {
+    __m256i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si256*)__p)->__v;
+}
+
+/// \brief Loads 256 bits of integer data from an unaligned memory location
+///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
+///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
+///    line boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_lddqu_si256(__m256i const *__p)
+{
+  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
+}
+
+/* SIMD store ops */
+/// \brief Stores double-precision floating point values from a 256-bit vector
+///    of [4 x double] to a 32-byte aligned memory location pointed to by
+///    \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will receive the
+///    double-precision floaing point values.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_pd(double *__p, __m256d __a)
+{
+  *(__m256d *)__p = __a;
+}
+
+/// \brief Stores single-precision floating point values from a 256-bit vector
+///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will receive the
+///    float values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_ps(float *__p, __m256 __a)
+{
+  *(__m256 *)__p = __a;
+}
+
+/// \brief Stores double-precision floating point values from a 256-bit vector
+///    of [4 x double] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the double-precision
+///    floating point values.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_pd(double *__p, __m256d __a)
+{
+  struct __storeu_pd {
+    __m256d __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_pd*)__p)->__v = __a;
+}
+
+/// \brief Stores single-precision floating point values from a 256-bit vector
+///    of [8 x float] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_ps(float *__p, __m256 __a)
+{
+  struct __storeu_ps {
+    __m256 __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_ps*)__p)->__v = __a;
+}
+
+/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
+///    aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will receive the
+///    integer values.
+/// \param __a
+///    A 256-bit integer vector containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_si256(__m256i *__p, __m256i __a)
+{
+  *__p = __a;
+}
+
+/// \brief Stores integer values from a 256-bit integer vector to an unaligned
+///    memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer values.
+/// \param __a
+///    A 256-bit integer vector containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_si256(__m256i *__p, __m256i __a)
+{
+  struct __storeu_si256 {
+    __m256i __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si256*)__p)->__v = __a;
+}
+
+/* Conditional load ops */
+/// \brief Conditionally loads double-precision floating point elements from a
+///    memory location pointed to by \a __p into a 128-bit vector of
+///    [2 x double], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the double-precision
+///    floating point values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each data element represents the mask bits. If a mask bit is zero, the
+///    corresponding value in the memory location is not loaded and the
+///    corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm_maskload_pd(double const *__p, __m128i __m)
+{
+  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
+}
+
+/// \brief Conditionally loads double-precision floating point elements from a
+///    memory location pointed to by \a __p into a 256-bit vector of
+///    [4 x double], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the double-precision
+///    floating point values.
+/// \param __m
+///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
+///    significant bit of each quadword element represents the mask bits. If a
+///    mask bit is zero, the corresponding value in the memory location is not
+///    loaded and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [4 x double] containing the loaded values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_maskload_pd(double const *__p, __m256i __m)
+{
+  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
+                                               (__v4di)__m);
+}
+
+/// \brief Conditionally loads single-precision floating point elements from a
+///    memory location pointed to by \a __p into a 128-bit vector of
+///    [4 x float], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the single-precision
+///    floating point values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each data element represents the mask bits. If a mask bit is zero, the
+///    corresponding value in the memory location is not loaded and the
+///    corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_maskload_ps(float const *__p, __m128i __m)
+{
+  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
+}
+
+/// \brief Conditionally loads single-precision floating point elements from a
+///    memory location pointed to by \a __p into a 256-bit vector of
+///    [8 x float], depending on the mask bits associated with each data
+///    element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the single-precision
+///    floating point values.
+/// \param __m
+///    A 256-bit integer vector of [8 x dword] containing the mask. The most
+///    significant bit of each dword element represents the mask bits. If a mask
+///    bit is zero, the corresponding value in the memory location is not loaded
+///    and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [8 x float] containing the loaded values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_maskload_ps(float const *__p, __m256i __m)
+{
+  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
+}
+
+/* Conditional store ops */
+/// \brief Moves single-precision floating point values from a 256-bit vector
+///    of [8 x float] to a memory location pointed to by \a __p, according to
+///    the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 256-bit integer vector of [8 x dword] containing the mask. The most
+///    significant bit of each dword element in the mask vector represents the
+///    mask bits. If a mask bit is zero, the corresponding value from vector
+///    \a __a is not stored and the corresponding field in the memory location
+///    pointed to by \a __p is not changed.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
+{
+  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
+}
+
+/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
+///    to a memory location pointed to by \a __p, according to the specified
+///    mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each field in the mask vector represents the mask bits. If a mask bit is
+///    zero, the corresponding value from vector \a __a is not stored and the
+///    corresponding field in the memory location pointed to by \a __p is not
+///    changed.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
+{
+  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
+}
+
+/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
+///    to a memory location pointed to by \a __p, according to the specified
+///    mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
+///    significant bit of each quadword element in the mask vector represents
+///    the mask bits. If a mask bit is zero, the corresponding value from vector
+///    __a is not stored and the corresponding field in the memory location
+///    pointed to by \a __p is not changed.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
+{
+  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
+}
+
+/// \brief Moves single-precision floating point values from a 128-bit vector
+///    of [4 x float] to a memory location pointed to by \a __p, according to
+///    the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each field in the mask vector represents the mask bits. If a mask bit is
+///    zero, the corresponding value from vector __a is not stored and the
+///    corresponding field in the memory location pointed to by \a __p is not
+///    changed.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
+{
+  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
+}
+
+/* Cacheability support ops */
+/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
+///    aligned memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
+///
+/// \param __a
+///    A pointer to a 32-byte aligned memory location that will receive the
+///    integer values.
+/// \param __b
+///    A 256-bit integer vector containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_si256(__m256i *__a, __m256i __b)
+{
+  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
+}
+
+/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
+///    to a 32-byte aligned memory location. To minimize caching, the data is
+///    flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
+///
+/// \param __a
+///    A pointer to a 32-byte aligned memory location that will receive the
+///    integer values.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_pd(double *__a, __m256d __b)
+{
+  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
+}
+
+/// \brief Moves single-precision floating point values from a 256-bit vector
+///    of [8 x float] to a 32-byte aligned memory location. To minimize
+///    caching, the data is flagged as non-temporal (unlikely to be used again
+///    soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-byte aligned memory location that will receive the
+///    single-precision floating point values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_ps(float *__p, __m256 __a)
+{
+  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
+}
+
+/* Create vectors */
+/// \brief Create a 256-bit vector of [4 x double] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [4 x double] containing undefined values.
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_undefined_pd(void)
+{
+  return (__m256d)__builtin_ia32_undef256();
+}
+
+/// \brief Create a 256-bit vector of [8 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [8 x float] containing undefined values.
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_undefined_ps(void)
+{
+  return (__m256)__builtin_ia32_undef256();
+}
+
+/// \brief Create a 256-bit integer vector with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit integer vector containing undefined values.
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_undefined_si256(void)
+{
+  return (__m256i)__builtin_ia32_undef256();
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [4 x double]
+///    initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A double-precision floating-point value used to initialize bits [255:192]
+///    of the result.
+/// \param __b
+///    A double-precision floating-point value used to initialize bits [191:128]
+///    of the result.
+/// \param __c
+///    A double-precision floating-point value used to initialize bits [127:64]
+///    of the result.
+/// \param __d
+///    A double-precision floating-point value used to initialize bits [63:0]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __d, __c, __b, __a };
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
+///    with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __a
+///    A single-precision floating-point value used to initialize bits [255:224]
+///    of the result.
+/// \param __b
+///    A single-precision floating-point value used to initialize bits [223:192]
+///    of the result.
+/// \param __c
+///    A single-precision floating-point value used to initialize bits [191:160]
+///    of the result.
+/// \param __d
+///    A single-precision floating-point value used to initialize bits [159:128]
+///    of the result.
+/// \param __e
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \param __f
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __g
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __h
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_ps(float __a, float __b, float __c, float __d,
+              float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+///    32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i4
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i5
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i6
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i7
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
+                 int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+///    16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __w15
+///    A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \param __w14
+///    A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w13
+///    A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w12
+///    A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w11
+///    A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w10
+///    A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w09
+///    A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w08
+///    A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w07
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w06
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w05
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w04
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w03
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w02
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w01
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w00
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
+                 short __w11, short __w10, short __w09, short __w08,
+                 short __w07, short __w06, short __w05, short __w04,
+                 short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
+    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+///    8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __b31
+///    An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \param __b30
+///    An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b29
+///    An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b28
+///    An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b27
+///    An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b26
+///    An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b25
+///    An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b24
+///    An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b23
+///    An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b22
+///    An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b21
+///    An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b20
+///    An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b19
+///    An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b18
+///    An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b17
+///    An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b16
+///    An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b09
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b08
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b07
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b06
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b05
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b04
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b03
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b02
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b01
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b00
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
+                char __b27, char __b26, char __b25, char __b24,
+                char __b23, char __b22, char __b21, char __b20,
+                char __b19, char __b18, char __b17, char __b16,
+                char __b15, char __b14, char __b13, char __b12,
+                char __b11, char __b10, char __b09, char __b08,
+                char __b07, char __b06, char __b05, char __b04,
+                char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
+  };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized with the specified
+///    64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \param __b
+///    A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __c
+///    A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __d
+///    A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __d, __c, __b, __a };
+}
+
+/* Create vectors with elements in reverse order */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double],
+///    initialized in reverse order with the specified double-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A double-precision floating-point value used to initialize bits [63:0]
+///    of the result.
+/// \param __b
+///    A double-precision floating-point value used to initialize bits [127:64]
+///    of the result.
+/// \param __c
+///    A double-precision floating-point value used to initialize bits [191:128]
+///    of the result.
+/// \param __d
+///    A double-precision floating-point value used to initialize bits [255:192]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_pd(double __a, double __b, double __c, double __d)
+{
+  return (__m256d){ __a, __b, __c, __d };
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float],
+///    initialized in reverse order with the specified single-precision
+///    float-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __a
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \param __b
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __c
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __d
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \param __e
+///    A single-precision floating-point value used to initialize bits [159:128]
+///    of the result.
+/// \param __f
+///    A single-precision floating-point value used to initialize bits [191:160]
+///    of the result.
+/// \param __g
+///    A single-precision floating-point value used to initialize bits [223:192]
+///    of the result.
+/// \param __h
+///    A single-precision floating-point value used to initialize bits [255:224]
+///    of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_ps(float __a, float __b, float __c, float __d,
+               float __e, float __f, float __g, float __h)
+{
+  return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
+}
+
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i4
+///    A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i5
+///    A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i6
+///    A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i7
+///    A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
+                  int __i4, int __i5, int __i6, int __i7)
+{
+  return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
+}
+
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __w15
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w14
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w13
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w12
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w11
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w10
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w09
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w08
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w07
+///    A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w06
+///    A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w05
+///    A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w04
+///    A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w03
+///    A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w02
+///    A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w01
+///    A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w00
+///    A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
+       short __w11, short __w10, short __w09, short __w08,
+       short __w07, short __w06, short __w05, short __w04,
+       short __w03, short __w02, short __w01, short __w00)
+{
+  return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09,
+    __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
+}
+
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///   instruction.
+///
+/// \param __b31
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b30
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b29
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b28
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b27
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b26
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b25
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b24
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b23
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b22
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b21
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b20
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b19
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b18
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b17
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b16
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b09
+///    An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b08
+///    An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b07
+///    An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b06
+///    An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b05
+///    An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b04
+///    An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b03
+///    An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b02
+///    An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b01
+///    An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b00
+///    An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
+                 char __b27, char __b26, char __b25, char __b24,
+                 char __b23, char __b22, char __b21, char __b20,
+                 char __b19, char __b18, char __b17, char __b16,
+                 char __b15, char __b14, char __b13, char __b12,
+                 char __b11, char __b10, char __b09, char __b08,
+                 char __b07, char __b06, char __b05, char __b04,
+                 char __b03, char __b02, char __b01, char __b00)
+{
+  return (__m256i)(__v32qi){
+    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
+    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
+    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
+    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
+}
+
+/// \brief Constructs a 256-bit integer vector, initialized in reverse order
+///    with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \param __b
+///    A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __c
+///    A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __d
+///    A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+  return (__m256i)(__v4di){ __a, __b, __c, __d };
+}
+
+/* Create vectors with repeated elements */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
+///    of the four double-precision floating-point vector elements set to the
+///    specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set1_pd(double __w)
+{
+  return (__m256d){ __w, __w, __w, __w };
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
+///    of the eight single-precision floating-point vector elements set to the
+///    specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set1_ps(float __w)
+{
+  return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
+///    32-bit integral vector elements set to the specified 32-bit integral
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+///   instruction.
+///
+/// \param __i
+///    A 32-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [8 x i32].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi32(int __i)
+{
+  return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
+}
+
+/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
+///    16-bit integral vector elements set to the specified 16-bit integral
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __w
+///    A 16-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [16 x i16].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi16(short __w)
+{
+  return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w,
+    __w, __w, __w, __w, __w, __w };
+}
+
+/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
+///    8-bit integral vector elements set to the specified 8-bit integral value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __b
+///    An 8-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [32 x i8].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi8(char __b)
+{
+  return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
+    __b, __b, __b, __b, __b, __b, __b };
+}
+
+/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
+///    64-bit integral vector elements set to the specified 64-bit integral
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __q
+///    A 64-bit integral value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 256-bit integer vector of [4 x i64].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi64x(long long __q)
+{
+  return (__m256i)(__v4di){ __q, __q, __q, __q };
+}
+
+/* Create __zeroed vectors */
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
+///    vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setzero_pd(void)
+{
+  return (__m256d){ 0, 0, 0, 0 };
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
+///    vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setzero_ps(void)
+{
+  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/// \brief Constructs a 256-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit integer vector initialized to zero.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setzero_si256(void)
+{
+  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
+}
+
+/* Cast between vector types */
+/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+///    floating-point vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+///    bitwise pattern as the parameter.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castpd_ps(__m256d __a)
+{
+  return (__m256)__a;
+}
+
+/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castpd_si256(__m256d __a)
+{
+  return (__m256i)__a;
+}
+
+/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+///    floating-point vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+///    bitwise pattern as the parameter.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castps_pd(__m256 __a)
+{
+  return (__m256d)__a;
+}
+
+/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castps_si256(__m256 __a)
+{
+  return (__m256i)__a;
+}
+
+/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
+///    of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+///    bitwise pattern as the parameter.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castsi256_ps(__m256i __a)
+{
+  return (__m256)__a;
+}
+
+/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
+///    of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+///    bitwise pattern as the parameter.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castsi256_pd(__m256i __a)
+{
+  return (__m256d)__a;
+}
+
+/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
+///    [4 x double] as a 128-bit floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the
+///    lower 128 bits of the parameter.
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm256_castpd256_pd128(__m256d __a)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
+}
+
+/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
+///    [8 x float] as a 128-bit floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the
+///    lower 128 bits of the parameter.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_castps256_ps128(__m256 __a)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
+}
+
+/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 128-bit integer vector containing the lower 128 bits of the
+///    parameter.
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_castsi256_si128(__m256i __a)
+{
+  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
+///    128-bit floating-point vector of [2 x double]. The lower 128 bits
+///    contain the value of the source vector. The contents of the upper 128
+///    bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+///    contain the value of the parameter. The contents of the upper 128 bits
+///    are undefined.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
+///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+///    the value of the source vector. The contents of the upper 128 bits are
+///    undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+///    contain the value of the parameter. The contents of the upper 128 bits
+///    are undefined.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
+///    The lower 128 bits contain the value of the source vector. The contents
+///    of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+///    the parameter. The contents of the upper 128 bits are undefined.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
+}
+
+/*
+   Vector insert.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
+///    a 256-bit vector of [8 x float] given in the first parameter, and then
+///    replacing either the upper or the lower 128 bits with the contents of a
+///    128-bit vector of [4 x float] in the second parameter. The immediate
+///    integer parameter determines between the upper or the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float]. This vector is copied to the result
+///    first, and then either the upper or the lower 128 bits of the result will
+///    be replaced by the contents of \a V2.
+/// \param V2
+///    A 128-bit vector of [4 x float]. The contents of this parameter are
+///    written to either the upper or the lower 128 bits of the result depending
+///    on the value of parameter \a M.
+/// \param M
+///    An immediate integer. The least significant bit determines how the values
+///    from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
+///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
+///    result. \n
+///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
+///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+///    result.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
+  (__m256)__builtin_shufflevector( \
+    (__v8sf)(__m256)(V1), \
+    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
+    (((M) & 1) ?  0 :  8), \
+    (((M) & 1) ?  1 :  9), \
+    (((M) & 1) ?  2 : 10), \
+    (((M) & 1) ?  3 : 11), \
+    (((M) & 1) ?  8 :  4), \
+    (((M) & 1) ?  9 :  5), \
+    (((M) & 1) ? 10 :  6), \
+    (((M) & 1) ? 11 :  7) );})
+
+/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
+///    a 256-bit vector of [4 x double] given in the first parameter, and then
+///    replacing either the upper or the lower 128 bits with the contents of a
+///    128-bit vector of [2 x double] in the second parameter. The immediate
+///    integer parameter determines between the upper or the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double]. This vector is copied to the result
+///    first, and then either the upper or the lower 128 bits of the result will
+///    be replaced by the contents of \a V2.
+/// \param V2
+///    A 128-bit vector of [2 x double]. The contents of this parameter are
+///    written to either the upper or the lower 128 bits of the result depending
+///    on the value of parameter \a M.
+/// \param M
+///    An immediate integer. The least significant bit determines how the values
+///    from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
+///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
+///    result. \n
+///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
+///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+///    result.
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
+  (__m256d)__builtin_shufflevector( \
+    (__v4df)(__m256d)(V1), \
+    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+/// \brief Constructs a new 256-bit integer vector by first duplicating a
+///    256-bit integer vector given in the first parameter, and then replacing
+///    either the upper or the lower 128 bits with the contents of a 128-bit
+///    integer vector in the second parameter. The immediate integer parameter
+///    determines between the upper or the lower 128 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+///    A 256-bit integer vector. This vector is copied to the result first, and
+///    then either the upper or the lower 128 bits of the result will be
+///    replaced by the contents of \a V2.
+/// \param V2
+///    A 128-bit integer vector. The contents of this parameter are written to
+///    either the upper or the lower 128 bits of the result depending on the
+///     value of parameter \a M.
+/// \param M
+///    An immediate integer. The least significant bit determines how the values
+///    from the two parameters are interleaved: \n
+///    If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result,
+///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
+///    result. \n
+///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
+///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+///    result.
+/// \returns A 256-bit integer vector containing the interleaved values.
+#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
+  (__m256i)__builtin_shufflevector( \
+    (__v4di)(__m256i)(V1), \
+    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
+    (((M) & 1) ? 0 : 4), \
+    (((M) & 1) ? 1 : 5), \
+    (((M) & 1) ? 4 : 2), \
+    (((M) & 1) ? 5 : 3) );})
+
+/*
+   Vector extract.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+*/
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
+///    of [8 x float], as determined by the immediate integer parameter, and
+///    returns the extracted bits as a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float].
+/// \param M
+///    An immediate integer. The least significant bit determines which bits are
+///    extracted from the first parameter: \n
+///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+///    result. \n
+///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
+#define _mm256_extractf128_ps(V, M) __extension__ ({ \
+  (__m128)__builtin_shufflevector( \
+    (__v8sf)(__m256)(V), \
+    (__v8sf)(_mm256_undefined_ps()), \
+    (((M) & 1) ? 4 : 0), \
+    (((M) & 1) ? 5 : 1), \
+    (((M) & 1) ? 6 : 2), \
+    (((M) & 1) ? 7 : 3) );})
+
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
+///    of [4 x double], as determined by the immediate integer parameter, and
+///    returns the extracted bits as a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double].
+/// \param M
+///    An immediate integer. The least significant bit determines which bits are
+///    extracted from the first parameter: \n
+///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+///    result. \n
+///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
+#define _mm256_extractf128_pd(V, M) __extension__ ({ \
+  (__m128d)__builtin_shufflevector( \
+    (__v4df)(__m256d)(V), \
+    (__v4df)(_mm256_undefined_pd()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
+///    integer vector, as determined by the immediate integer parameter, and
+///    returns the extracted bits as a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+///    A 256-bit integer vector.
+/// \param M
+///    An immediate integer. The least significant bit determines which bits are
+///    extracted from the first parameter:  \n
+///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+///    result. \n
+///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit integer vector containing the extracted bits.
+#define _mm256_extractf128_si256(V, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector( \
+    (__v4di)(__m256i)(V), \
+    (__v4di)(_mm256_undefined_si256()), \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+/* SIMD load ops (unaligned) */
+/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
+///    unaligned memory locations and constructs a 256-bit floating-point vector
+///    of [8 x float] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+///   <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location containing 4 consecutive
+///    single-precision floating-point values. These values are to be copied to
+///    bits[255:128] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location containing 4 consecutive
+///    single-precision floating-point values. These values are to be copied to
+///    bits[127:0] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+///    concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
+{
+  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
+  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
+}
+
+/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
+///    unaligned memory locations and constructs a 256-bit floating-point vector
+///    of [4 x double] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+///   <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location containing two consecutive
+///    double-precision floating-point values. These values are to be copied to
+///    bits[255:128] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location containing two consecutive
+///    double-precision floating-point values. These values are to be copied to
+///    bits[127:0] of the result. The address of the memory location does not
+///    have to be aligned.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+///    concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
+{
+  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
+  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
+}
+
+/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
+///    constructs a 256-bit integer vector by concatenating the two 128-bit
+///    vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+///   <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location containing a 128-bit integer
+///    vector. This vector is to be copied to bits[255:128] of the result. The
+///    address of the memory location does not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location containing a 128-bit integer
+///    vector. This vector is to be copied to bits[127:0] of the result. The
+///    address of the memory location does not have to be aligned.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
+{
+  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
+  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
+}
+
+/* SIMD store ops (unaligned) */
+/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
+///    vector of [8 x float] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+///   store instructions.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __a
+///    A 256-bit floating-point vector of [8 x float].
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
+{
+  __m128 __v128;
+
+  __v128 = _mm256_castps256_ps128(__a);
+  _mm_storeu_ps(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_ps(__a, 1);
+  _mm_storeu_ps(__addr_hi, __v128);
+}
+
+/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
+///    vector of [4 x double] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+///   store instructions.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __a
+///    A 256-bit floating-point vector of [4 x double].
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
+{
+  __m128d __v128;
+
+  __v128 = _mm256_castpd256_pd128(__a);
+  _mm_storeu_pd(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_pd(__a, 1);
+  _mm_storeu_pd(__addr_hi, __v128);
+}
+
+/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
+///    two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+///   store instructions.
+///
+/// \param __addr_hi
+///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __addr_lo
+///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+///    copied to this memory location. The address of this memory location does
+///    not have to be aligned.
+/// \param __a
+///    A 256-bit integer vector.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
+{
+  __m128i __v128;
+
+  __v128 = _mm256_castsi256_si128(__a);
+  _mm_storeu_si128(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_si256(__a, 1);
+  _mm_storeu_si128(__addr_hi, __v128);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
+///    concatenating two 128-bit floating-point vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
+///    128 bits of the result.
+/// \param __lo
+///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+///    concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_m128 (__m128 __hi, __m128 __lo)
+{
+  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
+///    concatenating two 128-bit floating-point vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
+///    128 bits of the result.
+/// \param __lo
+///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+///    concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_m128d (__m128d __hi, __m128d __lo)
+{
+  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
+///    integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+///    A 128-bit integer vector to be copied to the upper 128 bits of the
+///    result.
+/// \param __lo
+///    A 128-bit integer vector to be copied to the lower 128 bits of the
+///    result.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_m128i (__m128i __hi, __m128i __lo)
+{
+  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
+///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
+///    similar to _mm256_set_m128, but the order of the input parameters is
+///    swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
+///    128 bits of the result.
+/// \param __hi
+///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+///    concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_m128 (__m128 __lo, __m128 __hi)
+{
+  return _mm256_set_m128(__hi, __lo);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
+///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
+///    similar to _mm256_set_m128d, but the order of the input parameters is
+///    swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
+///    128 bits of the result.
+/// \param __hi
+///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
+///    128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+///    concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_m128d (__m128d __lo, __m128d __hi)
+{
+  return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
+///    integer vectors. This is similar to _mm256_set_m128i, but the order of
+///    the input parameters is swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+///    A 128-bit integer vector to be copied to the lower 128 bits of the
+///    result.
+/// \param __hi
+///    A 128-bit integer vector to be copied to the upper 128 bits of the
+///    result.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_m128i (__m128i __lo, __m128i __hi)
+{
+  return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVXINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/bmi2intrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/bmi2intrin.h
new file mode 100644
index 000000000..fdae82cf2
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/bmi2intrin.h
@@ -0,0 +1,95 @@
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si(__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __BMI2INTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/bmiintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/bmiintrin.h
new file mode 100644
index 000000000..488eb2dbd
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/bmiintrin.h
@@ -0,0 +1,548 @@
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _tzcnt_u16(unsigned short a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param a
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _andn_u32(unsigned int a, unsigned int b);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param a
+///    An unsigned integer containing one of the operands.
+/// \param b
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+
+/* _bextr_u32 != __bextr_u32 */
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsi_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param a
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
+#define _blsi_u32(a)      (__blsi_u32((a)))
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsmsk_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param a
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+
+/// \brief Clears the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsr_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param a
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
+#define _blsr_u32(a)      (__blsr_u32((a)))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _tzcnt_u32(unsigned int a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param a
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
+
+/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
+   instruction behaves as BSF on non-BMI targets, there is code that expects
+   to use it as a potentially faster version of BSF. */
+#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned short __RELAXED_FN_ATTRS
+__tzcnt_u16(unsigned short __X)
+{
+  return __X ? __builtin_ctzs(__X) : 16;
+}
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+///    An unsigned integer containing one of the operands.
+/// \param __Y
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
+///    specify the index of the least significant bit. Bits [15:8] specify the
+///    number of bits to be extracted.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+/// \brief Clears the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned int __RELAXED_FN_ATTRS
+__tzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An 32-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ int __RELAXED_FN_ATTRS
+_mm_tzcnt_32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+#ifdef __x86_64__
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param b
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+
+/* _bextr_u64 != __bextr_u64 */
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsi_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
+#define _blsi_u64(a)      (__blsi_u64((a)))
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsmsk_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns A unsigned 64-bit integer containing the newly created mask.
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+
+/// \brief Clears the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsr_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
+#define _blsr_u64(a)      (__blsr_u64((a)))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _tzcnt_u64(unsigned long long a);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param a
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param __Y
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
+///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
+///    the number of bits to be extracted.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///     in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns A unsigned 64-bit integer containing the newly created mask.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+/// \brief Clears the least siginificant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned long long __RELAXED_FN_ATTRS
+__tzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An 64-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ long long __RELAXED_FN_ATTRS
+_mm_tzcnt_64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+#undef __RELAXED_FN_ATTRS
+
+#endif /* __BMIINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/clflushoptintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/clflushoptintrin.h
new file mode 100644
index 000000000..60e0ead76
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/clflushoptintrin.h
@@ -0,0 +1,41 @@
+/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLFLUSHOPTINTRIN_H
+#define __CLFLUSHOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clflushopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clflushopt(char * __m) {
+  __builtin_ia32_clflushopt(__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cpuid.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cpuid.h
new file mode 100644
index 000000000..400dcfacd
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cpuid.h
@@ -0,0 +1,215 @@
+/*===---- cpuid.h - X86 cpu model detection --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !(__x86_64__ || __i386__)
+#error this header is for x86 only
+#endif
+
+/* Responses identification request with %eax 0 */
+/* AMD:     "AuthenticAMD" */
+#define signature_AMD_ebx 0x68747541
+#define signature_AMD_edx 0x69746e65
+#define signature_AMD_ecx 0x444d4163
+/* CENTAUR: "CentaurHauls" */
+#define signature_CENTAUR_ebx 0x746e6543
+#define signature_CENTAUR_edx 0x48727561
+#define signature_CENTAUR_ecx 0x736c7561
+/* CYRIX:   "CyrixInstead" */
+#define signature_CYRIX_ebx 0x69727943
+#define signature_CYRIX_edx 0x736e4978
+#define signature_CYRIX_ecx 0x64616574
+/* INTEL:   "GenuineIntel" */
+#define signature_INTEL_ebx 0x756e6547
+#define signature_INTEL_edx 0x49656e69
+#define signature_INTEL_ecx 0x6c65746e
+/* TM1:     "TransmetaCPU" */
+#define signature_TM1_ebx 0x6e617254
+#define signature_TM1_edx 0x74656d73
+#define signature_TM1_ecx 0x55504361
+/* TM2:     "GenuineTMx86" */
+#define signature_TM2_ebx 0x756e6547
+#define signature_TM2_edx 0x54656e69
+#define signature_TM2_ecx 0x3638784d
+/* NSC:     "Geode by NSC" */
+#define signature_NSC_ebx 0x646f6547
+#define signature_NSC_edx 0x43534e20
+#define signature_NSC_ecx 0x79622065
+/* NEXGEN:  "NexGenDriven" */
+#define signature_NEXGEN_ebx 0x4778654e
+#define signature_NEXGEN_edx 0x72446e65
+#define signature_NEXGEN_ecx 0x6e657669
+/* RISE:    "RiseRiseRise" */
+#define signature_RISE_ebx 0x65736952
+#define signature_RISE_edx 0x65736952
+#define signature_RISE_ecx 0x65736952
+/* SIS:     "SiS SiS SiS " */
+#define signature_SIS_ebx 0x20536953
+#define signature_SIS_edx 0x20536953
+#define signature_SIS_ecx 0x20536953
+/* UMC:     "UMC UMC UMC " */
+#define signature_UMC_ebx 0x20434d55
+#define signature_UMC_edx 0x20434d55
+#define signature_UMC_ecx 0x20434d55
+/* VIA:     "VIA VIA VIA " */
+#define signature_VIA_ebx 0x20414956
+#define signature_VIA_edx 0x20414956
+#define signature_VIA_ecx 0x20414956
+/* VORTEX:  "Vortex86 SoC" */
+#define signature_VORTEX_ebx 0x74726f56
+#define signature_VORTEX_edx 0x36387865
+#define signature_VORTEX_ecx 0x436f5320
+
+/* Features in %ecx for level 1 */
+#define bit_SSE3        0x00000001
+#define bit_PCLMULQDQ   0x00000002
+#define bit_PCLMUL      bit_PCLMULQDQ   /* for gcc compat */
+#define bit_DTES64      0x00000004
+#define bit_MONITOR     0x00000008
+#define bit_DSCPL       0x00000010
+#define bit_VMX         0x00000020
+#define bit_SMX         0x00000040
+#define bit_EIST        0x00000080
+#define bit_TM2         0x00000100
+#define bit_SSSE3       0x00000200
+#define bit_CNXTID      0x00000400
+#define bit_FMA         0x00001000
+#define bit_CMPXCHG16B  0x00002000
+#define bit_xTPR        0x00004000
+#define bit_PDCM        0x00008000
+#define bit_PCID        0x00020000
+#define bit_DCA         0x00040000
+#define bit_SSE41       0x00080000
+#define bit_SSE4_1      bit_SSE41       /* for gcc compat */
+#define bit_SSE42       0x00100000
+#define bit_SSE4_2      bit_SSE42       /* for gcc compat */
+#define bit_x2APIC      0x00200000
+#define bit_MOVBE       0x00400000
+#define bit_POPCNT      0x00800000
+#define bit_TSCDeadline 0x01000000
+#define bit_AESNI       0x02000000
+#define bit_AES         bit_AESNI       /* for gcc compat */
+#define bit_XSAVE       0x04000000
+#define bit_OSXSAVE     0x08000000
+#define bit_AVX         0x10000000
+#define bit_F16C        0x20000000
+#define bit_RDRND       0x40000000
+
+/* Features in %edx for level 1 */
+#define bit_FPU         0x00000001
+#define bit_VME         0x00000002
+#define bit_DE          0x00000004
+#define bit_PSE         0x00000008
+#define bit_TSC         0x00000010
+#define bit_MSR         0x00000020
+#define bit_PAE         0x00000040
+#define bit_MCE         0x00000080
+#define bit_CX8         0x00000100
+#define bit_CMPXCHG8B   bit_CX8         /* for gcc compat */
+#define bit_APIC        0x00000200
+#define bit_SEP         0x00000800
+#define bit_MTRR        0x00001000
+#define bit_PGE         0x00002000
+#define bit_MCA         0x00004000
+#define bit_CMOV        0x00008000
+#define bit_PAT         0x00010000
+#define bit_PSE36       0x00020000
+#define bit_PSN         0x00040000
+#define bit_CLFSH       0x00080000
+#define bit_DS          0x00200000
+#define bit_ACPI        0x00400000
+#define bit_MMX         0x00800000
+#define bit_FXSR        0x01000000
+#define bit_FXSAVE      bit_FXSR        /* for gcc compat */
+#define bit_SSE         0x02000000
+#define bit_SSE2        0x04000000
+#define bit_SS          0x08000000
+#define bit_HTT         0x10000000
+#define bit_TM          0x20000000
+#define bit_PBE         0x80000000
+
+/* Features in %ebx for level 7 sub-leaf 0 */
+#define bit_FSGSBASE    0x00000001
+#define bit_SMEP        0x00000080
+#define bit_ENH_MOVSB   0x00000200
+
+#if __i386__
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__level), "2"(__count))
+#else
+/* x86-64 uses %rbx as the base register, so preserve it. */
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level))
+
+#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__level), "2"(__count))
+#endif
+
+static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx) {
+    __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+#if __i386__
+    int __cpuid_supported;
+
+    __asm("  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   %%eax,%%ecx\n"
+          "  xorl   $0x00200000,%%eax\n"
+          "  pushl  %%eax\n"
+          "  popfl\n"
+          "  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   $0,%0\n"
+          "  cmpl   %%eax,%%ecx\n"
+          "  je     1f\n"
+          "  movl   $1,%0\n"
+          "1:"
+        : "=r" (__cpuid_supported) : : "eax", "ecx");
+    if (!__cpuid_supported)
+        return 0;
+#endif
+
+    __cpuid(__level, __eax, __ebx, __ecx, __edx);
+    if (__sig)
+        *__sig = __ebx;
+    return __eax;
+}
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cuda_wrappers/algorithm b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cuda_wrappers/algorithm
new file mode 100644
index 000000000..95d9beb73
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cuda_wrappers/algorithm
@@ -0,0 +1,96 @@
+/*===---- complex - CUDA wrapper for <algorithm> ----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
+#define __CLANG_CUDA_WRAPPERS_ALGORITHM
+
+// This header defines __device__ overloads of std::min/max, but only if we're
+// <= C++11.  In C++14, these functions are constexpr, and so are implicitly
+// __host__ __device__.
+//
+// We don't support the initializer_list overloads because
+// initializer_list::begin() and end() are not __host__ __device__ functions.
+//
+// When compiling in C++14 mode, we could force std::min/max to have different
+// implementations for host and device, by declaring the device overloads
+// before the constexpr overloads appear.  We choose not to do this because
+
+//  a) why write our own implementation when we can use one from the standard
+//     library? and
+//  b) libstdc++ is evil and declares min/max inside a header that is included
+//     *before* we include <algorithm>.  So we'd have to unconditionally
+//     declare our __device__ overloads of min/max, but that would pollute
+//     things for people who choose not to include <algorithm>.
+
+#include_next <algorithm>
+
+#if __cplusplus <= 201103L
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__a, __b) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b) {
+  return __a < __b ? __b : __a;
+}
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__b, __a) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b) {
+  return __a < __b ? __b : __a;
+}
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#endif // __cplusplus <= 201103L
+#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cuda_wrappers/complex b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cuda_wrappers/complex
new file mode 100644
index 000000000..11d40a82a
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cuda_wrappers/complex
@@ -0,0 +1,82 @@
+/*===---- complex - CUDA wrapper for <complex> ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
+#define __CLANG_CUDA_WRAPPERS_COMPLEX
+
+// Wrapper around <complex> that forces its functions to be __host__
+// __device__.
+
+// First, include host-only headers we think are likely to be included by
+// <complex>, so that the pragma below only applies to <complex> itself.
+#if __cplusplus >= 201103L
+#include <type_traits>
+#endif
+#include <stdexcept>
+#include <cmath>
+#include <sstream>
+
+// Next, include our <algorithm> wrapper, to ensure that device overloads of
+// std::min/max are available.
+#include <algorithm>
+
+#pragma clang force_cuda_host_device begin
+
+// When compiling for device, ask libstdc++ to use its own implements of
+// complex functions, rather than calling builtins (which resolve to library
+// functions that don't exist when compiling CUDA device code).
+//
+// This is a little dicey, because it causes libstdc++ to define a different
+// set of overloads on host and device.
+//
+//   // Present only when compiling for host.
+//   __host__ __device__ void complex<float> sin(const complex<float>& x) {
+//     return __builtin_csinf(x);
+//   }
+//
+//   // Present when compiling for host and for device.
+//   template <typename T>
+//   void __host__ __device__ complex<T> sin(const complex<T>& x) {
+//     return complex<T>(sin(x.real()) * cosh(x.imag()),
+//                       cos(x.real()), sinh(x.imag()));
+//   }
+//
+// This is safe because when compiling for device, all function calls in
+// __host__ code to sin() will still resolve to *something*, even if they don't
+// resolve to the same function as they resolve to when compiling for host.  We
+// don't care that they don't resolve to the right function because we won't
+// codegen this host code when compiling for device.
+
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#define _GLIBCXX_USE_C99_COMPLEX 0
+#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
+
+#include_next <complex>
+
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
+
+#pragma clang force_cuda_host_device end
+
+#endif // include guard
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cuda_wrappers/new b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cuda_wrappers/new
new file mode 100644
index 000000000..b77131af0
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/cuda_wrappers/new
@@ -0,0 +1,47 @@
+/*===---- complex - CUDA wrapper for <new> ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_NEW
+#define __CLANG_CUDA_WRAPPERS_NEW
+
+#include_next <new>
+
+// Device overrides for placement new and delete.
+#pragma push_macro("CUDA_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define CUDA_NOEXCEPT noexcept
+#else
+#define CUDA_NOEXCEPT
+#endif
+
+__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
+__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
+#pragma pop_macro("CUDA_NOEXCEPT")
+
+#endif // include guard
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/emmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/emmintrin.h
new file mode 100644
index 000000000..1512f9f0b
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/emmintrin.h
@@ -0,0 +1,4791 @@
+/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __EMMINTRIN_H
+#define __EMMINTRIN_H
+
+#include <xmmintrin.h>
+
+typedef double __m128d __attribute__((__vector_size__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16)));
+
+/* Type defines.  */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+/* Unsigned types */
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
+typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v16qs __attribute__((__vector_size__(16)));
+
+#include <f16cintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+
+/// \brief Adds lower double-precision values in both operands and returns the
+///    sum in the lower 64 bits of the result. The upper 64 bits of the result
+///    are copied from the upper double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
+///    from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_sd(__m128d __a, __m128d __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+/// \brief Adds two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the sums of both
+///    operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2df)__a + (__v2df)__b);
+}
+
+/// \brief Subtracts the lower double-precision value of the second operand
+///    from the lower double-precision value of the first operand and returns
+///    the difference in the lower 64 bits of the result. The upper 64 bits of
+///    the result are copied from the upper double-precision value of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    difference of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_sd(__m128d __a, __m128d __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+/// \brief Subtracts two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the differences between
+///    both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2df)__a - (__v2df)__b);
+}
+
+/// \brief Multiplies lower double-precision values in both operands and returns
+///    the product in the lower 64 bits of the result. The upper 64 bits of the
+///    result are copied from the upper double-precision value of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    product of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_sd(__m128d __a, __m128d __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+/// \brief Multiplies two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the products of both
+///    operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2df)__a * (__v2df)__b);
+}
+
+/// \brief Divides the lower double-precision value of the first operand by the
+///    lower double-precision value of the second operand and returns the
+///    quotient in the lower 64 bits of the result. The upper 64 bits of the
+///    result are copied from the upper double-precision value of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing divisor.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    quotient of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_sd(__m128d __a, __m128d __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+/// \brief Performs an element-by-element division of two 128-bit vectors of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] containing the quotients of both
+///    operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2df)__a / (__v2df)__b);
+}
+
+/// \brief Calculates the square root of the lower double-precision value of
+///    the second operand and returns it in the lower 64 bits of the result.
+///    The upper 64 bits of the result are copied from the upper double-
+///    precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    upper 64 bits of this operand are copied to the upper 64 bits of the
+///    result.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    square root is calculated using the lower 64 bits of this operand.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    square root of the lower 64 bits of operand \a __b, and whose upper 64
+///    bits are copied from the upper 64 bits of operand \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Calculates the square root of the each of two values stored in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [2 x double] containing the square roots of the
+///    values in the operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_pd(__m128d __a)
+{
+  return __builtin_ia32_sqrtpd((__v2df)__a);
+}
+
+/// \brief Compares lower 64-bit double-precision values of both operands, and
+///    returns the lesser of the pair of values in the lower 64-bits of the
+///    result. The upper 64 bits of the result are copied from the upper double-
+///    precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    minimum value between both operands. The upper 64 bits are copied from
+///    the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Performs element-by-element comparison of the two 128-bit vectors of
+///    [2 x double] and returns the vector containing the lesser of each pair of
+///    values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the minimum values
+///    between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares lower 64-bits double-precision values of both operands, and
+///    returns the greater of the pair of values in the lower 64-bits of the
+///    result. The upper 64 bits of the result are copied from the upper double-
+///    precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands. The
+///    lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    maximum value between both operands. The upper 64 bits are copied from
+///    the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Performs element-by-element comparison of the two 128-bit vectors of
+///    [2 x double] and returns the vector containing the greater of each pair
+///    of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the maximum values
+///    between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+///    values between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_and_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2du)__a & (__v2du)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using
+///    the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+///    values in the second operand and the one's complement of the first
+///    operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_andnot_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)(~(__v2du)__a & (__v2du)__b);
+}
+
+/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
+///    values between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_or_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2du)__a | (__v2du)__b);
+}
+
+/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
+///    values between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_xor_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)((__v2du)__a ^ (__v2du)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] for equality. Each comparison yields 0h
+///    for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are less than those in the second operand. Each comparison
+///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are less than or equal to those in the second operand. Each
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are greater than those in the second operand. Each comparison
+///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are greater than or equal to those in the second operand. Each
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are ordered with respect to those in the second operand. A pair
+///    of double-precision values are "ordered" with respect to each other if
+///    neither value is a NaN. Each comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are unordered with respect to those in the second operand. A pair
+///    of double-precision values are "unordered" with respect to each other if
+///    one or both values are NaN. Each comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are unequal to those in the second operand. Each comparison
+///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not less than those in the second operand. Each comparison
+///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not less than or equal to those in the second operand. Each
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not greater than those in the second operand. Each
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
+}
+
+/// \brief Compares each of the corresponding double-precision values of the
+///    128-bit vectors of [2 x double] to determine if the values in the first
+///    operand are not greater than or equal to those in the second operand.
+///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \param __b
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_pd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] for equality. The
+///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than the corresponding value in
+///    the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0h for
+///    false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than the corresponding value
+///    in the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __b.
+/// \param __b
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0h for
+///    false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is "ordered" with respect to the
+///    corresponding value in the second parameter. The comparison yields 0h for
+///    false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
+///    "ordered" with respect to each other if neither value is a NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is "unordered" with respect to the
+///    corresponding value in the second parameter. The comparison yields 0h
+///    for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
+///    are "unordered" with respect to each other if one or both values are NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is unequal to the corresponding value in
+///    the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not less than the corresponding
+///    value in the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not less than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0h
+///    for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns  A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not greater than the corresponding
+///    value in the second parameter. The comparison yields 0h for false,
+///    FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is not greater than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0h
+///    for false, FFFFFFFFFFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_sd(__m128d __a, __m128d __b)
+{
+  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
+  return (__m128d) { __c[0], __a[1] };
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] for equality. The
+///    comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than the corresponding value in
+///    the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0 for
+///    false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than the corresponding value
+///    in the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0 for
+///    false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is unequal to the corresponding value in
+///    the second parameter. The comparison yields 0 for false, 1 for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] for equality. The
+///    comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than the corresponding value in
+///    the second parameter. The comparison yields 0 for false, 1 for true. If
+///    either of the two lower double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is less than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0 for
+///    false, 1 for true. If either of the two lower double-precision values is
+///    NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower double-precision values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than the corresponding value
+///    in the second parameter. The comparison yields 0 for false, 1 for true.
+///    If either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///     A 128-bit vector of [2 x double]. The lower double-precision value is
+///     compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///     lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is greater than or equal to the
+///    corresponding value in the second parameter. The comparison yields 0 for
+///    false, 1 for true.  If either of the two lower double-precision values
+///    is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Compares the lower double-precision floating-point values in each of
+///    the two 128-bit floating-point vectors of [2 x double] to determine if
+///    the value in the first parameter is unequal to the corresponding value in
+///    the second parameter. The comparison yields 0 for false, 1 for true. If
+///    either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __b.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision value is
+///    compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison result. If either of the two
+///    lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_sd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two single-precision floating-point
+///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
+///    The upper 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted values. The upper 64 bits are set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpd_ps(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2ps((__v2df)__a);
+}
+
+/// \brief Converts the lower two single-precision floating-point elements of a
+///    128-bit vector of [4 x float] into two double-precision floating-point
+///    values, returned in a 128-bit vector of [2 x double]. The upper two
+///    elements of the input vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower two single-precision
+///    floating-point elements are converted to double-precision values. The
+///    upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtps_pd(__m128 __a)
+{
+  return (__m128d) __builtin_convertvector(
+      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
+}
+
+/// \brief Converts the lower two integer elements of a 128-bit vector of
+///    [4 x i32] into two double-precision floating-point values, returned in a
+///    128-bit vector of [2 x double]. The upper two elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
+///    converted to double-precision values. The upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi32_pd(__m128i __a)
+{
+  return (__m128d) __builtin_convertvector(
+      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
+///    64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+///    converted values. The upper 64 bits are set to zero.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi32(__m128d __a)
+{
+  return __builtin_ia32_cvtpd2dq((__v2df)__a);
+}
+
+/// \brief Converts the low-order element of a 128-bit vector of [2 x double]
+///    into a 32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 32-bit signed integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsd_si32(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si((__v2df)__a);
+}
+
+/// \brief Converts the lower double-precision floating-point element of a
+///    128-bit vector of [2 x double], in the second parameter, into a
+///    single-precision floating-point value, returned in the lower 32 bits of a
+///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
+///    copied from the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
+///    copied to the upper 96 bits of the result.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower double-precision
+///    floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
+///    converted value from the second parameter. The upper 96 bits are copied
+///    from the upper 96 bits of the first parameter.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsd_ss(__m128 __a, __m128d __b)
+{
+  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
+}
+
+/// \brief Converts a 32-bit signed integer value, in the second parameter, into
+///    a double-precision floating-point value, returned in the lower 64 bits of
+///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+///    are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+///    copied to the upper 64 bits of the result.
+/// \param __b
+///    A 32-bit signed integer containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+///    converted value from the second parameter. The upper 64 bits are copied
+///    from the upper 64 bits of the first parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi32_sd(__m128d __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+/// \brief Converts the lower single-precision floating-point element of a
+///    128-bit vector of [4 x float], in the second parameter, into a
+///    double-precision floating-point value, returned in the lower 64 bits of
+///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+///    are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+///    copied to the upper 64 bits of the result.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower single-precision
+///    floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+///    converted value from the second parameter. The upper 64 bits are copied
+///    from the upper 64 bits of the first parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtss_sd(__m128d __a, __m128 __b)
+{
+  __a[0] = __b[0];
+  return __a;
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
+///    result of either conversion is inexact, the result is truncated (rounded
+///    towards zero) regardless of the current MXCSR setting. The upper 64 bits
+///    of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+///    converted values. The upper 64 bits are set to zero.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi32(__m128d __a)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
+}
+
+/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit
+///    signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 32-bit signed integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttsd_si32(__m128d __a)
+{
+  return __builtin_ia32_cvttsd2si((__v2df)__a);
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
+}
+
+/// \brief Converts the two double-precision floating-point elements of a
+///    128-bit vector of [2 x double] into two signed 32-bit integer values,
+///    returned in a 64-bit vector of [2 x i32]. If the result of either
+///    conversion is inexact, the result is truncated (rounded towards zero)
+///    regardless of the current MXCSR setting.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvttpd_pi32(__m128d __a)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
+}
+
+/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of
+///    [2 x i32] into two double-precision floating-point values, returned in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32].
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtpi32_pd(__m64 __a)
+{
+  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+}
+
+/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as
+///    a double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
+/// \returns A double-precision floating-point value copied from the lower 64
+///    bits of \a __a.
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm_cvtsd_f64(__m128d __a)
+{
+  return __a[0];
+}
+
+/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 16-byte aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_pd(double const *__dp)
+{
+  return *(__m128d*)__dp;
+}
+
+/// \brief Loads a double-precision floating-point value from a specified memory
+///    location and duplicates it to both vector elements of a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
+///
+/// \param __dp
+///    A pointer to a memory location containing a double-precision value.
+/// \returns A 128-bit vector of [2 x double] containing the loaded and
+///    duplicated values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load1_pd(double const *__dp)
+{
+  struct __mm_load1_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __u };
+}
+
+#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
+
+/// \brief Loads two double-precision values, in reverse order, from an aligned
+///    memory location into a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
+/// needed shuffling instructions. In AVX mode, the shuffling may be combined
+/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
+///
+/// \param __dp
+///    A 16-byte aligned pointer to an array of double-precision values to be
+///    loaded in reverse order.
+/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
+///    values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadr_pd(double const *__dp)
+{
+  __m128d __u = *(__m128d*)__dp;
+  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
+}
+
+/// \brief Loads a 128-bit floating-point vector of [2 x double] from an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadu_pd(double const *__dp)
+{
+  struct __loadu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_pd*)__dp)->__v;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si64(void const *__a)
+{
+  struct __loadu_si64 {
+    long long __v;
+  } __attribute__((__packed__, __may_alias__));
+  long long __u = ((struct __loadu_si64*)__a)->__v;
+  return (__m128i){__u, 0L};
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_sd(double const *__dp)
+{
+  struct __mm_load_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
+  return (__m128d){ __u, 0 };
+}
+
+/// \brief Loads a double-precision value into the high-order bits of a 128-bit
+///    vector of [2 x double]. The low-order bits are copied from the low-order
+///    bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [63:0] are written to bits [63:0] of the result.
+/// \param __dp
+///    A pointer to a 64-bit memory location containing a double-precision
+///    floating-point value that is loaded. The loaded value is written to bits
+///    [127:64] of the result. The address of the memory location does not have
+///    to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadh_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
+  return (__m128d){ __a[0], __u };
+}
+
+/// \brief Loads a double-precision value into the low-order bits of a 128-bit
+///    vector of [2 x double]. The high-order bits are copied from the
+///    high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [127:64] are written to bits [127:64] of the result.
+/// \param __dp
+///    A pointer to a 64-bit memory location containing a double-precision
+///    floating-point value that is loaded. The loaded value is written to bits
+///    [63:0] of the result. The address of the memory location does not have to
+///    be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadl_pd(__m128d __a, double const *__dp)
+{
+  struct __mm_loadl_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
+  return (__m128d){ __u, __a[1] };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] with
+///    unspecified content. This could be used as an argument to another
+///    intrinsic function where the argument is required but the value is not
+///    actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
+///    content.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_undefined_pd(void)
+{
+  return (__m128d)__builtin_ia32_undef128();
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
+///    64 bits of the vector are initialized with the specified double-precision
+///    floating-point value. The upper 64 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize the lower 64
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
+///    lower 64 bits contain the value of the parameter. The upper 64 bits are
+///    set to zero.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_sd(double __w)
+{
+  return (__m128d){ __w, 0 };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
+///    of the two double-precision floating-point vector elements set to the
+///    specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set1_pd(double __w)
+{
+  return (__m128d){ __w, __w };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]
+///    initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize the upper 64
+///    bits of the result.
+/// \param __x
+///    A double-precision floating-point value used to initialize the lower 64
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_pd(double __w, double __x)
+{
+  return (__m128d){ __x, __w };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double],
+///    initialized in reverse order with the specified double-precision
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize the lower 64
+///    bits of the result.
+/// \param __x
+///    A double-precision floating-point value used to initialize the upper 64
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setr_pd(double __w, double __x)
+{
+  return (__m128d){ __w, __x };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]
+///    initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [2 x double] with
+///    all elements set to zero.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setzero_pd(void)
+{
+  return (__m128d){ 0, 0 };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
+///    64 bits are set to the lower 64 bits of the second parameter. The upper
+///    64 bits are set to the upper 64 bits of the first parameter.
+//
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
+///    upper 64 bits of the result.
+/// \param __b
+///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
+///    lower 64 bits of the result.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_move_sd(__m128d __a, __m128d __b)
+{
+  return (__m128d){ __b[0], __a[1] };
+}
+
+/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_sd(double *__dp, __m128d __a)
+{
+  struct __mm_store_sd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_pd(double *__dp, __m128d __a)
+{
+  *(__m128d*)__dp = __a;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_pd(double *__dp, __m128d __a)
+{
+  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+  _mm_store_pd(__dp, __a);
+}
+
+/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory
+///    location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 16-byte aligned.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_pd1(double *__dp, __m128d __a)
+{
+  return _mm_store1_pd(__dp, __a);
+}
+
+/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory
+///    location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_pd(double *__dp, __m128d __a)
+{
+  struct __storeu_pd {
+    __m128d __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_pd*)__dp)->__v = __a;
+}
+
+/// \brief Stores two double-precision values, in reverse order, from a 128-bit
+///    vector of [2 x double] to a 16-byte aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to a shuffling instruction followed by a
+/// <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 16-byte aligned memory location that can store two
+///    double-precision values.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be reversed and
+///    stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_pd(double *__dp, __m128d __a)
+{
+  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
+  *(__m128d *)__dp = __a;
+}
+
+/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
+}
+
+/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pd(double *__dp, __m128d __a)
+{
+  struct __mm_storeh_pd_struct {
+    double __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
+}
+
+/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
+///    saving the lower 8 bits of each sum in the corresponding element of a
+///    128-bit result vector of [16 x i8]. The integer elements of both
+///    parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the sums of both
+///    parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qu)__a + (__v16qu)__b);
+}
+
+/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
+///    saving the lower 16 bits of each sum in the corresponding element of a
+///    128-bit result vector of [8 x i16]. The integer elements of both
+///    parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+/// \returns A 128-bit vector of [8 x i16] containing the sums of both
+///    parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hu)__a + (__v8hu)__b);
+}
+
+/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
+///    saving the lower 32 bits of each sum in the corresponding element of a
+///    128-bit result vector of [4 x i32]. The integer elements of both
+///    parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+/// \param __b
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the sums of both
+///    parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4su)__a + (__v4su)__b);
+}
+
+/// \brief Adds two signed or unsigned 64-bit integer values, returning the
+///    lower 64 bits of the sum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer.
+/// \param __b
+///    A 64-bit integer.
+/// \returns A 64-bit integer containing the sum of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_si64(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
+}
+
+/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
+///    saving the lower 64 bits of each sum in the corresponding element of a
+///    128-bit result vector of [2 x i64]. The integer elements of both
+///    parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64].
+/// \param __b
+///    A 128-bit vector of [2 x i64].
+/// \returns A 128-bit vector of [2 x i64] containing the sums of both
+///    parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a + (__v2du)__b);
+}
+
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+///    signed [16 x i8] vectors, saving each sum in the corresponding element of
+///    a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are
+///    saturated to 7Fh. Negative sums less than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [16 x i8] vector.
+/// \param __b
+///    A 128-bit signed [16 x i8] vector.
+/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
+///    both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+///    signed [8 x i16] vectors, saving each sum in the corresponding element of
+///    a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh
+///    are saturated to 7FFFh. Negative sums less than 8000h are saturated to
+///    8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
+///    both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
+///    of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh
+///    are saturated to FFh. Negative sums are saturated to 00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
+///    of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Adds, with saturation, the corresponding elements of two 128-bit
+///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
+///    of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh
+///    are saturated to FFFFh. Negative sums are saturated to 0000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+///    A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
+///    of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Computes the rounded avarages of corresponding elements of two
+///    128-bit unsigned [16 x i8] vectors, saving each result in the
+///    corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
+///    averages of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Computes the rounded avarages of corresponding elements of two
+///    128-bit unsigned [8 x i16] vectors, saving each result in the
+///    corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+///    A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
+///    averages of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
+///    vectors, producing eight intermediate 32-bit signed integer products, and
+///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
+///    [4 x i32] vector. For example, bits [15:0] of both parameters are
+///    multiplied producing a 32-bit product, bits [31:16] of both parameters
+///    are multiplied producing a 32-bit product, and the sum of those two
+///    products becomes bits [31:0] of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
+///    of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
+///    vectors, saving the greater value from each comparison in the
+///    corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
+///    each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
+///    vectors, saving the greater value from each comparison in the
+///    corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
+///    each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
+///    vectors, saving the smaller value from each comparison in the
+///    corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
+///    each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
+///    vectors, saving the smaller value from each comparison in the
+///    corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+///    A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
+///    each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Multiplies the corresponding elements of two signed [8 x i16]
+///    vectors, saving the upper 16 bits of each 32-bit product in the
+///    corresponding element of a 128-bit signed [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
+///    each of the eight 32-bit products.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Multiplies the corresponding elements of two unsigned [8 x i16]
+///    vectors, saving the upper 16 bits of each 32-bit product in the
+///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
+///
+/// \param __a
+///    A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+///    A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
+///    of each of the eight 32-bit products.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Multiplies the corresponding elements of two signed [8 x i16]
+///    vectors, saving the lower 16 bits of each 32-bit product in the
+///    corresponding element of a 128-bit signed [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit signed [8 x i16] vector.
+/// \param __b
+///    A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
+///    each of the eight 32-bit products.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hu)__a * (__v8hu)__b);
+}
+
+/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
+///    of the two 64-bit integer vectors and returns the 64-bit unsigned
+///    product.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer containing one of the source operands.
+/// \param __b
+///    A 64-bit integer containing one of the source operands.
+/// \returns A 64-bit integer vector containing the product of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mul_su32(__m64 __a, __m64 __b)
+{
+  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+}
+
+/// \brief Multiplies 32-bit unsigned integer values contained in the lower
+///    bits of the corresponding elements of two [2 x i64] vectors, and returns
+///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
+///
+/// \param __a
+///    A [2 x i64] vector containing one of the source operands.
+/// \param __b
+///    A [2 x i64] vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the product of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epu32(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief Computes the absolute differences of corresponding 8-bit integer
+///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
+///    separately sums the second 8 absolute differences. Packss these two
+///    unsigned 16-bit integer sums into the upper and lower elements of a
+///    [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the sums of the sets of absolute
+///    differences between both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sad_epu8(__m128i __a, __m128i __b)
+{
+  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Subtracts the corresponding 8-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qu)__a - (__v16qu)__b);
+}
+
+/// \brief Subtracts the corresponding 16-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hu)__a - (__v8hu)__b);
+}
+
+/// \brief Subtracts the corresponding 32-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4su)__a - (__v4su)__b);
+}
+
+/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
+///    difference to the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the minuend.
+/// \param __b
+///    A 64-bit integer vector containing the subtrahend.
+/// \returns A 64-bit integer vector containing the difference of the values in
+///    the operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_si64(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
+}
+
+/// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a - (__v2du)__b);
+}
+
+/// \brief Subtracts corresponding 8-bit signed integer values in the input and
+///    returns the differences in the corresponding bytes in the destination.
+///    Differences greater than 7Fh are saturated to 7Fh, and differences less
+///    than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Subtracts corresponding 16-bit signed integer values in the input and
+///    returns the differences in the corresponding bytes in the destination.
+///    Differences greater than 7FFFh are saturated to 7FFFh, and values less
+///    than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+///    in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Subtracts corresponding 8-bit unsigned integer values in the input
+///    and returns the differences in the corresponding bytes in the
+///    destination. Differences less than 00h are saturated to 00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+///    differences of the values in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Subtracts corresponding 16-bit unsigned integer values in the input
+///    and returns the differences in the corresponding bytes in the
+///    destination. Differences less than 0000h are saturated to 0000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+///    differences of the values in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise AND of the values
+///    in both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_and_si128(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a & (__v2du)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
+///    one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing the left source operand. The one's complement
+///    of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector containing the right source operand.
+/// \returns A 128-bit integer vector containing the bitwise AND of the one's
+///    complement of the first operand and the values in the second operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_andnot_si128(__m128i __a, __m128i __b)
+{
+  return (__m128i)(~(__v2du)__a & (__v2du)__b);
+}
+/// \brief Performs a bitwise OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise OR of the values
+///    in both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_or_si128(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a | (__v2du)__b);
+}
+
+/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
+///    values in both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_xor_si128(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v2du)__a ^ (__v2du)__b);
+}
+
+/// \brief Left-shifts the 128-bit integer vector operand by the specified
+///    number of bytes. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_slli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to left-shift operand
+///    \a a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
+#define _mm_slli_si128(a, imm) __extension__ ({                              \
+  (__m128i)__builtin_shufflevector(                                          \
+                                 (__v16qi)_mm_setzero_si128(),               \
+                                 (__v16qi)(__m128i)(a),                      \
+                                 ((char)(imm)&0xF0) ?  0 : 16 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  1 : 17 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  2 : 18 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  3 : 19 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  4 : 20 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  5 : 21 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  6 : 22 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  7 : 23 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  8 : 24 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ?  9 : 25 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \
+                                 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); })
+
+#define _mm_bslli_si128(a, imm) \
+  _mm_slli_si128((a), (imm))
+
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
+}
+
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
+}
+
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
+}
+
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
+}
+
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psllqi128((__v2di)__a, __count);
+}
+
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to left-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
+}
+
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
+}
+
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
+}
+
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
+}
+
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
+}
+
+/// \brief Right-shifts the 128-bit integer vector operand by the specified
+///    number of bytes. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_srli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to right-shift operand
+///    \a a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
+#define _mm_srli_si128(a, imm) __extension__ ({                              \
+  (__m128i)__builtin_shufflevector(                                          \
+                                 (__v16qi)(__m128i)(a),                      \
+                                 (__v16qi)_mm_setzero_si128(),               \
+                                 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0,  \
+                                 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1,  \
+                                 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2,  \
+                                 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3,  \
+                                 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4,  \
+                                 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5,  \
+                                 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6,  \
+                                 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7,  \
+                                 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8,  \
+                                 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9,  \
+                                 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \
+                                 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \
+                                 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \
+                                 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \
+                                 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \
+                                 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); })
+
+#define _mm_bsrli_si128(a, imm) \
+  _mm_srli_si128((a), (imm))
+
+/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi16(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
+}
+
+/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi16(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
+}
+
+/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi32(__m128i __a, int __count)
+{
+  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
+}
+
+/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi32(__m128i __a, __m128i __count)
+{
+  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
+}
+
+/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to right-shift each value
+///    in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi64(__m128i __a, int __count)
+{
+  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
+}
+
+/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits [63:0] specify the number of bits
+///    to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi64(__m128i __a, __m128i __count)
+{
+  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
+}
+
+/// \brief Compares each of the corresponding 8-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false, FFh
+///    for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v16qi)__a == (__v16qi)__b);
+}
+
+/// \brief Compares each of the corresponding 16-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false, FFFFh
+///    for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a == (__v8hi)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit values of the 128-bit
+///    integer vectors for equality. Each comparison yields 0h for false,
+///    FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a == (__v4si)__b);
+}
+
+/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are
+///    greater than those in the second operand. Each comparison yields 0h for
+///    false, FFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi8(__m128i __a, __m128i __b)
+{
+  /* This function always performs a signed comparison, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)((__v16qs)__a > (__v16qs)__b);
+}
+
+/// \brief Compares each of the corresponding signed 16-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are greater than those in the second operand. Each comparison yields 0h
+///    for false, FFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v8hi)__a > (__v8hi)__b);
+}
+
+/// \brief Compares each of the corresponding signed 32-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are greater than those in the second operand. Each comparison yields 0h
+///    for false, FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)((__v4si)__a > (__v4si)__b);
+}
+
+/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are less
+///    than those in the second operand. Each comparison yields 0h for false,
+///    FFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi8(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi8(__b, __a);
+}
+
+/// \brief Compares each of the corresponding signed 16-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are less than those in the second operand. Each comparison yields 0h for
+///    false, FFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi16(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi16(__b, __a);
+}
+
+/// \brief Compares each of the corresponding signed 32-bit values of the
+///    128-bit integer vectors to determine if the values in the first operand
+///    are less than those in the second operand. Each comparison yields 0h for
+///    false, FFFFFFFFh for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi32(__m128i __a, __m128i __b)
+{
+  return _mm_cmpgt_epi32(__b, __a);
+}
+
+#ifdef __x86_64__
+/// \brief Converts a 64-bit signed integer value from the second operand into a
+///    double-precision value and returns it in the lower element of a [2 x
+///    double] vector; the upper element of the returned vector is copied from
+///    the upper element of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
+///    copied to the upper 64 bits of the destination.
+/// \param __b
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    converted value of the second operand. The upper 64 bits are copied from
+///    the upper 64 bits of the first operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi64_sd(__m128d __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+///    64-bit signed integer value, according to the current rounding mode.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 64-bit signed integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsd_si64(__m128d __a)
+{
+  return __builtin_ia32_cvtsd2si64((__v2df)__a);
+}
+
+/// \brief Converts the first (lower) element of a vector of [2 x double] into a
+///    64-bit signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+///    conversion.
+/// \returns A 64-bit signed integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_si64(__m128d __a)
+{
+  return __builtin_ia32_cvttsd2si64((__v2df)__a);
+}
+#endif
+
+/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi32_ps(__m128i __a)
+{
+  return __builtin_ia32_cvtdq2ps((__v4si)__a);
+}
+
+/// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit integer vector of [4 x i32] containing the converted
+///    values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
+}
+
+/// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
+///    truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x i32] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi32(__m128 __a)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
+}
+
+/// \brief Returns a vector of [4 x i32] where the lowest element is the input
+///    operand and the remaining elements are zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __a
+///    A 32-bit signed integer operand.
+/// \returns A 128-bit vector of [4 x i32].
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si128(int __a)
+{
+  return (__m128i)(__v4si){ __a, 0, 0, 0 };
+}
+
+#ifdef __x86_64__
+/// \brief Returns a vector of [2 x i64] where the lower element is the input
+///    operand and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x i64] containing the converted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si128(long long __a)
+{
+  return (__m128i){ __a, 0 };
+}
+#endif
+
+/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
+///    32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __a
+///    A vector of [4 x i32]. The least significant 32 bits are moved to the
+///    destination.
+/// \returns A 32-bit signed integer containing the moved value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si32(__m128i __a)
+{
+  __v4si __b = (__v4si)__a;
+  return __b[0];
+}
+
+#ifdef __x86_64__
+/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
+///    64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+///    A vector of [2 x i64]. The least significant 64 bits are moved to the
+///    destination.
+/// \returns A 64-bit signed integer containing the moved value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si64(__m128i __a)
+{
+  return __a[0];
+}
+#endif
+
+/// \brief Moves packed integer values from an aligned 128-bit memory location
+///    to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
+///
+/// \param __p
+///    An aligned pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_load_si128(__m128i const *__p)
+{
+  return *__p;
+}
+
+/// \brief Moves packed integer values from an unaligned 128-bit memory location
+///    to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si128(__m128i const *__p)
+{
+  struct __loadu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_si128*)__p)->__v;
+}
+
+/// \brief Returns a vector of [2 x i64] where the lower element is taken from
+///    the lower element of the operand, and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __p
+///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
+///    the destination.
+/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
+///    moved value. The higher order bits are cleared.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadl_epi64(__m128i const *__p)
+{
+  struct __mm_loadl_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+}
+
+/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
+///    This could be used as an argument to another intrinsic function where the
+///    argument is required but the value is not actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit vector of [4 x i32] with unspecified content.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_undefined_si128(void)
+{
+  return (__m128i)__builtin_ia32_undef128();
+}
+
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits of the
+///    destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits of the
+///    destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64x(long long __q1, long long __q0)
+{
+  return (__m128i){ __q0, __q1 };
+}
+
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits of the
+///    destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits of the
+///    destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64(__m64 __q1, __m64 __q0)
+{
+  return (__m128i){ (long long)__q0, (long long)__q1 };
+}
+
+/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
+///    the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i3
+///    A 32-bit integer value used to initialize bits [127:96] of the
+///    destination vector.
+/// \param __i2
+///    A 32-bit integer value used to initialize bits [95:64] of the destination
+///    vector.
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [63:32] of the destination
+///    vector.
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [31:0] of the destination
+///    vector.
+/// \returns An initialized 128-bit vector of [4 x i32] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
+{
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+}
+
+/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
+///    the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w7
+///    A 16-bit integer value used to initialize bits [127:112] of the
+///    destination vector.
+/// \param __w6
+///    A 16-bit integer value used to initialize bits [111:96] of the
+///    destination vector.
+/// \param __w5
+///    A 16-bit integer value used to initialize bits [95:80] of the destination
+///    vector.
+/// \param __w4
+///    A 16-bit integer value used to initialize bits [79:64] of the destination
+///    vector.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of the destination
+///    vector.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of the destination
+///    vector.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of the destination
+///    vector.
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the destination
+///    vector.
+/// \returns An initialized 128-bit vector of [8 x i16] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
+{
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+}
+
+/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
+///    the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b15
+///    Initializes bits [127:120] of the destination vector.
+/// \param __b14
+///    Initializes bits [119:112] of the destination vector.
+/// \param __b13
+///    Initializes bits [111:104] of the destination vector.
+/// \param __b12
+///    Initializes bits [103:96] of the destination vector.
+/// \param __b11
+///    Initializes bits [95:88] of the destination vector.
+/// \param __b10
+///    Initializes bits [87:80] of the destination vector.
+/// \param __b9
+///    Initializes bits [79:72] of the destination vector.
+/// \param __b8
+///    Initializes bits [71:64] of the destination vector.
+/// \param __b7
+///    Initializes bits [63:56] of the destination vector.
+/// \param __b6
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b5
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b4
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b3
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b2
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b1
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b0
+///    Initializes bits [7:0] of the destination vector.
+/// \returns An initialized 128-bit vector of [16 x i8] containing the values
+///    provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
+{
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+/// \brief Initializes both values in a 128-bit integer vector with the
+///    specified 64-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q
+///    Integer value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit integer vector of [2 x i64] with both
+///    elements containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64x(long long __q)
+{
+  return (__m128i){ __q, __q };
+}
+
+/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
+///    specified 64-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __q
+///    A 64-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [2 x i64] with all elements
+///    containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64(__m64 __q)
+{
+  return (__m128i){ (long long)__q, (long long)__q };
+}
+
+/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
+///    specified 32-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i
+///    A 32-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [4 x i32] with all elements
+///    containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi32(int __i)
+{
+  return (__m128i)(__v4si){ __i, __i, __i, __i };
+}
+
+/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
+///    specified 16-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w
+///    A 16-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [8 x i16] with all elements
+///    containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi16(short __w)
+{
+  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
+///    specified 8-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b
+///    An 8-bit value used to initialize the elements of the destination integer
+///    vector.
+/// \returns An initialized 128-bit vector of [16 x i8] with all elements
+///    containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi8(char __b)
+{
+  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
+}
+
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+///   instruction.
+///
+/// \param __q0
+///    A 64-bit integral value used to initialize the lower 64 bits of the
+///    result.
+/// \param __q1
+///    A 64-bit integral value used to initialize the upper 64 bits of the
+///    result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi64(__m64 __q0, __m64 __q1)
+{
+  return (__m128i){ (long long)__q0, (long long)__q1 };
+}
+
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
+{
+  return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+}
+
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w0
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w1
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w2
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w3
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w4
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w5
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w6
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w7
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
+{
+  return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+}
+
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b0
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b1
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b2
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b3
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b4
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b5
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b6
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b7
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b8
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b9
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
+{
+  return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+/// \brief Creates a 128-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit integer vector with all elements set to
+///    zero.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0LL, 0LL };
+}
+
+/// \brief Stores a 128-bit integer vector to a memory location aligned on a
+///    128-bit boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location that will receive the integer
+///    values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_si128(__m128i *__p, __m128i __b)
+{
+  *__p = __b;
+}
+
+/// \brief Stores a 128-bit integer vector to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si128(__m128i *__p, __m128i __b)
+{
+  struct __storeu_si128 {
+    __m128i __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si128*)__p)->__v = __b;
+}
+
+/// \brief Moves bytes selected by the mask from the first operand to the
+///    specified unaligned memory location. When a mask bit is 1, the
+///    corresponding byte is written, otherwise it is not written. To minimize
+///    caching, the date is flagged as non-temporal (unlikely to be used again
+///    soon). Exception and trap behavior for elements not selected for storage
+///    to memory are implementation dependent.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
+///   instruction.
+///
+/// \param __d
+///    A 128-bit integer vector containing the values to be moved.
+/// \param __n
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each byte represents the mask bits.
+/// \param __p
+///    A pointer to an unaligned 128-bit memory location where the specified
+///    values are moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
+{
+  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
+}
+
+/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
+///    a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location that will receive the lower 64 bits
+///    of the integer vector parameter.
+/// \param __a
+///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
+///    value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_epi64(__m128i *__p, __m128i __a)
+{
+  struct __mm_storel_epi64_struct {
+    long long __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
+}
+
+/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
+///    aligned memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A vector of [2 x double] containing the 64-bit values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_pd(double *__p, __m128d __a)
+{
+  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
+}
+
+/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A 128-bit integer vector containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si128(__m128i *__p, __m128i __a)
+{
+  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
+}
+
+/// \brief Stores a 32-bit integer value in the specified memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
+///
+/// \param __p
+///    A pointer to the 32-bit memory location used to store the value.
+/// \param __a
+///    A 32-bit integer containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si32(int *__p, int __a)
+{
+  __builtin_ia32_movnti(__p, __a);
+}
+
+#ifdef __x86_64__
+/// \brief Stores a 64-bit integer value in the specified memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
+///
+/// \param __p
+///    A pointer to the 64-bit memory location used to store the value.
+/// \param __a
+///    A 64-bit integer containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si64(long long *__p, long long __a)
+{
+  __builtin_ia32_movnti64(__p, __a);
+}
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// \brief The cache line containing \a __p is flushed and invalidated from all
+///    caches in the coherency domain.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
+///
+/// \param __p
+///    A pointer to the memory location used to identify the cache line to be
+///    flushed.
+void _mm_clflush(void const *);
+
+/// \brief Forces strong memory ordering (serialization) between load
+///    instructions preceding this instruction and load instructions following
+///    this instruction, ensuring the system completes all previous loads before
+///    executing subsequent loads.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
+///
+void _mm_lfence(void);
+
+/// \brief Forces strong memory ordering (serialization) between load and store
+///    instructions preceding this instruction and load and store instructions
+///    following this instruction, ensuring that the system completes all
+///    previous memory accesses before executing subsequent memory accesses.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
+///
+void _mm_mfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
+///
+/// \param __a
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to a 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the lower 64 bits of the result.
+/// \param __b
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to a 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+///    operands into 16-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit unsigned integers, and packs the results into the
+///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
+///    the immediate-value parameter as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __imm
+///    An immediate value. Bits [3:0] selects values from \a __a to be assigned
+///    to bits[15:0] of the result. \n
+///    000: assign values from bits [15:0] of \a __a. \n
+///    001: assign values from bits [31:16] of \a __a. \n
+///    010: assign values from bits [47:32] of \a __a. \n
+///    011: assign values from bits [63:48] of \a __a. \n
+///    100: assign values from bits [79:64] of \a __a. \n
+///    101: assign values from bits [95:80] of \a __a. \n
+///    110: assign values from bits [111:96] of \a __a. \n
+///    111: assign values from bits [127:112] of \a __a.
+/// \returns An integer, whose lower 16 bits are selected from the 128-bit
+///    integer vector parameter and the remaining bits are assigned zeros.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_extract_epi16(__m128i __a, int __imm)
+{
+  __v8hi __b = (__v8hi)__a;
+  return (unsigned short)__b[__imm & 7];
+}
+
+/// \brief Constructs a 128-bit integer vector by first making a copy of the
+///    128-bit integer vector parameter, and then inserting the lower 16 bits
+///    of an integer parameter into an offset specified by the immediate-value
+///    parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
+///    result and then one of the eight elements in the result is replaced by
+///    the lower 16 bits of \a __b.
+/// \param __b
+///    An integer. The lower 16 bits of this parameter are written to the
+///    result beginning at an offset specified by \a __imm.
+/// \param __imm
+///    An immediate value specifying the bit offset in the result at which the
+///    lower 16 bits of \a __b are written.
+/// \returns A 128-bit integer vector containing the constructed values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_epi16(__m128i __a, int __b, int __imm)
+{
+  __v8hi __c = (__v8hi)__a;
+  __c[__imm & 7] = __b;
+  return (__m128i)__c;
+}
+
+/// \brief Copies the values of the most significant bits from each 8-bit
+///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
+///    value, zero-extends the value, and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bits from each 8-bit element in \a __a,
+///    written to bits [15:0]. The other bits are assigned zeros.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_epi8(__m128i __a)
+{
+  return __builtin_ia32_pmovmskb128((__v16qi)__a);
+}
+
+/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit
+///    elements of a 128-bit integer vector parameter, using the immediate-value
+///    parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param imm
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from a. The destinations within the 128-bit destination are assigned
+///    values as follows: \n
+///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
+///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
+///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
+///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
+///    Bit value assignments: \n
+///    00: assign values from bits [31:0] of \a a. \n
+///    01: assign values from bits [63:32] of \a a. \n
+///    10: assign values from bits [95:64] of \a a. \n
+///    11: assign values from bits [127:96] of \a a.
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
+                                   (__v4si)_mm_undefined_si128(), \
+                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
+                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
+
+/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
+///    [127:64] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from \a a. \n
+///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
+///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
+///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
+///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
+///    Bit value assignments: \n
+///    00: assign values from bits [15:0] of \a a. \n
+///    01: assign values from bits [31:16] of \a a. \n
+///    10: assign values from bits [47:32] of \a a. \n
+///    11: assign values from bits [63:48] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_undefined_si128(), \
+                                   ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
+                                   ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
+                                   4, 5, 6, 7); })
+
+/// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
+///    [63:0] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from \a a. \n
+///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
+///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
+///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
+///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
+///    Bit value assignments: \n
+///    00: assign values from bits [79:64] of \a a. \n
+///    01: assign values from bits [95:80] of \a a. \n
+///    10: assign values from bits [111:96] of \a a. \n
+///    11: assign values from bits [127:112] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
+                                   (__v8hi)_mm_undefined_si128(), \
+                                   0, 1, 2, 3, \
+                                   4 + (((imm) >> 0) & 0x3), \
+                                   4 + (((imm) >> 2) & 0x3), \
+                                   4 + (((imm) >> 4) & 0x3), \
+                                   4 + (((imm) >> 6) & 0x3)); })
+
+/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
+///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+///    Bits [71:64] are written to bits [7:0] of the result. \n
+///    Bits [79:72] are written to bits [23:16] of the result. \n
+///    Bits [87:80] are written to bits [39:32] of the result. \n
+///    Bits [95:88] are written to bits [55:48] of the result. \n
+///    Bits [103:96] are written to bits [71:64] of the result. \n
+///    Bits [111:104] are written to bits [87:80] of the result. \n
+///    Bits [119:112] are written to bits [103:96] of the result. \n
+///    Bits [127:120] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8]. \n
+///    Bits [71:64] are written to bits [15:8] of the result. \n
+///    Bits [79:72] are written to bits [31:24] of the result. \n
+///    Bits [87:80] are written to bits [47:40] of the result. \n
+///    Bits [95:88] are written to bits [63:56] of the result. \n
+///    Bits [103:96] are written to bits [79:72] of the result. \n
+///    Bits [111:104] are written to bits [95:88] of the result. \n
+///    Bits [119:112] are written to bits [111:104] of the result. \n
+///    Bits [127:120] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
+///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [15:0] of the result. \n
+///    Bits [95:80] are written to bits [47:32] of the result. \n
+///    Bits [111:96] are written to bits [79:64] of the result. \n
+///    Bits [127:112] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [31:16] of the result. \n
+///    Bits [95:80] are written to bits [63:48] of the result. \n
+///    Bits [111:96] are written to bits [95:80] of the result. \n
+///    Bits [127:112] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+}
+
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [95:64] are written to bits [31:0] of the destination. \n
+///    Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [95:64] are written to bits [64:32] of the destination. \n
+///    Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
+}
+
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
+}
+
+/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
+///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8]. \n
+///    Bits [7:0] are written to bits [7:0] of the result. \n
+///    Bits [15:8] are written to bits [23:16] of the result. \n
+///    Bits [23:16] are written to bits [39:32] of the result. \n
+///    Bits [31:24] are written to bits [55:48] of the result. \n
+///    Bits [39:32] are written to bits [71:64] of the result. \n
+///    Bits [47:40] are written to bits [87:80] of the result. \n
+///    Bits [55:48] are written to bits [103:96] of the result. \n
+///    Bits [63:56] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+///    Bits [7:0] are written to bits [15:8] of the result. \n
+///    Bits [15:8] are written to bits [31:24] of the result. \n
+///    Bits [23:16] are written to bits [47:40] of the result. \n
+///    Bits [31:24] are written to bits [63:56] of the result. \n
+///    Bits [39:32] are written to bits [79:72] of the result. \n
+///    Bits [47:40] are written to bits [95:88] of the result. \n
+///    Bits [55:48] are written to bits [111:104] of the result. \n
+///    Bits [63:56] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi8(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
+}
+
+/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
+///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
+///    [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [15:0] of the result. \n
+///    Bits [31:16] are written to bits [47:32] of the result. \n
+///    Bits [47:32] are written to bits [79:64] of the result. \n
+///    Bits [63:48] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [31:16] of the result. \n
+///    Bits [31:16] are written to bits [63:48] of the result. \n
+///    Bits [47:32] are written to bits [95:80] of the result. \n
+///    Bits [63:48] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi16(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+}
+
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [31:0] are written to bits [31:0] of the destination. \n
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32]. \n
+///    Bits [31:0] are written to bits [64:32] of the destination. \n
+///    Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi32(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
+}
+
+/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
+///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [63:0] are written to bits [63:0] of the destination. \n
+/// \param __b
+///    A 128-bit vector of [2 x i64]. \n
+///    Bits [63:0] are written to bits [127:64] of the destination. \n
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi64(__m128i __a, __m128i __b)
+{
+  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
+}
+
+/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
+///    integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_movepi64_pi64(__m128i __a)
+{
+  return (__m64)__a[0];
+}
+
+/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+///    upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction.
+///
+/// \param __a
+///    A 64-bit value.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movpi64_epi64(__m64 __a)
+{
+  return (__m128i){ (long long)__a, 0 };
+}
+
+/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
+///    integer vector, zeroing the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_move_epi64(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
+}
+
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpackhi_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
+}
+
+/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double]. \n
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpacklo_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
+}
+
+/// \brief Extracts the sign bits of the double-precision values in the 128-bit
+///    vector of [2 x double], zero-extends the value, and writes it to the
+///    low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values with sign bits to
+///    be extracted.
+/// \returns The sign bits from each of the double-precision elements in \a __a,
+///    written to bits [1:0]. The remaining bits are assigned values of zero.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_pd(__m128d __a)
+{
+  return __builtin_ia32_movmskpd((__v2df)__a);
+}
+
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two
+///    128-bit vector parameters of [2 x double], using the immediate-value
+///     parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param i
+///    An 8-bit immediate value. The least significant two bits specify which
+///    elements to copy from a and b: \n
+///    Bit[0] = 0: lower element of a copied to lower element of result. \n
+///    Bit[0] = 1: upper element of a copied to lower element of result. \n
+///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
+///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
+/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
+#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
+                                   0 + (((i) >> 0) & 0x1), \
+                                   2 + (((i) >> 1) & 0x1)); })
+
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castpd_ps(__m128d __a)
+{
+  return (__m128)__a;
+}
+
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castpd_si128(__m128d __a)
+{
+  return (__m128i)__a;
+}
+
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castps_pd(__m128 __a)
+{
+  return (__m128d)__a;
+}
+
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castps_si128(__m128 __a)
+{
+  return (__m128i)__a;
+}
+
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castsi128_ps(__m128i __a)
+{
+  return (__m128)__a;
+}
+
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castsi128_pd(__m128i __a)
+{
+  return (__m128d)__a;
+}
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// \brief Indicates that a spin loop is being executed for the purposes of
+///    optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
+///
+void _mm_pause(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+#undef __DEFAULT_FN_ATTRS
+
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+
+#endif /* __EMMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/f16cintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/f16cintrin.h
new file mode 100644
index 000000000..180712ffc
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/f16cintrin.h
@@ -0,0 +1,124 @@
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
+#endif
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+
+/// \brief Converts a 16-bit half-precision float value into a 32-bit float
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 16-bit half-precision float value.
+/// \returns The converted 32-bit float value.
+static __inline float __DEFAULT_FN_ATTRS
+_cvtsh_ss(unsigned short __a)
+{
+  __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
+  __v4sf r = __builtin_ia32_vcvtph2ps(v);
+  return r[0];
+}
+
+/// \brief Converts a 32-bit single-precision float value to a 16-bit
+///    half-precision float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _cvtss_sh(float a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 32-bit single-precision float value to be converted to a 16-bit
+///    half-precision float value.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns The converted 16-bit half-precision float value.
+#define _cvtss_sh(a, imm)  \
+  ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+                                                      (imm)))[0]))
+
+/// \brief Converts a 128-bit vector containing 32-bit float values into a
+///    128-bit vector containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 128-bit vector containing 32-bit float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing converted 16-bit half-precision float
+///    values. The lower 64 bits are used to store the converted 16-bit
+///    half-precision floating-point values.
+#define _mm_cvtps_ph(a, imm) \
+  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
+
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 128-bit vector containing 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values. The lower
+///    64 bits are used in the conversion.
+/// \returns A 128-bit vector of [4 x float] containing converted float values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_cvtph_ps(__m128i __a)
+{
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __F16CINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/float.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/float.h
new file mode 100644
index 000000000..0f453d87c
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/float.h
@@ -0,0 +1,137 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* If we're on MinGW, fall back to the system's float.h, which might have
+ * additional definitions provided for Windows.
+ * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ *
+ * Also fall back on Darwin to allow additional definitions and
+ * implementation-defined values.
+ */
+#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
+    __STDC_HOSTED__ && __has_include_next(<float.h>)
+#  include_next <float.h>
+
+/* Undefine anything that we'll be redefining below. */
+#  undef FLT_EVAL_METHOD
+#  undef FLT_ROUNDS
+#  undef FLT_RADIX
+#  undef FLT_MANT_DIG
+#  undef DBL_MANT_DIG
+#  undef LDBL_MANT_DIG
+#  if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#    undef DECIMAL_DIG
+#  endif
+#  undef FLT_DIG
+#  undef DBL_DIG
+#  undef LDBL_DIG
+#  undef FLT_MIN_EXP
+#  undef DBL_MIN_EXP
+#  undef LDBL_MIN_EXP
+#  undef FLT_MIN_10_EXP
+#  undef DBL_MIN_10_EXP
+#  undef LDBL_MIN_10_EXP
+#  undef FLT_MAX_EXP
+#  undef DBL_MAX_EXP
+#  undef LDBL_MAX_EXP
+#  undef FLT_MAX_10_EXP
+#  undef DBL_MAX_10_EXP
+#  undef LDBL_MAX_10_EXP
+#  undef FLT_MAX
+#  undef DBL_MAX
+#  undef LDBL_MAX
+#  undef FLT_EPSILON
+#  undef DBL_EPSILON
+#  undef LDBL_EPSILON
+#  undef FLT_MIN
+#  undef DBL_MIN
+#  undef LDBL_MIN
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#    undef FLT_TRUE_MIN
+#    undef DBL_TRUE_MIN
+#    undef LDBL_TRUE_MIN
+#    undef FLT_DECIMAL_DIG
+#    undef DBL_DECIMAL_DIG
+#    undef LDBL_DECIMAL_DIG
+#  endif
+#endif
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#  define DECIMAL_DIG __DECIMAL_DIG__
+#endif
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#  define FLT_TRUE_MIN __FLT_DENORM_MIN__
+#  define DBL_TRUE_MIN __DBL_DENORM_MIN__
+#  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#  define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
+#  define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
+#  define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
+#endif
+
+#endif /* __FLOAT_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/fma4intrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/fma4intrin.h
new file mode 100644
index 000000000..11aa8ceac
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/fma4intrin.h
@@ -0,0 +1,230 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMA4INTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/fmaintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/fmaintrin.h
new file mode 100644
index 000000000..0e2ef0b17
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/fmaintrin.h
@@ -0,0 +1,228 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMAINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/fxsrintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/fxsrintrin.h
new file mode 100644
index 000000000..786081ca8
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/fxsrintrin.h
@@ -0,0 +1,105 @@
+/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FXSRINTRIN_H
+#define __FXSRINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))
+
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave(void *__p)
+{
+  return __builtin_ia32_fxsave(__p);
+}
+
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor(void *__p)
+{
+  return __builtin_ia32_fxrstor(__p);
+}
+
+#ifdef __x86_64__
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave64(void *__p)
+{
+  return __builtin_ia32_fxsave64(__p);
+}
+
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor64(void *__p)
+{
+  return __builtin_ia32_fxrstor64(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/htmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/htmintrin.h
new file mode 100644
index 000000000..69c8d7bb5
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/htmintrin.h
@@ -0,0 +1,226 @@
+/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMINTRIN_H
+#define __HTMINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#ifdef __powerpc__
+
+#include <stdint.h>
+
+typedef uint64_t texasr_t;
+typedef uint32_t texasru_t;
+typedef uint32_t texasrl_t;
+typedef uintptr_t tfiar_t;
+typedef uintptr_t tfhar_t;
+
+#define _HTM_STATE(CR0) ((CR0 >> 1) & 0x3)
+#define _HTM_NONTRANSACTIONAL 0x0
+#define _HTM_SUSPENDED        0x1
+#define _HTM_TRANSACTIONAL    0x2
+
+#define _TEXASR_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (63-(BITNUM))) & ((1<<(SIZE))-1))
+#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1))
+
+#define _TEXASR_FAILURE_CODE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 8)
+#define _TEXASRU_FAILURE_CODE(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 8)
+
+#define _TEXASR_FAILURE_PERSISTENT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 1)
+#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1)
+
+#define _TEXASR_DISALLOWED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 8, 1)
+#define _TEXASRU_DISALLOWED(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 8, 1)
+
+#define _TEXASR_NESTING_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 9, 1)
+#define _TEXASRU_NESTING_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 9, 1)
+
+#define _TEXASR_FOOTPRINT_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 10, 1)
+#define _TEXASRU_FOOTPRINT_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 10, 1)
+
+#define _TEXASR_SELF_INDUCED_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 11, 1)
+#define _TEXASRU_SELF_INDUCED_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 11, 1)
+
+#define _TEXASR_NON_TRANSACTIONAL_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 12, 1)
+#define _TEXASRU_NON_TRANSACTIONAL_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 12, 1)
+
+#define _TEXASR_TRANSACTION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 13, 1)
+#define _TEXASRU_TRANSACTION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 13, 1)
+
+#define _TEXASR_TRANSLATION_INVALIDATION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 14, 1)
+#define _TEXASRU_TRANSLATION_INVALIDATION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 14, 1)
+
+#define _TEXASR_IMPLEMENTAION_SPECIFIC(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 15, 1)
+#define _TEXASRU_IMPLEMENTAION_SPECIFIC(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 15, 1)
+
+#define _TEXASR_INSTRUCTION_FETCH_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 16, 1)
+#define _TEXASRU_INSTRUCTION_FETCH_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 16, 1)
+
+#define _TEXASR_ABORT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 31, 1)
+#define _TEXASRU_ABORT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 31, 1)
+
+
+#define _TEXASR_SUSPENDED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 32, 1)
+
+#define _TEXASR_PRIVILEGE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 35, 2)
+
+#define _TEXASR_FAILURE_SUMMARY(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 36, 1)
+
+#define _TEXASR_TFIAR_EXACT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 37, 1)
+
+#define _TEXASR_ROT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 38, 1)
+
+#define _TEXASR_TRANSACTION_LEVEL(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 63, 12)
+
+#endif /* __powerpc */
+
+#ifdef __s390__
+
+/* Condition codes generated by tbegin  */
+#define _HTM_TBEGIN_STARTED       0
+#define _HTM_TBEGIN_INDETERMINATE 1
+#define _HTM_TBEGIN_TRANSIENT     2
+#define _HTM_TBEGIN_PERSISTENT    3
+
+/* The abort codes below this threshold are reserved for machine use.  */
+#define _HTM_FIRST_USER_ABORT_CODE 256
+
+/* The transaction diagnostic block is it is defined in the Principles
+   of Operation chapter 5-91.  */
+
+struct __htm_tdb {
+  unsigned char format;                /*   0 */
+  unsigned char flags;
+  unsigned char reserved1[4];
+  unsigned short nesting_depth;
+  unsigned long long abort_code;       /*   8 */
+  unsigned long long conflict_token;   /*  16 */
+  unsigned long long atia;             /*  24 */
+  unsigned char eaid;                  /*  32 */
+  unsigned char dxc;
+  unsigned char reserved2[2];
+  unsigned int program_int_id;
+  unsigned long long exception_id;     /*  40 */
+  unsigned long long bea;              /*  48 */
+  unsigned char reserved3[72];         /*  56 */
+  unsigned long long gprs[16];         /* 128 */
+} __attribute__((__packed__, __aligned__ (8)));
+
+
+/* Helper intrinsics to retry tbegin in case of transient failure.  */
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_null (int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_tdb (void *__tdb, int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_null(retry) : \
+   __builtin_tbegin_retry_tdb(tdb, retry))
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_null (int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_tdb (void *__tdb, int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry_nofloat(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_nofloat_null(retry) : \
+   __builtin_tbegin_retry_nofloat_tdb(tdb, retry))
+
+#endif /* __s390__ */
+
+#endif /* __HTMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/htmxlintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/htmxlintrin.h
new file mode 100644
index 000000000..16dc7056c
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/htmxlintrin.h
@@ -0,0 +1,363 @@
+/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMXLINTRIN_H
+#define __HTMXLINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#include <htmintrin.h>
+
+#ifdef __powerpc__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define _TEXASR_PTR(TM_BUF) \
+  ((texasr_t *)((TM_BUF)+0))
+#define _TEXASRU_PTR(TM_BUF) \
+  ((texasru_t *)((TM_BUF)+0))
+#define _TEXASRL_PTR(TM_BUF) \
+  ((texasrl_t *)((TM_BUF)+4))
+#define _TFIAR_PTR(TM_BUF) \
+  ((tfiar_t *)((TM_BUF)+8))
+
+typedef char TM_buff_type[16];
+
+/* This macro can be used to determine whether a transaction was successfully
+   started from the __TM_begin() and __TM_simple_begin() intrinsic functions
+   below.  */
+#define _HTM_TBEGIN_STARTED     1
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_simple_begin (void)
+{
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_begin (void* const __TM_buff)
+{
+  *_TEXASRL_PTR (__TM_buff) = 0;
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+#ifdef __powerpc64__
+  *_TEXASR_PTR (__TM_buff) = __builtin_get_texasr ();
+#else
+  *_TEXASRU_PTR (__TM_buff) = __builtin_get_texasru ();
+  *_TEXASRL_PTR (__TM_buff) = __builtin_get_texasr ();
+#endif
+  *_TFIAR_PTR (__TM_buff) = __builtin_get_tfiar ();
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_end (void)
+{
+  if (__builtin_expect (__builtin_tend (0), 1))
+    return 1;
+  return 0;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_abort (void)
+{
+  __builtin_tabort (0);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_named_abort (unsigned char const __code)
+{
+  __builtin_tabort (__code);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_resume (void)
+{
+  __builtin_tresume ();
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_suspend (void)
+{
+  __builtin_tsuspend ();
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_user_abort (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_named_user_abort (void* const __TM_buff, unsigned char *__code)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+
+  *__code = _TEXASRU_FAILURE_CODE (texasru);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_illegal (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_DISALLOWED (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_footprint_exceeded (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_nesting_depth (void* const __TM_buff)
+{
+  texasrl_t texasrl;
+
+  if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
+    {
+      texasrl = *_TEXASRL_PTR (__TM_buff);
+      if (!_TEXASR_FAILURE_SUMMARY (texasrl))
+        texasrl = 0;
+    }
+  else
+    texasrl = (texasrl_t) __builtin_get_texasr ();
+
+  return _TEXASR_TRANSACTION_LEVEL (texasrl);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_nested_too_deep(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_NESTING_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_conflict(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  /* Return TEXASR bits 11 (Self-Induced Conflict) through
+     14 (Translation Invalidation Conflict).  */
+  return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_failure_persistent(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_FAILURE_PERSISTENT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_address(void* const __TM_buff)
+{
+  return *_TFIAR_PTR (__TM_buff);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_code(void* const __TM_buff)
+{
+  return *_TEXASR_PTR (__TM_buff);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+#include <stdint.h>
+
+/* These intrinsics are being made available for compatibility with
+   the IBM XL compiler.  For documentation please see the "z/OS XL
+   C/C++ Programming Guide" publically available on the web.  */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_simple_begin ()
+{
+  return __builtin_tbegin_nofloat (0);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_begin (void* const __tdb)
+{
+  return __builtin_tbegin_nofloat (__tdb);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_end ()
+{
+  return __builtin_tend ();
+}
+
+static __inline void __attribute__((__always_inline__))
+__TM_abort ()
+{
+  return __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_named_abort (unsigned char const __code)
+{
+  return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + __code);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_non_transactional_store (void* const __addr, long long const __value)
+{
+  __builtin_non_tx_store ((uint64_t*)__addr, (uint64_t)__value);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_nesting_depth (void* const __tdb_ptr)
+{
+  int depth = __builtin_tx_nesting_depth ();
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (depth != 0)
+    return depth;
+
+  if (tdb->format != 1)
+    return 0;
+  return tdb->nesting_depth;
+}
+
+/* Transaction failure diagnostics */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_user_abort (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  return !!(tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_named_user_abort (void* const __tdb_ptr, unsigned char* __code)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
+    {
+      *__code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
+      return 1;
+    }
+  return 0;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_illegal (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 4 /* unfiltered program interruption */
+	      || tdb->abort_code == 11 /* restricted instruction */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_footprint_exceeded (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 7 /* fetch overflow */
+	      || tdb->abort_code == 8 /* store overflow */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_nested_too_deep (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_conflict (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 9 /* fetch conflict */
+	      || tdb->abort_code == 10 /* store conflict */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_failure_persistent (long const __result)
+{
+  return __result == _HTM_TBEGIN_PERSISTENT;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_address (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+  return tdb->atia;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_code (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return tdb->abort_code;
+}
+
+#endif /* __s390__ */
+
+#endif /* __HTMXLINTRIN_H  */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/ia32intrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/ia32intrin.h
new file mode 100644
index 000000000..492830010
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/ia32intrin.h
@@ -0,0 +1,73 @@
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  return __builtin_ia32_readeflags_u64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned long long __f)
+{
+  __builtin_ia32_writeeflags_u64(__f);
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  return __builtin_ia32_readeflags_u32();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned int __f)
+{
+  __builtin_ia32_writeeflags_u32(__f);
+}
+#endif /* !__x86_64__ */
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdpmc(int __A) {
+  return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtscp */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtscp(unsigned int *__A) {
+  return __builtin_ia32_rdtscp(__A);
+}
+
+#define _rdtsc() __rdtsc()
+
+#define _rdpmc(A) __rdpmc(A)
+
+#endif /* __IA32INTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/immintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/immintrin.h
new file mode 100644
index 000000000..7f91d49fb
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/immintrin.h
@@ -0,0 +1,318 @@
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MMX__)
+#include <mmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE3__)
+#include <pmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSSE3__)
+#include <tmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__SSE4_2__) || defined(__SSE4_1__))
+#include <smmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AES__) || defined(__PCLMUL__))
+#include <wmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLFLUSHOPT__)
+#include <clflushoptintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
+#include <avxintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
+#include <avx2intrin.h>
+
+/* The 256-bit versions of functions in f16cintrin.h.
+   Intel documents these as being in immintrin.h, and
+   they depend on typedefs from avxintrin.h. */
+
+/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
+///    containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 256-bit vector containing 32-bit single-precision float values to be
+///    converted to 16-bit half-precision float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision
+///    float values.
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
+
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values to be
+///    converted to 32-bit single-precision float values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+///    single-precision float values.
+static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+#endif /* __AVX2__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
+#include <bmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
+#include <bmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
+#include <lzcntintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
+#include <fmaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512F__)
+#include <avx512fintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VL__)
+#include <avx512vlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BW__)
+#include <avx512bwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
+#include <avx512cdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
+#include <avx512dqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512BW__))
+#include <avx512vlbwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512CD__))
+#include <avx512vlcdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512DQ__))
+#include <avx512vldqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512ER__)
+#include <avx512erintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512IFMA__)
+#include <avx512ifmaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
+#include <avx512ifmavlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI__)
+#include <avx512vbmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
+#include <avx512vbmivlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
+#include <avx512pfintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
+#include <pkuintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdrand32_step(__p);
+}
+
+/* __bit_scan_forward */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_forward(int __A) {
+  return __builtin_ctz(__A);
+}
+
+/* __bit_scan_reverse */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_reverse(int __A) {
+  return 31 - __builtin_clz(__A);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
+#ifdef __x86_64__
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u32(void)
+{
+  return __builtin_ia32_rdfsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u64(void)
+{
+  return __builtin_ia32_rdfsbase64();
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u32(void)
+{
+  return __builtin_ia32_rdgsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u64(void)
+{
+  return __builtin_ia32_rdgsbase64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrfsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrfsbase64(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrgsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrgsbase64(__V);
+}
+
+#endif
+#endif /* __FSGSBASE__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__)
+#include <rtmintrin.h>
+#include <xtestintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHA__)
+#include <shaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FXSR__)
+#include <fxsrintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__)
+#include <xsaveintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
+#include <xsaveoptintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEC__)
+#include <xsavecintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVES__)
+#include <xsavesintrin.h>
+#endif
+
+/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
+ * whereas others are also available at all times. */
+#include <adxintrin.h>
+
+#endif /* __IMMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/intrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/intrin.h
new file mode 100644
index 000000000..a35262af8
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/intrin.h
@@ -0,0 +1,1021 @@
+/* ===-------- intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+#if defined(__arm__)
+#include <armintr.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+float _m_to_float(__m64);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+static __inline__
+__int64 __emul(int, int);
+static __inline__
+unsigned __int64 __emulu(unsigned int, unsigned int);
+void __cdecl __fastfail(unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __int2c(void);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+void __llwpcb(void *);
+unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
+void __lwpval32(unsigned int, unsigned int, unsigned int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+static __inline__
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+void *__slwpcb(void);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+void __ud2(void);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+static __inline__
+unsigned char _interlockedbittestandset(long volatile *, long);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+__int64 _InterlockedcompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void);
+unsigned __int32 xbegin(void);
+void _xend(void);
+static __inline__
+#define _XCR_XFEATURE_ENABLED_MASK 0
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
+void __lwpval64(unsigned __int64, unsigned int, unsigned int);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_CompareandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+static __inline__
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
+static __inline__
+__int64 _mul128(__int64, __int64, __int64*);
+static __inline__
+unsigned __int64 _umul128(unsigned __int64,
+                          unsigned __int64,
+                          unsigned __int64*);
+
+#endif /* __x86_64__ */
+
+#if defined(__x86_64__) || defined(__arm__)
+
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
+
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest(long const *_BitBase, long _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_ACQUIRE);
+  return (_PrevVal >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED);
+  return (_PrevVal >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
+  long long _PrevVal =
+      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_acq(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_nf(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_rel(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_acq(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_nf(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_rel(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_acq(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_nf(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_rel(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_acq(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_nf(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_rel(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_acq(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_nf(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_rel(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_acq(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_nf(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_rel(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_acq(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_nf(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_rel(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_acq(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_nf(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_rel(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_acq(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_nf(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_rel(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_acq(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_nf(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_rel(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_acq(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_nf(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_rel(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_acq(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_nf(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_rel(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* readfs, readgs
+|* (Pointers in address space #256 and #257 are relative to the GS and FS
+|* segment registers, respectively.)
+\*----------------------------------------------------------------------------*/
+#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
+    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
+    (__offset))
+
+#ifdef __i386__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readfsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned char, __offset);
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readfsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned short, __offset);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readfsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+__readgsbyte(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned char, __offset);
+}
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__readgsword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned short, __offset);
+}
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readgsdword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned long, __offset);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readgsqword(unsigned long __offset) {
+  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
+}
+#endif
+#undef __ptr_to_addr_space
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx));
+}
+static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax;
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__halt(void) {
+  __asm__ volatile ("hlt");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__nop(void) {
+  __asm__ volatile ("nop");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readcr3(void) {
+  unsigned long __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+__writecr3(unsigned int __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/inttypes.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/inttypes.h
new file mode 100644
index 000000000..1d8eabab0
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/inttypes.h
@@ -0,0 +1,106 @@
+/*===---- inttypes.h - Standard header for integer printf macros ----------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_INTTYPES_H
+#define __CLANG_INTTYPES_H
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#error MSVC does not have inttypes.h prior to Visual Studio 2013
+#endif
+
+#include_next <inttypes.h>
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+/* MSVC headers define int32_t as int, but PRIx32 as "lx" instead of "x".
+ * This triggers format warnings, so fix it up here. */
+#undef PRId32
+#undef PRIdLEAST32
+#undef PRIdFAST32
+#undef PRIi32
+#undef PRIiLEAST32
+#undef PRIiFAST32
+#undef PRIo32
+#undef PRIoLEAST32
+#undef PRIoFAST32
+#undef PRIu32
+#undef PRIuLEAST32
+#undef PRIuFAST32
+#undef PRIx32
+#undef PRIxLEAST32
+#undef PRIxFAST32
+#undef PRIX32
+#undef PRIXLEAST32
+#undef PRIXFAST32
+
+#undef SCNd32
+#undef SCNdLEAST32
+#undef SCNdFAST32
+#undef SCNi32
+#undef SCNiLEAST32
+#undef SCNiFAST32
+#undef SCNo32
+#undef SCNoLEAST32
+#undef SCNoFAST32
+#undef SCNu32
+#undef SCNuLEAST32
+#undef SCNuFAST32
+#undef SCNx32
+#undef SCNxLEAST32
+#undef SCNxFAST32
+
+#define PRId32 "d"
+#define PRIdLEAST32 "d"
+#define PRIdFAST32 "d"
+#define PRIi32 "i"
+#define PRIiLEAST32 "i"
+#define PRIiFAST32 "i"
+#define PRIo32 "o"
+#define PRIoLEAST32 "o"
+#define PRIoFAST32 "o"
+#define PRIu32 "u"
+#define PRIuLEAST32 "u"
+#define PRIuFAST32 "u"
+#define PRIx32 "x"
+#define PRIxLEAST32 "x"
+#define PRIxFAST32 "x"
+#define PRIX32 "X"
+#define PRIXLEAST32 "X"
+#define PRIXFAST32 "X"
+
+#define SCNd32 "d"
+#define SCNdLEAST32 "d"
+#define SCNdFAST32 "d"
+#define SCNi32 "i"
+#define SCNiLEAST32 "i"
+#define SCNiFAST32 "i"
+#define SCNo32 "o"
+#define SCNoLEAST32 "o"
+#define SCNoFAST32 "o"
+#define SCNu32 "u"
+#define SCNuLEAST32 "u"
+#define SCNuFAST32 "u"
+#define SCNx32 "x"
+#define SCNxLEAST32 "x"
+#define SCNxFAST32 "x"
+#endif
+
+#endif /* __CLANG_INTTYPES_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/iso646.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/iso646.h
new file mode 100644
index 000000000..dca13c5ba
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/iso646.h
@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/limits.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/limits.h
new file mode 100644
index 000000000..f04187ced
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/limits.h
@@ -0,0 +1,118 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>.
+   Include it if we're hosted. */
+#if __STDC_HOSTED__ && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#endif
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long.
+   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
+ */
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/lzcntintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/lzcntintrin.h
new file mode 100644
index 000000000..3d2769da3
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/lzcntintrin.h
@@ -0,0 +1,118 @@
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__lzcnt16(unsigned short __X)
+{
+  return __X ? __builtin_clzs(__X) : 16;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__lzcnt32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_lzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+#ifdef __x86_64__
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__lzcnt64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_lzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LZCNTINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mm3dnow.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mm3dnow.h
new file mode 100644
index 000000000..294866c1d
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mm3dnow.h
@@ -0,0 +1,171 @@
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_m_femms(void) {
+  __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2id(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfadd(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmax(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmin(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmul(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcp(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrt(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsub(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fd(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/* Handle the 3dnowa instructions here. */
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2iw(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fw(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsf(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsi(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mm_malloc.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mm_malloc.h
new file mode 100644
index 000000000..305afd31a
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mm_malloc.h
@@ -0,0 +1,75 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+                                       __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+  if (__align == 1) {
+    return malloc(__size);
+  }
+
+  if (!(__align & (__align - 1)) && __align < sizeof(void *))
+    __align = sizeof(void *);
+
+  void *__mallocedMemory;
+#if defined(__MINGW32__)
+  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+  __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+  if (posix_memalign(&__mallocedMemory, __align, __size))
+    return 0;
+#endif
+
+  return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+  free(__p);
+}
+#endif
+
+#endif /* __MM_MALLOC_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mmintrin.h
new file mode 100644
index 000000000..e0c277a65
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mmintrin.h
@@ -0,0 +1,1549 @@
+/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MMINTRIN_H
+#define __MMINTRIN_H
+
+typedef long long __m64 __attribute__((__vector_size__(8)));
+
+typedef long long __v1di __attribute__((__vector_size__(8)));
+typedef int __v2si __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
+
+/// \brief Clears the MMX state by setting the state of the x87 stack registers
+///    to empty.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> EMMS </c> instruction.
+///
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_empty(void)
+{
+    __builtin_ia32_emms();
+}
+
+/// \brief Constructs a 64-bit integer vector, setting the lower 32 bits to the
+///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __i
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
+///    parameter. The upper 32 bits are set to 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si64(int __i)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+}
+
+/// \brief Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
+///    signed integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector.
+/// \returns A 32-bit signed integer value containing the lower 32 bits of the
+///    parameter.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si32(__m64 __m)
+{
+    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+}
+
+/// \brief Casts a 64-bit signed integer value into a 64-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
+///
+/// \param __i
+///    A 64-bit signed integer.
+/// \returns A 64-bit integer vector containing the same bitwise pattern as the
+///    parameter.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_m64(long long __i)
+{
+    return (__m64)__i;
+}
+
+/// \brief Casts a 64-bit integer vector into a 64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector.
+/// \returns A 64-bit signed integer containing the same bitwise pattern as the
+///    parameter.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtm64_si64(__m64 __m)
+{
+    return (long long)__m;
+}
+
+/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
+///    a 64-bit integer vector of [8 x i8] as the result. Positive values
+///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
+///    are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit signed integer with
+///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80. The converted
+///    [4 x i8] values are written to the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit signed integer with
+///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80. The converted
+///    [4 x i8] values are written to the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the converted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Converts 32-bit signed integers from both 64-bit integer vector
+///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
+///    a 64-bit integer vector of [4 x i16] as the result. Positive values
+///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
+///    0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
+///    32-bit signed integer and is converted to a 16-bit signed integer with
+///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000. The converted
+///    [2 x i16] values are written to the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
+///    32-bit signed integer and is converted to a 16-bit signed integer with
+///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000. The converted
+///    [2 x i16] values are written to the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the converted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+///    parameters of [4 x i16] into 8-bit unsigned integer values, and
+///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
+///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
+///    to 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0 are saturated to 0. The converted [4 x i8] values are written to
+///    the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+///    16-bit signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0 are saturated to 0. The converted [4 x i8] values are written to
+///    the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the converted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
+///    and interleaves them into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8]. \n 
+///    Bits [39:32] are written to bits [7:0] of the result. \n
+///    Bits [47:40] are written to bits [23:16] of the result. \n
+///    Bits [55:48] are written to bits [39:32] of the result. \n
+///    Bits [63:56] are written to bits [55:48] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [39:32] are written to bits [15:8] of the result. \n
+///    Bits [47:40] are written to bits [31:24] of the result. \n
+///    Bits [55:48] are written to bits [47:40] of the result. \n
+///    Bits [63:56] are written to bits [63:56] of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [47:32] are written to bits [15:0] of the result. \n
+///    Bits [63:48] are written to bits [47:32] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [47:32] are written to bits [31:16] of the result. \n
+///    Bits [63:48] are written to bits [63:48] of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
+///    the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
+///    the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
+///    and interleaves them into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [7:0] are written to bits [7:0] of the result. \n
+///    Bits [15:8] are written to bits [23:16] of the result. \n
+///    Bits [23:16] are written to bits [39:32] of the result. \n
+///    Bits [31:24] are written to bits [55:48] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+///    Bits [7:0] are written to bits [15:8] of the result. \n
+///    Bits [15:8] are written to bits [31:24] of the result. \n
+///    Bits [23:16] are written to bits [47:40] of the result. \n
+///    Bits [31:24] are written to bits [63:56] of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [15:0] are written to bits [15:0] of the result. \n
+///    Bits [31:16] are written to bits [47:32] of the result.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+///    Bits [15:0] are written to bits [31:16] of the result. \n
+///    Bits [31:16] are written to bits [63:48] of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
+///    the lower 32 bits of the result.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
+///    the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Adds each 8-bit integer element of the first 64-bit integer vector
+///    of [8 x i8] to the corresponding 8-bit integer element of the second
+///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
+///    packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Adds each 16-bit integer element of the first 64-bit integer vector
+///    of [4 x i16] to the corresponding 16-bit integer element of the second
+///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
+///    packed into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Adds each 32-bit integer element of the first 64-bit integer vector
+///    of [2 x i32] to the corresponding 32-bit integer element of the second
+///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
+///    packed into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32].
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Adds each 8-bit signed integer element of the first 64-bit integer
+///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
+///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
+///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
+///    0x80. The results are packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
+///    of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Adds each 16-bit signed integer element of the first 64-bit integer
+///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
+///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
+///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
+///    saturated to 0x8000. The results are packed into a 64-bit integer vector
+///    of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
+///    of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Adds each 8-bit unsigned integer element of the first 64-bit integer
+///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
+///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
+///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
+///    [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+///    unsigned sums of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Adds each 16-bit unsigned integer element of the first 64-bit integer
+///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
+///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
+///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
+///    integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+///    unsigned sums of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Subtracts each 8-bit integer element of the second 64-bit integer
+///    vector of [8 x i8] from the corresponding 8-bit integer element of the
+///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
+///    are packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
+///    both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Subtracts each 16-bit integer element of the second 64-bit integer
+///    vector of [4 x i16] from the corresponding 16-bit integer element of the
+///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
+///    results are packed into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
+///    both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Subtracts each 32-bit integer element of the second 64-bit integer
+///    vector of [2 x i32] from the corresponding 32-bit integer element of the
+///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
+///    results are packed into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
+/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
+///    both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Subtracts each 8-bit signed integer element of the second 64-bit
+///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
+///    element of the first 64-bit integer vector of [8 x i8]. Positive results
+///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
+///    are saturated to 0x80. The results are packed into a 64-bit integer
+///    vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+///    differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Subtracts each 16-bit signed integer element of the second 64-bit
+///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
+///    element of the first 64-bit integer vector of [4 x i16]. Positive results
+///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
+///    0x8000 are saturated to 0x8000. The results are packed into a 64-bit
+///    integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+///    differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
+///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
+///    element of the first 64-bit integer vector of [8 x i8]. If an element of
+///    the first vector is less than the corresponding element of the second
+///    vector, the result is saturated to 0. The results are packed into a
+///    64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+///    differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
+///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
+///    integer element of the first 64-bit integer vector of [4 x i16]. If an
+///    element of the first vector is less than the corresponding element of the
+///    second vector, the result is saturated to 0. The results are packed into
+///    a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+///    differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
+///    element of the second 64-bit integer vector of [4 x i16] and get four
+///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
+///    The lower 32 bits of these two sums are packed into a 64-bit integer
+///    vector of [2 x i32]. For example, bits [15:0] of both parameters are
+///    multiplied, bits [31:16] of both parameters are multiplied, and the sum
+///    of both results is written to bits [31:0] of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
+///    products of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
+///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
+///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
+///    of the products of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
+///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
+///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
+///    of the products of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Left-shifts each 16-bit signed integer element of the first
+///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
+///    of bits specified by the second parameter, which is a 64-bit integer. The
+///    lower 16 bits of the results are packed into a 64-bit integer vector of
+///    [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
+///    values. If \a __count is greater or equal to 16, the result is set to all
+///    0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+}
+
+/// \brief Left-shifts each 16-bit signed integer element of a 64-bit integer
+///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
+///    The lower 16 bits of the results are packed into a 64-bit integer vector
+///    of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
+///    values. If \a __count is greater or equal to 16, the result is set to all
+///    0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+}
+
+/// \brief Left-shifts each 32-bit signed integer element of the first
+///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
+///    of bits specified by the second parameter, which is a 64-bit integer. The
+///    lower 32 bits of the results are packed into a 64-bit integer vector of
+///    [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
+///    values. If \a __count is greater or equal to 32, the result is set to all
+///    0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+}
+
+/// \brief Left-shifts each 32-bit signed integer element of a 64-bit integer
+///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
+///    The lower 32 bits of the results are packed into a 64-bit integer vector
+///    of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
+///    values. If \a __count is greater or equal to 32, the result is set to all
+///    0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+}
+
+/// \brief Left-shifts the first 64-bit integer parameter by the number of bits
+///    specified by the second 64-bit integer parameter. The lower 64 bits of
+///    result are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector containing the left-shifted value. If
+///     \a __count is greater or equal to 64, the result is set to 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
+}
+
+/// \brief Left-shifts the first parameter, which is a 64-bit integer, by the
+///    number of bits specified by the second parameter, which is a 32-bit
+///    integer. The lower 64 bits of result are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector containing the left-shifted value. If
+///     \a __count is greater or equal to 64, the result is set to 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
+}
+
+/// \brief Right-shifts each 16-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [4 x i16], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are filled with the sign bit of the initial value of each 16-bit
+///    element. The 16-bit results are packed into a 64-bit integer vector of
+///    [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+}
+
+/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///    High-order bits are filled with the sign bit of the initial value of each
+///    16-bit element. The 16-bit results are packed into a 64-bit integer
+///    vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+}
+
+/// \brief Right-shifts each 32-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [2 x i32], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are filled with the sign bit of the initial value of each 32-bit
+///    element. The 32-bit results are packed into a 64-bit integer vector of
+///    [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+}
+
+/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///    High-order bits are filled with the sign bit of the initial value of each
+///    32-bit element. The 32-bit results are packed into a 64-bit integer
+///    vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+}
+
+/// \brief Right-shifts each 16-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [4 x i16], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are cleared. The 16-bit results are packed into a 64-bit integer
+///    vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi16(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+}
+
+/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
+///    integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [4 x i16].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi16(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+}
+
+/// \brief Right-shifts each 32-bit integer element of the first parameter,
+///    which is a 64-bit integer vector of [2 x i32], by the number of bits
+///    specified by the second parameter, which is a 64-bit integer. High-order
+///    bits are cleared. The 32-bit results are packed into a 64-bit integer
+///    vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi32(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+}
+
+/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
+///    integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector of [2 x i32].
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi32(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+}
+
+/// \brief Right-shifts the first 64-bit integer parameter by the number of bits
+///    specified by the second 64-bit integer parameter. High-order bits are
+///    cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector containing the right-shifted value.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_si64(__m64 __m, __m64 __count)
+{
+    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
+}
+
+/// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
+///    number of bits specified by the second parameter, which is a 32-bit
+///    integer. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
+///
+/// \param __m
+///    A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+///    A 32-bit integer value.
+/// \returns A 64-bit integer vector containing the right-shifted value.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_si64(__m64 __m, int __count)
+{
+    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
+}
+
+/// \brief Performs a bitwise AND of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAND </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise AND of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
+}
+
+/// \brief Performs a bitwise NOT of the first 64-bit integer vector, and then
+///    performs a bitwise AND of the intermediate result and the second 64-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PANDN </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector. The one's complement of this parameter is used
+///    in the bitwise AND.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise AND of the second
+///    parameter and the one's complement of the first parameter.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
+}
+
+/// \brief Performs a bitwise OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POR </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise OR of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
+}
+
+/// \brief Performs a bitwise exclusive OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PXOR </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
+///    parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
+}
+
+/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+///    [8 x i8] to determine if the element of the first vector is equal to the
+///    corresponding element of the second vector. The comparison yields 0 for
+///    false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+///    [4 x i16] to determine if the element of the first vector is equal to the
+///    corresponding element of the second vector. The comparison yields 0 for
+///    false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+///    [2 x i32] to determine if the element of the first vector is equal to the
+///    corresponding element of the second vector. The comparison yields 0 for
+///    false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32].
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+///    [8 x i8] to determine if the element of the first vector is greater than
+///    the corresponding element of the second vector. The comparison yields 0
+///    for false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [8 x i8].
+/// \param __m2
+///    A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+///    [4 x i16] to determine if the element of the first vector is greater than
+///    the corresponding element of the second vector. The comparison yields 0
+///    for false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [4 x i16].
+/// \param __m2
+///    A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+///    [2 x i32] to determine if the element of the first vector is greater than
+///    the corresponding element of the second vector. The comparison yields 0
+///    for false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector of [2 x i32].
+/// \param __m2
+///    A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
+///    results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// \brief Constructs a 64-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 64-bit integer vector with all elements set to zero.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setzero_si64(void)
+{
+    return (__m64){ 0LL };
+}
+
+/// \brief Constructs a 64-bit integer vector initialized with the specified
+///    32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i1
+///    A 32-bit integer value used to initialize the upper 32 bits of the
+///    result.
+/// \param __i0
+///    A 32-bit integer value used to initialize the lower 32 bits of the
+///    result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi32(int __i1, int __i0)
+{
+    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+}
+
+/// \brief Constructs a 64-bit integer vector initialized with the specified
+///    16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __s3
+///    A 16-bit integer value used to initialize bits [63:48] of the result.
+/// \param __s2
+///    A 16-bit integer value used to initialize bits [47:32] of the result.
+/// \param __s1
+///    A 16-bit integer value used to initialize bits [31:16] of the result.
+/// \param __s0
+///    A 16-bit integer value used to initialize bits [15:0] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+}
+
+/// \brief Constructs a 64-bit integer vector initialized with the specified
+///    8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b7
+///    An 8-bit integer value used to initialize bits [63:56] of the result.
+/// \param __b6
+///    An 8-bit integer value used to initialize bits [55:48] of the result.
+/// \param __b5
+///    An 8-bit integer value used to initialize bits [47:40] of the result.
+/// \param __b4
+///    An 8-bit integer value used to initialize bits [39:32] of the result.
+/// \param __b3
+///    An 8-bit integer value used to initialize bits [31:24] of the result.
+/// \param __b2
+///    An 8-bit integer value used to initialize bits [23:16] of the result.
+/// \param __b1
+///    An 8-bit integer value used to initialize bits [15:8] of the result.
+/// \param __b0
+///    An 8-bit integer value used to initialize bits [7:0] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+            char __b1, char __b0)
+{
+    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
+                                               __b4, __b5, __b6, __b7);
+}
+
+/// \brief Constructs a 64-bit integer vector of [2 x i32], with each of the
+///    32-bit integer vector elements set to the specified 32-bit integer
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
+///
+/// \param __i
+///    A 32-bit integer value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 64-bit integer vector of [2 x i32].
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi32(int __i)
+{
+    return _mm_set_pi32(__i, __i);
+}
+
+/// \brief Constructs a 64-bit integer vector of [4 x i16], with each of the
+///    16-bit integer vector elements set to the specified 16-bit integer
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
+///
+/// \param __w
+///    A 16-bit integer value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 64-bit integer vector of [4 x i16].
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi16(short __w)
+{
+    return _mm_set_pi16(__w, __w, __w, __w);
+}
+
+/// \brief Constructs a 64-bit integer vector of [8 x i8], with each of the
+///    8-bit integer vector elements set to the specified 8-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLBW + VPSHUFLW / PUNPCKLBW +
+///    PSHUFLW </c> instruction.
+///
+/// \param __b
+///    An 8-bit integer value used to initialize each vector element of the
+///    result.
+/// \returns An initialized 64-bit integer vector of [8 x i8].
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi8(char __b)
+{
+    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+///    the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i0
+///    A 32-bit integer value used to initialize the lower 32 bits of the
+///    result.
+/// \param __i1
+///    A 32-bit integer value used to initialize the upper 32 bits of the
+///    result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi32(int __i0, int __i1)
+{
+    return _mm_set_pi32(__i1, __i0);
+}
+
+/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+///    the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the result.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of the result.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of the result.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
+{
+    return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+///    the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b0
+///    An 8-bit integer value used to initialize bits [7:0] of the result.
+/// \param __b1
+///    An 8-bit integer value used to initialize bits [15:8] of the result.
+/// \param __b2
+///    An 8-bit integer value used to initialize bits [23:16] of the result.
+/// \param __b3
+///    An 8-bit integer value used to initialize bits [31:24] of the result.
+/// \param __b4
+///    An 8-bit integer value used to initialize bits [39:32] of the result.
+/// \param __b5
+///    An 8-bit integer value used to initialize bits [47:40] of the result.
+/// \param __b6
+///    An 8-bit integer value used to initialize bits [55:48] of the result.
+/// \param __b7
+///    An 8-bit integer value used to initialize bits [63:56] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
+             char __b6, char __b7)
+{
+    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+/* Aliases for compatibility. */
+#define _m_empty _mm_empty
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_from_int64 _mm_cvtsi64_m64
+#define _m_to_int _mm_cvtsi64_si32
+#define _m_to_int64 _mm_cvtm64_si64
+#define _m_packsswb _mm_packs_pi16
+#define _m_packssdw _mm_packs_pi32
+#define _m_packuswb _mm_packs_pu16
+#define _m_punpckhbw _mm_unpackhi_pi8
+#define _m_punpckhwd _mm_unpackhi_pi16
+#define _m_punpckhdq _mm_unpackhi_pi32
+#define _m_punpcklbw _mm_unpacklo_pi8
+#define _m_punpcklwd _mm_unpacklo_pi16
+#define _m_punpckldq _mm_unpacklo_pi32
+#define _m_paddb _mm_add_pi8
+#define _m_paddw _mm_add_pi16
+#define _m_paddd _mm_add_pi32
+#define _m_paddsb _mm_adds_pi8
+#define _m_paddsw _mm_adds_pi16
+#define _m_paddusb _mm_adds_pu8
+#define _m_paddusw _mm_adds_pu16
+#define _m_psubb _mm_sub_pi8
+#define _m_psubw _mm_sub_pi16
+#define _m_psubd _mm_sub_pi32
+#define _m_psubsb _mm_subs_pi8
+#define _m_psubsw _mm_subs_pi16
+#define _m_psubusb _mm_subs_pu8
+#define _m_psubusw _mm_subs_pu16
+#define _m_pmaddwd _mm_madd_pi16
+#define _m_pmulhw _mm_mulhi_pi16
+#define _m_pmullw _mm_mullo_pi16
+#define _m_psllw _mm_sll_pi16
+#define _m_psllwi _mm_slli_pi16
+#define _m_pslld _mm_sll_pi32
+#define _m_pslldi _mm_slli_pi32
+#define _m_psllq _mm_sll_si64
+#define _m_psllqi _mm_slli_si64
+#define _m_psraw _mm_sra_pi16
+#define _m_psrawi _mm_srai_pi16
+#define _m_psrad _mm_sra_pi32
+#define _m_psradi _mm_srai_pi32
+#define _m_psrlw _mm_srl_pi16
+#define _m_psrlwi _mm_srli_pi16
+#define _m_psrld _mm_srl_pi32
+#define _m_psrldi _mm_srli_pi32
+#define _m_psrlq _mm_srl_si64
+#define _m_psrlqi _mm_srli_si64
+#define _m_pand _mm_and_si64
+#define _m_pandn _mm_andnot_si64
+#define _m_por _mm_or_si64
+#define _m_pxor _mm_xor_si64
+#define _m_pcmpeqb _mm_cmpeq_pi8
+#define _m_pcmpeqw _mm_cmpeq_pi16
+#define _m_pcmpeqd _mm_cmpeq_pi32
+#define _m_pcmpgtb _mm_cmpgt_pi8
+#define _m_pcmpgtw _mm_cmpgt_pi16
+#define _m_pcmpgtd _mm_cmpgt_pi32
+
+#endif /* __MMINTRIN_H */
+
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/module.modulemap b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/module.modulemap
new file mode 100644
index 000000000..11ef2f902
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/module.modulemap
@@ -0,0 +1,166 @@
+/*===---- module.modulemap - intrinsics module map -------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+module _Builtin_intrinsics [system] [extern_c] {
+  explicit module altivec {
+    requires altivec
+    header "altivec.h"
+  }
+
+  explicit module arm {
+    requires arm
+
+    explicit module acle {
+      header "arm_acle.h"
+      export *
+    }
+
+    explicit module neon {
+      requires neon
+      header "arm_neon.h"
+      export *
+    }
+  }
+
+  explicit module intel {
+    requires x86
+    export *
+
+    header "immintrin.h"
+    textual header "f16cintrin.h"
+    textual header "avxintrin.h"
+    textual header "avx2intrin.h"
+    textual header "avx512fintrin.h"
+    textual header "avx512erintrin.h"
+    textual header "fmaintrin.h"
+
+    header "x86intrin.h"
+    textual header "bmiintrin.h"
+    textual header "bmi2intrin.h"
+    textual header "lzcntintrin.h"
+    textual header "xopintrin.h"
+    textual header "fma4intrin.h"
+    textual header "mwaitxintrin.h"
+
+    explicit module mm_malloc {
+      requires !freestanding
+      header "mm_malloc.h"
+      export * // note: for <stdlib.h> dependency
+    }
+
+    explicit module cpuid {
+      requires gnuinlineasm
+      header "cpuid.h"
+    }
+
+    explicit module mmx {
+      header "mmintrin.h"
+    }
+
+    explicit module sse {
+      export mm_malloc
+      export mmx
+      export sse2 // note: for hackish <emmintrin.h> dependency
+      header "xmmintrin.h"
+    }
+
+    explicit module sse2 {
+      export sse
+      header "emmintrin.h"
+    }
+
+    explicit module sse3 {
+      export sse2
+      header "pmmintrin.h"
+    }
+
+    explicit module ssse3 {
+      export sse3
+      header "tmmintrin.h"
+    }
+
+    explicit module sse4_1 {
+      export ssse3
+      header "smmintrin.h"
+    }
+
+    explicit module sse4_2 {
+      export sse4_1
+      header "nmmintrin.h"
+    }
+
+    explicit module sse4a {
+      export sse3
+      header "ammintrin.h"
+    }
+
+    explicit module popcnt {
+      header "popcntintrin.h"
+    }
+
+    explicit module mm3dnow {
+      header "mm3dnow.h"
+    }
+
+    explicit module aes_pclmul {
+      header "wmmintrin.h"
+      export aes
+      export pclmul
+    }
+
+    explicit module aes {
+      header "__wmmintrin_aes.h"
+    }
+
+    explicit module pclmul {
+      header "__wmmintrin_pclmul.h"
+    }
+  }
+
+  explicit module systemz {
+    requires systemz
+    export *
+
+    header "s390intrin.h"
+
+    explicit module htm {
+      requires htm
+      header "htmintrin.h"
+      header "htmxlintrin.h"
+    }
+
+    explicit module zvector {
+      requires zvector, vx
+      header "vecintrin.h"
+    }
+  }
+}
+
+module _Builtin_stddef_max_align_t [system] [extern_c] {
+  header "__stddef_max_align_t.h"
+}
+
+module opencl_c {
+  requires opencl
+  header "opencl-c.h"
+}
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/msa.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/msa.h
new file mode 100644
index 000000000..da680f5ca
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/msa.h
@@ -0,0 +1,583 @@
+/*===---- msa.h - MIPS MSA intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MSA_H
+#define _MSA_H 1
+
+#if defined(__mips_msa)
+typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
+typedef signed char v16i8_b __attribute__((vector_size(16), aligned(1)));
+typedef unsigned char v16u8 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned char v16u8_b __attribute__((vector_size(16), aligned(1)));
+typedef short v8i16 __attribute__((vector_size(16), aligned(16)));
+typedef short v8i16_h __attribute__((vector_size(16), aligned(2)));
+typedef unsigned short v8u16 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned short v8u16_h __attribute__((vector_size(16), aligned(2)));
+typedef int v4i32 __attribute__((vector_size(16), aligned(16)));
+typedef int v4i32_w __attribute__((vector_size(16), aligned(4)));
+typedef unsigned int v4u32 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned int v4u32_w __attribute__((vector_size(16), aligned(4)));
+typedef long long v2i64 __attribute__((vector_size(16), aligned(16)));
+typedef long long v2i64_d __attribute__((vector_size(16), aligned(8)));
+typedef unsigned long long v2u64 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned long long v2u64_d __attribute__((vector_size(16), aligned(8)));
+typedef float v4f32 __attribute__((vector_size(16), aligned(16)));
+typedef float v4f32_w __attribute__((vector_size(16), aligned(4)));
+typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
+typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
+
+#define __msa_sll_b __builtin_msa_sll_b
+#define __msa_sll_h __builtin_msa_sll_h
+#define __msa_sll_w __builtin_msa_sll_w
+#define __msa_sll_d __builtin_msa_sll_d
+#define __msa_slli_b __builtin_msa_slli_b
+#define __msa_slli_h __builtin_msa_slli_h
+#define __msa_slli_w __builtin_msa_slli_w
+#define __msa_slli_d __builtin_msa_slli_d
+#define __msa_sra_b __builtin_msa_sra_b
+#define __msa_sra_h __builtin_msa_sra_h
+#define __msa_sra_w __builtin_msa_sra_w
+#define __msa_sra_d __builtin_msa_sra_d
+#define __msa_srai_b __builtin_msa_srai_b
+#define __msa_srai_h __builtin_msa_srai_h
+#define __msa_srai_w __builtin_msa_srai_w
+#define __msa_srai_d __builtin_msa_srai_d
+#define __msa_srar_b __builtin_msa_srar_b
+#define __msa_srar_h __builtin_msa_srar_h
+#define __msa_srar_w __builtin_msa_srar_w
+#define __msa_srar_d __builtin_msa_srar_d
+#define __msa_srari_b __builtin_msa_srari_b
+#define __msa_srari_h __builtin_msa_srari_h
+#define __msa_srari_w __builtin_msa_srari_w
+#define __msa_srari_d __builtin_msa_srari_d
+#define __msa_srl_b __builtin_msa_srl_b
+#define __msa_srl_h __builtin_msa_srl_h
+#define __msa_srl_w __builtin_msa_srl_w
+#define __msa_srl_d __builtin_msa_srl_d
+#define __msa_srli_b __builtin_msa_srli_b
+#define __msa_srli_h __builtin_msa_srli_h
+#define __msa_srli_w __builtin_msa_srli_w
+#define __msa_srli_d __builtin_msa_srli_d
+#define __msa_srlr_b __builtin_msa_srlr_b
+#define __msa_srlr_h __builtin_msa_srlr_h
+#define __msa_srlr_w __builtin_msa_srlr_w
+#define __msa_srlr_d __builtin_msa_srlr_d
+#define __msa_srlri_b __builtin_msa_srlri_b
+#define __msa_srlri_h __builtin_msa_srlri_h
+#define __msa_srlri_w __builtin_msa_srlri_w
+#define __msa_srlri_d __builtin_msa_srlri_d
+#define __msa_bclr_b __builtin_msa_bclr_b
+#define __msa_bclr_h __builtin_msa_bclr_h
+#define __msa_bclr_w __builtin_msa_bclr_w
+#define __msa_bclr_d __builtin_msa_bclr_d
+#define __msa_bclri_b __builtin_msa_bclri_b
+#define __msa_bclri_h __builtin_msa_bclri_h
+#define __msa_bclri_w __builtin_msa_bclri_w
+#define __msa_bclri_d __builtin_msa_bclri_d
+#define __msa_bset_b __builtin_msa_bset_b
+#define __msa_bset_h __builtin_msa_bset_h
+#define __msa_bset_w __builtin_msa_bset_w
+#define __msa_bset_d __builtin_msa_bset_d
+#define __msa_bseti_b __builtin_msa_bseti_b
+#define __msa_bseti_h __builtin_msa_bseti_h
+#define __msa_bseti_w __builtin_msa_bseti_w
+#define __msa_bseti_d __builtin_msa_bseti_d
+#define __msa_bneg_b __builtin_msa_bneg_b
+#define __msa_bneg_h __builtin_msa_bneg_h
+#define __msa_bneg_w __builtin_msa_bneg_w
+#define __msa_bneg_d __builtin_msa_bneg_d
+#define __msa_bnegi_b __builtin_msa_bnegi_b
+#define __msa_bnegi_h __builtin_msa_bnegi_h
+#define __msa_bnegi_w __builtin_msa_bnegi_w
+#define __msa_bnegi_d __builtin_msa_bnegi_d
+#define __msa_binsl_b __builtin_msa_binsl_b
+#define __msa_binsl_h __builtin_msa_binsl_h
+#define __msa_binsl_w __builtin_msa_binsl_w
+#define __msa_binsl_d __builtin_msa_binsl_d
+#define __msa_binsli_b __builtin_msa_binsli_b
+#define __msa_binsli_h __builtin_msa_binsli_h
+#define __msa_binsli_w __builtin_msa_binsli_w
+#define __msa_binsli_d __builtin_msa_binsli_d
+#define __msa_binsr_b __builtin_msa_binsr_b
+#define __msa_binsr_h __builtin_msa_binsr_h
+#define __msa_binsr_w __builtin_msa_binsr_w
+#define __msa_binsr_d __builtin_msa_binsr_d
+#define __msa_binsri_b __builtin_msa_binsri_b
+#define __msa_binsri_h __builtin_msa_binsri_h
+#define __msa_binsri_w __builtin_msa_binsri_w
+#define __msa_binsri_d __builtin_msa_binsri_d
+#define __msa_addv_b __builtin_msa_addv_b
+#define __msa_addv_h __builtin_msa_addv_h
+#define __msa_addv_w __builtin_msa_addv_w
+#define __msa_addv_d __builtin_msa_addv_d
+#define __msa_addvi_b __builtin_msa_addvi_b
+#define __msa_addvi_h __builtin_msa_addvi_h
+#define __msa_addvi_w __builtin_msa_addvi_w
+#define __msa_addvi_d __builtin_msa_addvi_d
+#define __msa_subv_b __builtin_msa_subv_b
+#define __msa_subv_h __builtin_msa_subv_h
+#define __msa_subv_w __builtin_msa_subv_w
+#define __msa_subv_d __builtin_msa_subv_d
+#define __msa_subvi_b __builtin_msa_subvi_b
+#define __msa_subvi_h __builtin_msa_subvi_h
+#define __msa_subvi_w __builtin_msa_subvi_w
+#define __msa_subvi_d __builtin_msa_subvi_d
+#define __msa_max_s_b __builtin_msa_max_s_b
+#define __msa_max_s_h __builtin_msa_max_s_h
+#define __msa_max_s_w __builtin_msa_max_s_w
+#define __msa_max_s_d __builtin_msa_max_s_d
+#define __msa_maxi_s_b __builtin_msa_maxi_s_b
+#define __msa_maxi_s_h __builtin_msa_maxi_s_h
+#define __msa_maxi_s_w __builtin_msa_maxi_s_w
+#define __msa_maxi_s_d __builtin_msa_maxi_s_d
+#define __msa_max_u_b __builtin_msa_max_u_b
+#define __msa_max_u_h __builtin_msa_max_u_h
+#define __msa_max_u_w __builtin_msa_max_u_w
+#define __msa_max_u_d __builtin_msa_max_u_d
+#define __msa_maxi_u_b __builtin_msa_maxi_u_b
+#define __msa_maxi_u_h __builtin_msa_maxi_u_h
+#define __msa_maxi_u_w __builtin_msa_maxi_u_w
+#define __msa_maxi_u_d __builtin_msa_maxi_u_d
+#define __msa_min_s_b __builtin_msa_min_s_b
+#define __msa_min_s_h __builtin_msa_min_s_h
+#define __msa_min_s_w __builtin_msa_min_s_w
+#define __msa_min_s_d __builtin_msa_min_s_d
+#define __msa_mini_s_b __builtin_msa_mini_s_b
+#define __msa_mini_s_h __builtin_msa_mini_s_h
+#define __msa_mini_s_w __builtin_msa_mini_s_w
+#define __msa_mini_s_d __builtin_msa_mini_s_d
+#define __msa_min_u_b __builtin_msa_min_u_b
+#define __msa_min_u_h __builtin_msa_min_u_h
+#define __msa_min_u_w __builtin_msa_min_u_w
+#define __msa_min_u_d __builtin_msa_min_u_d
+#define __msa_mini_u_b __builtin_msa_mini_u_b
+#define __msa_mini_u_h __builtin_msa_mini_u_h
+#define __msa_mini_u_w __builtin_msa_mini_u_w
+#define __msa_mini_u_d __builtin_msa_mini_u_d
+#define __msa_max_a_b __builtin_msa_max_a_b
+#define __msa_max_a_h __builtin_msa_max_a_h
+#define __msa_max_a_w __builtin_msa_max_a_w
+#define __msa_max_a_d __builtin_msa_max_a_d
+#define __msa_min_a_b __builtin_msa_min_a_b
+#define __msa_min_a_h __builtin_msa_min_a_h
+#define __msa_min_a_w __builtin_msa_min_a_w
+#define __msa_min_a_d __builtin_msa_min_a_d
+#define __msa_ceq_b __builtin_msa_ceq_b
+#define __msa_ceq_h __builtin_msa_ceq_h
+#define __msa_ceq_w __builtin_msa_ceq_w
+#define __msa_ceq_d __builtin_msa_ceq_d
+#define __msa_ceqi_b __builtin_msa_ceqi_b
+#define __msa_ceqi_h __builtin_msa_ceqi_h
+#define __msa_ceqi_w __builtin_msa_ceqi_w
+#define __msa_ceqi_d __builtin_msa_ceqi_d
+#define __msa_clt_s_b __builtin_msa_clt_s_b
+#define __msa_clt_s_h __builtin_msa_clt_s_h
+#define __msa_clt_s_w __builtin_msa_clt_s_w
+#define __msa_clt_s_d __builtin_msa_clt_s_d
+#define __msa_clti_s_b __builtin_msa_clti_s_b
+#define __msa_clti_s_h __builtin_msa_clti_s_h
+#define __msa_clti_s_w __builtin_msa_clti_s_w
+#define __msa_clti_s_d __builtin_msa_clti_s_d
+#define __msa_clt_u_b __builtin_msa_clt_u_b
+#define __msa_clt_u_h __builtin_msa_clt_u_h
+#define __msa_clt_u_w __builtin_msa_clt_u_w
+#define __msa_clt_u_d __builtin_msa_clt_u_d
+#define __msa_clti_u_b __builtin_msa_clti_u_b
+#define __msa_clti_u_h __builtin_msa_clti_u_h
+#define __msa_clti_u_w __builtin_msa_clti_u_w
+#define __msa_clti_u_d __builtin_msa_clti_u_d
+#define __msa_cle_s_b __builtin_msa_cle_s_b
+#define __msa_cle_s_h __builtin_msa_cle_s_h
+#define __msa_cle_s_w __builtin_msa_cle_s_w
+#define __msa_cle_s_d __builtin_msa_cle_s_d
+#define __msa_clei_s_b __builtin_msa_clei_s_b
+#define __msa_clei_s_h __builtin_msa_clei_s_h
+#define __msa_clei_s_w __builtin_msa_clei_s_w
+#define __msa_clei_s_d __builtin_msa_clei_s_d
+#define __msa_cle_u_b __builtin_msa_cle_u_b
+#define __msa_cle_u_h __builtin_msa_cle_u_h
+#define __msa_cle_u_w __builtin_msa_cle_u_w
+#define __msa_cle_u_d __builtin_msa_cle_u_d
+#define __msa_clei_u_b __builtin_msa_clei_u_b
+#define __msa_clei_u_h __builtin_msa_clei_u_h
+#define __msa_clei_u_w __builtin_msa_clei_u_w
+#define __msa_clei_u_d __builtin_msa_clei_u_d
+#define __msa_ld_b __builtin_msa_ld_b
+#define __msa_ld_h __builtin_msa_ld_h
+#define __msa_ld_w __builtin_msa_ld_w
+#define __msa_ld_d __builtin_msa_ld_d
+#define __msa_st_b __builtin_msa_st_b
+#define __msa_st_h __builtin_msa_st_h
+#define __msa_st_w __builtin_msa_st_w
+#define __msa_st_d __builtin_msa_st_d
+#define __msa_sat_s_b __builtin_msa_sat_s_b
+#define __msa_sat_s_h __builtin_msa_sat_s_h
+#define __msa_sat_s_w __builtin_msa_sat_s_w
+#define __msa_sat_s_d __builtin_msa_sat_s_d
+#define __msa_sat_u_b __builtin_msa_sat_u_b
+#define __msa_sat_u_h __builtin_msa_sat_u_h
+#define __msa_sat_u_w __builtin_msa_sat_u_w
+#define __msa_sat_u_d __builtin_msa_sat_u_d
+#define __msa_add_a_b __builtin_msa_add_a_b
+#define __msa_add_a_h __builtin_msa_add_a_h
+#define __msa_add_a_w __builtin_msa_add_a_w
+#define __msa_add_a_d __builtin_msa_add_a_d
+#define __msa_adds_a_b __builtin_msa_adds_a_b
+#define __msa_adds_a_h __builtin_msa_adds_a_h
+#define __msa_adds_a_w __builtin_msa_adds_a_w
+#define __msa_adds_a_d __builtin_msa_adds_a_d
+#define __msa_adds_s_b __builtin_msa_adds_s_b
+#define __msa_adds_s_h __builtin_msa_adds_s_h
+#define __msa_adds_s_w __builtin_msa_adds_s_w
+#define __msa_adds_s_d __builtin_msa_adds_s_d
+#define __msa_adds_u_b __builtin_msa_adds_u_b
+#define __msa_adds_u_h __builtin_msa_adds_u_h
+#define __msa_adds_u_w __builtin_msa_adds_u_w
+#define __msa_adds_u_d __builtin_msa_adds_u_d
+#define __msa_ave_s_b __builtin_msa_ave_s_b
+#define __msa_ave_s_h __builtin_msa_ave_s_h
+#define __msa_ave_s_w __builtin_msa_ave_s_w
+#define __msa_ave_s_d __builtin_msa_ave_s_d
+#define __msa_ave_u_b __builtin_msa_ave_u_b
+#define __msa_ave_u_h __builtin_msa_ave_u_h
+#define __msa_ave_u_w __builtin_msa_ave_u_w
+#define __msa_ave_u_d __builtin_msa_ave_u_d
+#define __msa_aver_s_b __builtin_msa_aver_s_b
+#define __msa_aver_s_h __builtin_msa_aver_s_h
+#define __msa_aver_s_w __builtin_msa_aver_s_w
+#define __msa_aver_s_d __builtin_msa_aver_s_d
+#define __msa_aver_u_b __builtin_msa_aver_u_b
+#define __msa_aver_u_h __builtin_msa_aver_u_h
+#define __msa_aver_u_w __builtin_msa_aver_u_w
+#define __msa_aver_u_d __builtin_msa_aver_u_d
+#define __msa_subs_s_b __builtin_msa_subs_s_b
+#define __msa_subs_s_h __builtin_msa_subs_s_h
+#define __msa_subs_s_w __builtin_msa_subs_s_w
+#define __msa_subs_s_d __builtin_msa_subs_s_d
+#define __msa_subs_u_b __builtin_msa_subs_u_b
+#define __msa_subs_u_h __builtin_msa_subs_u_h
+#define __msa_subs_u_w __builtin_msa_subs_u_w
+#define __msa_subs_u_d __builtin_msa_subs_u_d
+#define __msa_subsuu_s_b __builtin_msa_subsuu_s_b
+#define __msa_subsuu_s_h __builtin_msa_subsuu_s_h
+#define __msa_subsuu_s_w __builtin_msa_subsuu_s_w
+#define __msa_subsuu_s_d __builtin_msa_subsuu_s_d
+#define __msa_subsus_u_b __builtin_msa_subsus_u_b
+#define __msa_subsus_u_h __builtin_msa_subsus_u_h
+#define __msa_subsus_u_w __builtin_msa_subsus_u_w
+#define __msa_subsus_u_d __builtin_msa_subsus_u_d
+#define __msa_asub_s_b __builtin_msa_asub_s_b
+#define __msa_asub_s_h __builtin_msa_asub_s_h
+#define __msa_asub_s_w __builtin_msa_asub_s_w
+#define __msa_asub_s_d __builtin_msa_asub_s_d
+#define __msa_asub_u_b __builtin_msa_asub_u_b
+#define __msa_asub_u_h __builtin_msa_asub_u_h
+#define __msa_asub_u_w __builtin_msa_asub_u_w
+#define __msa_asub_u_d __builtin_msa_asub_u_d
+#define __msa_mulv_b __builtin_msa_mulv_b
+#define __msa_mulv_h __builtin_msa_mulv_h
+#define __msa_mulv_w __builtin_msa_mulv_w
+#define __msa_mulv_d __builtin_msa_mulv_d
+#define __msa_maddv_b __builtin_msa_maddv_b
+#define __msa_maddv_h __builtin_msa_maddv_h
+#define __msa_maddv_w __builtin_msa_maddv_w
+#define __msa_maddv_d __builtin_msa_maddv_d
+#define __msa_msubv_b __builtin_msa_msubv_b
+#define __msa_msubv_h __builtin_msa_msubv_h
+#define __msa_msubv_w __builtin_msa_msubv_w
+#define __msa_msubv_d __builtin_msa_msubv_d
+#define __msa_div_s_b __builtin_msa_div_s_b
+#define __msa_div_s_h __builtin_msa_div_s_h
+#define __msa_div_s_w __builtin_msa_div_s_w
+#define __msa_div_s_d __builtin_msa_div_s_d
+#define __msa_div_u_b __builtin_msa_div_u_b
+#define __msa_div_u_h __builtin_msa_div_u_h
+#define __msa_div_u_w __builtin_msa_div_u_w
+#define __msa_div_u_d __builtin_msa_div_u_d
+#define __msa_hadd_s_h __builtin_msa_hadd_s_h
+#define __msa_hadd_s_w __builtin_msa_hadd_s_w
+#define __msa_hadd_s_d __builtin_msa_hadd_s_d
+#define __msa_hadd_u_h __builtin_msa_hadd_u_h
+#define __msa_hadd_u_w __builtin_msa_hadd_u_w
+#define __msa_hadd_u_d __builtin_msa_hadd_u_d
+#define __msa_hsub_s_h __builtin_msa_hsub_s_h
+#define __msa_hsub_s_w __builtin_msa_hsub_s_w
+#define __msa_hsub_s_d __builtin_msa_hsub_s_d
+#define __msa_hsub_u_h __builtin_msa_hsub_u_h
+#define __msa_hsub_u_w __builtin_msa_hsub_u_w
+#define __msa_hsub_u_d __builtin_msa_hsub_u_d
+#define __msa_mod_s_b __builtin_msa_mod_s_b
+#define __msa_mod_s_h __builtin_msa_mod_s_h
+#define __msa_mod_s_w __builtin_msa_mod_s_w
+#define __msa_mod_s_d __builtin_msa_mod_s_d
+#define __msa_mod_u_b __builtin_msa_mod_u_b
+#define __msa_mod_u_h __builtin_msa_mod_u_h
+#define __msa_mod_u_w __builtin_msa_mod_u_w
+#define __msa_mod_u_d __builtin_msa_mod_u_d
+#define __msa_dotp_s_h __builtin_msa_dotp_s_h
+#define __msa_dotp_s_w __builtin_msa_dotp_s_w
+#define __msa_dotp_s_d __builtin_msa_dotp_s_d
+#define __msa_dotp_u_h __builtin_msa_dotp_u_h
+#define __msa_dotp_u_w __builtin_msa_dotp_u_w
+#define __msa_dotp_u_d __builtin_msa_dotp_u_d
+#define __msa_dpadd_s_h __builtin_msa_dpadd_s_h
+#define __msa_dpadd_s_w __builtin_msa_dpadd_s_w
+#define __msa_dpadd_s_d __builtin_msa_dpadd_s_d
+#define __msa_dpadd_u_h __builtin_msa_dpadd_u_h
+#define __msa_dpadd_u_w __builtin_msa_dpadd_u_w
+#define __msa_dpadd_u_d __builtin_msa_dpadd_u_d
+#define __msa_dpsub_s_h __builtin_msa_dpsub_s_h
+#define __msa_dpsub_s_w __builtin_msa_dpsub_s_w
+#define __msa_dpsub_s_d __builtin_msa_dpsub_s_d
+#define __msa_dpsub_u_h __builtin_msa_dpsub_u_h
+#define __msa_dpsub_u_w __builtin_msa_dpsub_u_w
+#define __msa_dpsub_u_d __builtin_msa_dpsub_u_d
+#define __msa_sld_b __builtin_msa_sld_b
+#define __msa_sld_h __builtin_msa_sld_h
+#define __msa_sld_w __builtin_msa_sld_w
+#define __msa_sld_d __builtin_msa_sld_d
+#define __msa_sldi_b __builtin_msa_sldi_b
+#define __msa_sldi_h __builtin_msa_sldi_h
+#define __msa_sldi_w __builtin_msa_sldi_w
+#define __msa_sldi_d __builtin_msa_sldi_d
+#define __msa_splat_b __builtin_msa_splat_b
+#define __msa_splat_h __builtin_msa_splat_h
+#define __msa_splat_w __builtin_msa_splat_w
+#define __msa_splat_d __builtin_msa_splat_d
+#define __msa_splati_b __builtin_msa_splati_b
+#define __msa_splati_h __builtin_msa_splati_h
+#define __msa_splati_w __builtin_msa_splati_w
+#define __msa_splati_d __builtin_msa_splati_d
+#define __msa_pckev_b __builtin_msa_pckev_b
+#define __msa_pckev_h __builtin_msa_pckev_h
+#define __msa_pckev_w __builtin_msa_pckev_w
+#define __msa_pckev_d __builtin_msa_pckev_d
+#define __msa_pckod_b __builtin_msa_pckod_b
+#define __msa_pckod_h __builtin_msa_pckod_h
+#define __msa_pckod_w __builtin_msa_pckod_w
+#define __msa_pckod_d __builtin_msa_pckod_d
+#define __msa_ilvl_b __builtin_msa_ilvl_b
+#define __msa_ilvl_h __builtin_msa_ilvl_h
+#define __msa_ilvl_w __builtin_msa_ilvl_w
+#define __msa_ilvl_d __builtin_msa_ilvl_d
+#define __msa_ilvr_b __builtin_msa_ilvr_b
+#define __msa_ilvr_h __builtin_msa_ilvr_h
+#define __msa_ilvr_w __builtin_msa_ilvr_w
+#define __msa_ilvr_d __builtin_msa_ilvr_d
+#define __msa_ilvev_b __builtin_msa_ilvev_b
+#define __msa_ilvev_h __builtin_msa_ilvev_h
+#define __msa_ilvev_w __builtin_msa_ilvev_w
+#define __msa_ilvev_d __builtin_msa_ilvev_d
+#define __msa_ilvod_b __builtin_msa_ilvod_b
+#define __msa_ilvod_h __builtin_msa_ilvod_h
+#define __msa_ilvod_w __builtin_msa_ilvod_w
+#define __msa_ilvod_d __builtin_msa_ilvod_d
+#define __msa_vshf_b __builtin_msa_vshf_b
+#define __msa_vshf_h __builtin_msa_vshf_h
+#define __msa_vshf_w __builtin_msa_vshf_w
+#define __msa_vshf_d __builtin_msa_vshf_d
+#define __msa_and_v __builtin_msa_and_v
+#define __msa_andi_b __builtin_msa_andi_b
+#define __msa_or_v __builtin_msa_or_v
+#define __msa_ori_b __builtin_msa_ori_b
+#define __msa_nor_v __builtin_msa_nor_v
+#define __msa_nori_b __builtin_msa_nori_b
+#define __msa_xor_v __builtin_msa_xor_v
+#define __msa_xori_b __builtin_msa_xori_b
+#define __msa_bmnz_v __builtin_msa_bmnz_v
+#define __msa_bmnzi_b __builtin_msa_bmnzi_b
+#define __msa_bmz_v __builtin_msa_bmz_v
+#define __msa_bmzi_b __builtin_msa_bmzi_b
+#define __msa_bsel_v __builtin_msa_bsel_v
+#define __msa_bseli_b __builtin_msa_bseli_b
+#define __msa_shf_b __builtin_msa_shf_b
+#define __msa_shf_h __builtin_msa_shf_h
+#define __msa_shf_w __builtin_msa_shf_w
+#define __msa_test_bnz_v __builtin_msa_bnz_v
+#define __msa_test_bz_v __builtin_msa_bz_v
+#define __msa_fill_b __builtin_msa_fill_b
+#define __msa_fill_h __builtin_msa_fill_h
+#define __msa_fill_w __builtin_msa_fill_w
+#define __msa_fill_d __builtin_msa_fill_d
+#define __msa_pcnt_b __builtin_msa_pcnt_b
+#define __msa_pcnt_h __builtin_msa_pcnt_h
+#define __msa_pcnt_w __builtin_msa_pcnt_w
+#define __msa_pcnt_d __builtin_msa_pcnt_d
+#define __msa_nloc_b __builtin_msa_nloc_b
+#define __msa_nloc_h __builtin_msa_nloc_h
+#define __msa_nloc_w __builtin_msa_nloc_w
+#define __msa_nloc_d __builtin_msa_nloc_d
+#define __msa_nlzc_b __builtin_msa_nlzc_b
+#define __msa_nlzc_h __builtin_msa_nlzc_h
+#define __msa_nlzc_w __builtin_msa_nlzc_w
+#define __msa_nlzc_d __builtin_msa_nlzc_d
+#define __msa_copy_s_b __builtin_msa_copy_s_b
+#define __msa_copy_s_h __builtin_msa_copy_s_h
+#define __msa_copy_s_w __builtin_msa_copy_s_w
+#define __msa_copy_s_d __builtin_msa_copy_s_d
+#define __msa_copy_u_b __builtin_msa_copy_u_b
+#define __msa_copy_u_h __builtin_msa_copy_u_h
+#define __msa_copy_u_w __builtin_msa_copy_u_w
+#define __msa_copy_u_d __builtin_msa_copy_u_d
+#define __msa_insert_b __builtin_msa_insert_b
+#define __msa_insert_h __builtin_msa_insert_h
+#define __msa_insert_w __builtin_msa_insert_w
+#define __msa_insert_d __builtin_msa_insert_d
+#define __msa_insve_b __builtin_msa_insve_b
+#define __msa_insve_h __builtin_msa_insve_h
+#define __msa_insve_w __builtin_msa_insve_w
+#define __msa_insve_d __builtin_msa_insve_d
+#define __msa_test_bnz_b __builtin_msa_bnz_b
+#define __msa_test_bnz_h __builtin_msa_bnz_h
+#define __msa_test_bnz_w __builtin_msa_bnz_w
+#define __msa_test_bnz_d __builtin_msa_bnz_d
+#define __msa_test_bz_b __builtin_msa_bz_b
+#define __msa_test_bz_h __builtin_msa_bz_h
+#define __msa_test_bz_w __builtin_msa_bz_w
+#define __msa_test_bz_d __builtin_msa_bz_d
+#define __msa_ldi_b __builtin_msa_ldi_b
+#define __msa_ldi_h __builtin_msa_ldi_h
+#define __msa_ldi_w __builtin_msa_ldi_w
+#define __msa_ldi_d __builtin_msa_ldi_d
+#define __msa_fcaf_w __builtin_msa_fcaf_w
+#define __msa_fcaf_d __builtin_msa_fcaf_d
+#define __msa_fcor_w __builtin_msa_fcor_w
+#define __msa_fcor_d __builtin_msa_fcor_d
+#define __msa_fcun_w __builtin_msa_fcun_w
+#define __msa_fcun_d __builtin_msa_fcun_d
+#define __msa_fcune_w __builtin_msa_fcune_w
+#define __msa_fcune_d __builtin_msa_fcune_d
+#define __msa_fcueq_w __builtin_msa_fcueq_w
+#define __msa_fcueq_d __builtin_msa_fcueq_d
+#define __msa_fceq_w __builtin_msa_fceq_w
+#define __msa_fceq_d __builtin_msa_fceq_d
+#define __msa_fcne_w __builtin_msa_fcne_w
+#define __msa_fcne_d __builtin_msa_fcne_d
+#define __msa_fclt_w __builtin_msa_fclt_w
+#define __msa_fclt_d __builtin_msa_fclt_d
+#define __msa_fcult_w __builtin_msa_fcult_w
+#define __msa_fcult_d __builtin_msa_fcult_d
+#define __msa_fcle_w __builtin_msa_fcle_w
+#define __msa_fcle_d __builtin_msa_fcle_d
+#define __msa_fcule_w __builtin_msa_fcule_w
+#define __msa_fcule_d __builtin_msa_fcule_d
+#define __msa_fsaf_w __builtin_msa_fsaf_w
+#define __msa_fsaf_d __builtin_msa_fsaf_d
+#define __msa_fsor_w __builtin_msa_fsor_w
+#define __msa_fsor_d __builtin_msa_fsor_d
+#define __msa_fsun_w __builtin_msa_fsun_w
+#define __msa_fsun_d __builtin_msa_fsun_d
+#define __msa_fsune_w __builtin_msa_fsune_w
+#define __msa_fsune_d __builtin_msa_fsune_d
+#define __msa_fsueq_w __builtin_msa_fsueq_w
+#define __msa_fsueq_d __builtin_msa_fsueq_d
+#define __msa_fseq_w __builtin_msa_fseq_w
+#define __msa_fseq_d __builtin_msa_fseq_d
+#define __msa_fsne_w __builtin_msa_fsne_w
+#define __msa_fsne_d __builtin_msa_fsne_d
+#define __msa_fslt_w __builtin_msa_fslt_w
+#define __msa_fslt_d __builtin_msa_fslt_d
+#define __msa_fsult_w __builtin_msa_fsult_w
+#define __msa_fsult_d __builtin_msa_fsult_d
+#define __msa_fsle_w __builtin_msa_fsle_w
+#define __msa_fsle_d __builtin_msa_fsle_d
+#define __msa_fsule_w __builtin_msa_fsule_w
+#define __msa_fsule_d __builtin_msa_fsule_d
+#define __msa_fadd_w __builtin_msa_fadd_w
+#define __msa_fadd_d __builtin_msa_fadd_d
+#define __msa_fsub_w __builtin_msa_fsub_w
+#define __msa_fsub_d __builtin_msa_fsub_d
+#define __msa_fmul_w __builtin_msa_fmul_w
+#define __msa_fmul_d __builtin_msa_fmul_d
+#define __msa_fdiv_w __builtin_msa_fdiv_w
+#define __msa_fdiv_d __builtin_msa_fdiv_d
+#define __msa_fmadd_w __builtin_msa_fmadd_w
+#define __msa_fmadd_d __builtin_msa_fmadd_d
+#define __msa_fmsub_w __builtin_msa_fmsub_w
+#define __msa_fmsub_d __builtin_msa_fmsub_d
+#define __msa_fexp2_w __builtin_msa_fexp2_w
+#define __msa_fexp2_d __builtin_msa_fexp2_d
+#define __msa_fexdo_h __builtin_msa_fexdo_h
+#define __msa_fexdo_w __builtin_msa_fexdo_w
+#define __msa_ftq_h __builtin_msa_ftq_h
+#define __msa_ftq_w __builtin_msa_ftq_w
+#define __msa_fmin_w __builtin_msa_fmin_w
+#define __msa_fmin_d __builtin_msa_fmin_d
+#define __msa_fmin_a_w __builtin_msa_fmin_a_w
+#define __msa_fmin_a_d __builtin_msa_fmin_a_d
+#define __msa_fmax_w __builtin_msa_fmax_w
+#define __msa_fmax_d __builtin_msa_fmax_d
+#define __msa_fmax_a_w __builtin_msa_fmax_a_w
+#define __msa_fmax_a_d __builtin_msa_fmax_a_d
+#define __msa_mul_q_h __builtin_msa_mul_q_h
+#define __msa_mul_q_w __builtin_msa_mul_q_w
+#define __msa_mulr_q_h __builtin_msa_mulr_q_h
+#define __msa_mulr_q_w __builtin_msa_mulr_q_w
+#define __msa_madd_q_h __builtin_msa_madd_q_h
+#define __msa_madd_q_w __builtin_msa_madd_q_w
+#define __msa_maddr_q_h __builtin_msa_maddr_q_h
+#define __msa_maddr_q_w __builtin_msa_maddr_q_w
+#define __msa_msub_q_h __builtin_msa_msub_q_h
+#define __msa_msub_q_w __builtin_msa_msub_q_w
+#define __msa_msubr_q_h __builtin_msa_msubr_q_h
+#define __msa_msubr_q_w __builtin_msa_msubr_q_w
+#define __msa_fclass_w __builtin_msa_fclass_w
+#define __msa_fclass_d __builtin_msa_fclass_d
+#define __msa_fsqrt_w __builtin_msa_fsqrt_w
+#define __msa_fsqrt_d __builtin_msa_fsqrt_d
+#define __msa_frcp_w __builtin_msa_frcp_w
+#define __msa_frcp_d __builtin_msa_frcp_d
+#define __msa_frint_w __builtin_msa_frint_w
+#define __msa_frint_d __builtin_msa_frint_d
+#define __msa_frsqrt_w __builtin_msa_frsqrt_w
+#define __msa_frsqrt_d __builtin_msa_frsqrt_d
+#define __msa_flog2_w __builtin_msa_flog2_w
+#define __msa_flog2_d __builtin_msa_flog2_d
+#define __msa_fexupl_w __builtin_msa_fexupl_w
+#define __msa_fexupl_d __builtin_msa_fexupl_d
+#define __msa_fexupr_w __builtin_msa_fexupr_w
+#define __msa_fexupr_d __builtin_msa_fexupr_d
+#define __msa_ffql_w __builtin_msa_ffql_w
+#define __msa_ffql_d __builtin_msa_ffql_d
+#define __msa_ffqr_w __builtin_msa_ffqr_w
+#define __msa_ffqr_d __builtin_msa_ffqr_d
+#define __msa_ftint_s_w __builtin_msa_ftint_s_w
+#define __msa_ftint_s_d __builtin_msa_ftint_s_d
+#define __msa_ftint_u_w __builtin_msa_ftint_u_w
+#define __msa_ftint_u_d __builtin_msa_ftint_u_d
+#define __msa_ftrunc_s_w __builtin_msa_ftrunc_s_w
+#define __msa_ftrunc_s_d __builtin_msa_ftrunc_s_d
+#define __msa_ftrunc_u_w __builtin_msa_ftrunc_u_w
+#define __msa_ftrunc_u_d __builtin_msa_ftrunc_u_d
+#define __msa_ffint_s_w __builtin_msa_ffint_s_w
+#define __msa_ffint_s_d __builtin_msa_ffint_s_d
+#define __msa_ffint_u_w __builtin_msa_ffint_u_w
+#define __msa_ffint_u_d __builtin_msa_ffint_u_d
+#define __msa_cfcmsa __builtin_msa_cfcmsa
+#define __msa_move_v __builtin_msa_move_v
+#define __msa_cast_to_vector_float __builtin_msa_cast_to_vector_float
+#define __msa_cast_to_vector_double __builtin_msa_cast_to_vector_double
+#define __msa_cast_to_scalar_float __builtin_msa_cast_to_scalar_float
+#define __msa_cast_to_scalar_double __builtin_msa_cast_to_scalar_double
+#endif /* defined(__mips_msa) */
+#endif /* _MSA_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mwaitxintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mwaitxintrin.h
new file mode 100644
index 000000000..635f2ac6c
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/mwaitxintrin.h
@@ -0,0 +1,47 @@
+/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _MWAITXINTRIN_H
+#define _MWAITXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitorx(void const * __p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitorx((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
+{
+  __builtin_ia32_mwaitx(__extensions, __hints, __clock);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _MWAITXINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/nmmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/nmmintrin.h
new file mode 100644
index 000000000..57fec1596
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/nmmintrin.h
@@ -0,0 +1,30 @@
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _NMMINTRIN_H
+#define _NMMINTRIN_H
+
+/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
+   just include it now then.  */
+#include <smmintrin.h>
+#endif /* _NMMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/opencl-c.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/opencl-c.h
new file mode 100644
index 000000000..0c25d3127
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/opencl-c.h
@@ -0,0 +1,17055 @@
+//===--- opencl-c.h - OpenCL C language builtin function header -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OPENCL_H_
+#define _OPENCL_H_
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#ifndef cl_khr_depth_images
+#define cl_khr_depth_images
+#endif //cl_khr_depth_images
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#define __ovld __attribute__((overloadable))
+#define __conv __attribute__((convergent))
+
+// Optimizations
+#define __purefn __attribute__((pure))
+#define __cnfn __attribute__((const))
+
+// built-in scalar data types:
+
+/**
+ * An unsigned 8-bit integer.
+ */
+typedef unsigned char uchar;
+
+/**
+ * An unsigned 16-bit integer.
+ */
+typedef unsigned short ushort;
+
+/**
+ * An unsigned 32-bit integer.
+ */
+typedef unsigned int uint;
+
+/**
+ * An unsigned 64-bit integer.
+ */
+typedef unsigned long ulong;
+
+/**
+ * The unsigned integer type of the result of the sizeof operator. This
+ * is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS
+ * defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if
+ * CL_DEVICE_ADDRESS_BITS is 64-bits.
+ */
+typedef __SIZE_TYPE__ size_t;
+
+/**
+ * A signed integer type that is the result of subtracting two pointers.
+ * This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS
+ * defined in table 4.3 is 32-bits and is a 64-bit signed integer if
+ * CL_DEVICE_ADDRESS_BITS is 64-bits.
+ */
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+
+/**
+* A signed integer type with the property that any valid pointer to
+* void can be converted to this type, then converted back to pointer
+* to void, and the result will compare equal to the original pointer.
+*/
+typedef __INTPTR_TYPE__ intptr_t;
+
+/**
+* An unsigned integer type with the property that any valid pointer to
+* void can be converted to this type, then converted back to pointer
+* to void, and the result will compare equal to the original pointer.
+*/
+typedef __UINTPTR_TYPE__ uintptr_t;
+
+// built-in vector data types:
+typedef char char2 __attribute__((ext_vector_type(2)));
+typedef char char3 __attribute__((ext_vector_type(3)));
+typedef char char4 __attribute__((ext_vector_type(4)));
+typedef char char8 __attribute__((ext_vector_type(8)));
+typedef char char16 __attribute__((ext_vector_type(16)));
+typedef uchar uchar2 __attribute__((ext_vector_type(2)));
+typedef uchar uchar3 __attribute__((ext_vector_type(3)));
+typedef uchar uchar4 __attribute__((ext_vector_type(4)));
+typedef uchar uchar8 __attribute__((ext_vector_type(8)));
+typedef uchar uchar16 __attribute__((ext_vector_type(16)));
+typedef short short2 __attribute__((ext_vector_type(2)));
+typedef short short3 __attribute__((ext_vector_type(3)));
+typedef short short4 __attribute__((ext_vector_type(4)));
+typedef short short8 __attribute__((ext_vector_type(8)));
+typedef short short16 __attribute__((ext_vector_type(16)));
+typedef ushort ushort2 __attribute__((ext_vector_type(2)));
+typedef ushort ushort3 __attribute__((ext_vector_type(3)));
+typedef ushort ushort4 __attribute__((ext_vector_type(4)));
+typedef ushort ushort8 __attribute__((ext_vector_type(8)));
+typedef ushort ushort16 __attribute__((ext_vector_type(16)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int int8 __attribute__((ext_vector_type(8)));
+typedef int int16 __attribute__((ext_vector_type(16)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
+typedef uint uint3 __attribute__((ext_vector_type(3)));
+typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef uint uint8 __attribute__((ext_vector_type(8)));
+typedef uint uint16 __attribute__((ext_vector_type(16)));
+typedef long long2 __attribute__((ext_vector_type(2)));
+typedef long long3 __attribute__((ext_vector_type(3)));
+typedef long long4 __attribute__((ext_vector_type(4)));
+typedef long long8 __attribute__((ext_vector_type(8)));
+typedef long long16 __attribute__((ext_vector_type(16)));
+typedef ulong ulong2 __attribute__((ext_vector_type(2)));
+typedef ulong ulong3 __attribute__((ext_vector_type(3)));
+typedef ulong ulong4 __attribute__((ext_vector_type(4)));
+typedef ulong ulong8 __attribute__((ext_vector_type(8)));
+typedef ulong ulong16 __attribute__((ext_vector_type(16)));
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef float float3 __attribute__((ext_vector_type(3)));
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef float float8 __attribute__((ext_vector_type(8)));
+typedef float float16 __attribute__((ext_vector_type(16)));
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+typedef half half2 __attribute__((ext_vector_type(2)));
+typedef half half3 __attribute__((ext_vector_type(3)));
+typedef half half4 __attribute__((ext_vector_type(4)));
+typedef half half8 __attribute__((ext_vector_type(8)));
+typedef half half16 __attribute__((ext_vector_type(16)));
+#endif
+#ifdef cl_khr_fp64
+#if __OPENCL_C_VERSION__ < CL_VERSION_1_2
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+typedef double double2 __attribute__((ext_vector_type(2)));
+typedef double double3 __attribute__((ext_vector_type(3)));
+typedef double double4 __attribute__((ext_vector_type(4)));
+typedef double double8 __attribute__((ext_vector_type(8)));
+typedef double double16 __attribute__((ext_vector_type(16)));
+#endif
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#define NULL ((void*)0)
+#endif
+
+/**
+ * Value of maximum non-infinite single-precision floating-point
+ * number.
+ */
+#define MAXFLOAT 0x1.fffffep127f
+
+/**
+ * A positive float constant expression. HUGE_VALF evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VALF (__builtin_huge_valf())
+
+/**
+ * A positive double constant expression. HUGE_VAL evaluates
+ * to +infinity. Used as an error value returned by the built-in
+ * math functions.
+ */
+#define HUGE_VAL (__builtin_huge_val())
+
+/**
+ * A constant expression of type float representing positive or
+ * unsigned infinity.
+ */
+#define INFINITY (__builtin_inff())
+
+/**
+ * A constant expression of type float representing a quiet NaN.
+ */
+#define NAN as_float(INT_MAX)
+
+#define FP_ILOGB0    INT_MIN
+#define FP_ILOGBNAN    INT_MAX
+
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+
+#define M_E_F         2.71828182845904523536028747135266250f
+#define M_LOG2E_F     1.44269504088896340735992468100189214f
+#define M_LOG10E_F    0.434294481903251827651128918916605082f
+#define M_LN2_F       0.693147180559945309417232121458176568f
+#define M_LN10_F      2.30258509299404568401799145468436421f
+#define M_PI_F        3.14159265358979323846264338327950288f
+#define M_PI_2_F      1.57079632679489661923132169163975144f
+#define M_PI_4_F      0.785398163397448309615660845819875721f
+#define M_1_PI_F      0.318309886183790671537767526745028724f
+#define M_2_PI_F      0.636619772367581343075535053490057448f
+#define M_2_SQRTPI_F  1.12837916709551257389615890312154517f
+#define M_SQRT2_F     1.41421356237309504880168872420969808f
+#define M_SQRT1_2_F   0.707106781186547524400844362104849039f
+
+#define DBL_DIG 15
+#define DBL_MANT_DIG 53
+#define DBL_MAX_10_EXP +308
+#define DBL_MAX_EXP +1024
+#define DBL_MIN_10_EXP -307
+#define DBL_MIN_EXP -1021
+#define DBL_RADIX 2
+#define DBL_MAX 0x1.fffffffffffffp1023
+#define DBL_MIN 0x1.0p-1022
+#define DBL_EPSILON 0x1.0p-52
+
+#define M_E           0x1.5bf0a8b145769p+1
+#define M_LOG2E       0x1.71547652b82fep+0
+#define M_LOG10E      0x1.bcb7b1526e50ep-2
+#define M_LN2         0x1.62e42fefa39efp-1
+#define M_LN10        0x1.26bb1bbb55516p+1
+#define M_PI          0x1.921fb54442d18p+1
+#define M_PI_2        0x1.921fb54442d18p+0
+#define M_PI_4        0x1.921fb54442d18p-1
+#define M_1_PI        0x1.45f306dc9c883p-2
+#define M_2_PI        0x1.45f306dc9c883p-1
+#define M_2_SQRTPI    0x1.20dd750429b6dp+0
+#define M_SQRT2       0x1.6a09e667f3bcdp+0
+#define M_SQRT1_2     0x1.6a09e667f3bcdp-1
+
+#ifdef cl_khr_fp16
+
+#define HALF_DIG 3
+#define HALF_MANT_DIG 11
+#define HALF_MAX_10_EXP +4
+#define HALF_MAX_EXP +16
+#define HALF_MIN_10_EXP -4
+#define HALF_MIN_EXP -13
+#define HALF_RADIX 2
+#define HALF_MAX ((0x1.ffcp15h))
+#define HALF_MIN ((0x1.0p-14h))
+#define HALF_EPSILON ((0x1.0p-10h))
+
+#define M_E_H         2.71828182845904523536028747135266250h
+#define M_LOG2E_H     1.44269504088896340735992468100189214h
+#define M_LOG10E_H    0.434294481903251827651128918916605082h
+#define M_LN2_H       0.693147180559945309417232121458176568h
+#define M_LN10_H      2.30258509299404568401799145468436421h
+#define M_PI_H        3.14159265358979323846264338327950288h
+#define M_PI_2_H      1.57079632679489661923132169163975144h
+#define M_PI_4_H      0.785398163397448309615660845819875721h
+#define M_1_PI_H      0.318309886183790671537767526745028724h
+#define M_2_PI_H      0.636619772367581343075535053490057448h
+#define M_2_SQRTPI_H  1.12837916709551257389615890312154517h
+#define M_SQRT2_H     1.41421356237309504880168872420969808h
+#define M_SQRT1_2_H   0.707106781186547524400844362104849039h
+
+#endif //cl_khr_fp16
+
+#define CHAR_BIT    8
+#define SCHAR_MAX  127
+#define SCHAR_MIN  (-128)
+#define UCHAR_MAX  255
+#define CHAR_MAX  SCHAR_MAX
+#define CHAR_MIN  SCHAR_MIN
+#define USHRT_MAX  65535
+#define SHRT_MAX  32767
+#define SHRT_MIN  (-32768)
+#define UINT_MAX  0xffffffff
+#define INT_MAX    2147483647
+#define INT_MIN    (-2147483647-1)
+#define ULONG_MAX  0xffffffffffffffffUL
+#define LONG_MAX  0x7fffffffffffffffL
+#define LONG_MIN  (-0x7fffffffffffffffL-1)
+
+// OpenCL v1.1/1.2/2.0 s6.2.3 - Explicit conversions
+
+char __ovld __cnfn convert_char_rte(char);
+char __ovld __cnfn convert_char_sat_rte(char);
+char __ovld __cnfn convert_char_rtz(char);
+char __ovld __cnfn convert_char_sat_rtz(char);
+char __ovld __cnfn convert_char_rtp(char);
+char __ovld __cnfn convert_char_sat_rtp(char);
+char __ovld __cnfn convert_char_rtn(char);
+char __ovld __cnfn convert_char_sat_rtn(char);
+char __ovld __cnfn convert_char(char);
+char __ovld __cnfn convert_char_sat(char);
+char __ovld __cnfn convert_char_rte(uchar);
+char __ovld __cnfn convert_char_sat_rte(uchar);
+char __ovld __cnfn convert_char_rtz(uchar);
+char __ovld __cnfn convert_char_sat_rtz(uchar);
+char __ovld __cnfn convert_char_rtp(uchar);
+char __ovld __cnfn convert_char_sat_rtp(uchar);
+char __ovld __cnfn convert_char_rtn(uchar);
+char __ovld __cnfn convert_char_sat_rtn(uchar);
+char __ovld __cnfn convert_char(uchar);
+char __ovld __cnfn convert_char_sat(uchar);
+char __ovld __cnfn convert_char_rte(short);
+char __ovld __cnfn convert_char_sat_rte(short);
+char __ovld __cnfn convert_char_rtz(short);
+char __ovld __cnfn convert_char_sat_rtz(short);
+char __ovld __cnfn convert_char_rtp(short);
+char __ovld __cnfn convert_char_sat_rtp(short);
+char __ovld __cnfn convert_char_rtn(short);
+char __ovld __cnfn convert_char_sat_rtn(short);
+char __ovld __cnfn convert_char(short);
+char __ovld __cnfn convert_char_sat(short);
+char __ovld __cnfn convert_char_rte(ushort);
+char __ovld __cnfn convert_char_sat_rte(ushort);
+char __ovld __cnfn convert_char_rtz(ushort);
+char __ovld __cnfn convert_char_sat_rtz(ushort);
+char __ovld __cnfn convert_char_rtp(ushort);
+char __ovld __cnfn convert_char_sat_rtp(ushort);
+char __ovld __cnfn convert_char_rtn(ushort);
+char __ovld __cnfn convert_char_sat_rtn(ushort);
+char __ovld __cnfn convert_char(ushort);
+char __ovld __cnfn convert_char_sat(ushort);
+char __ovld __cnfn convert_char_rte(int);
+char __ovld __cnfn convert_char_sat_rte(int);
+char __ovld __cnfn convert_char_rtz(int);
+char __ovld __cnfn convert_char_sat_rtz(int);
+char __ovld __cnfn convert_char_rtp(int);
+char __ovld __cnfn convert_char_sat_rtp(int);
+char __ovld __cnfn convert_char_rtn(int);
+char __ovld __cnfn convert_char_sat_rtn(int);
+char __ovld __cnfn convert_char(int);
+char __ovld __cnfn convert_char_sat(int);
+char __ovld __cnfn convert_char_rte(uint);
+char __ovld __cnfn convert_char_sat_rte(uint);
+char __ovld __cnfn convert_char_rtz(uint);
+char __ovld __cnfn convert_char_sat_rtz(uint);
+char __ovld __cnfn convert_char_rtp(uint);
+char __ovld __cnfn convert_char_sat_rtp(uint);
+char __ovld __cnfn convert_char_rtn(uint);
+char __ovld __cnfn convert_char_sat_rtn(uint);
+char __ovld __cnfn convert_char(uint);
+char __ovld __cnfn convert_char_sat(uint);
+char __ovld __cnfn convert_char_rte(long);
+char __ovld __cnfn convert_char_sat_rte(long);
+char __ovld __cnfn convert_char_rtz(long);
+char __ovld __cnfn convert_char_sat_rtz(long);
+char __ovld __cnfn convert_char_rtp(long);
+char __ovld __cnfn convert_char_sat_rtp(long);
+char __ovld __cnfn convert_char_rtn(long);
+char __ovld __cnfn convert_char_sat_rtn(long);
+char __ovld __cnfn convert_char(long);
+char __ovld __cnfn convert_char_sat(long);
+char __ovld __cnfn convert_char_rte(ulong);
+char __ovld __cnfn convert_char_sat_rte(ulong);
+char __ovld __cnfn convert_char_rtz(ulong);
+char __ovld __cnfn convert_char_sat_rtz(ulong);
+char __ovld __cnfn convert_char_rtp(ulong);
+char __ovld __cnfn convert_char_sat_rtp(ulong);
+char __ovld __cnfn convert_char_rtn(ulong);
+char __ovld __cnfn convert_char_sat_rtn(ulong);
+char __ovld __cnfn convert_char(ulong);
+char __ovld __cnfn convert_char_sat(ulong);
+char __ovld __cnfn convert_char_rte(float);
+char __ovld __cnfn convert_char_sat_rte(float);
+char __ovld __cnfn convert_char_rtz(float);
+char __ovld __cnfn convert_char_sat_rtz(float);
+char __ovld __cnfn convert_char_rtp(float);
+char __ovld __cnfn convert_char_sat_rtp(float);
+char __ovld __cnfn convert_char_rtn(float);
+char __ovld __cnfn convert_char_sat_rtn(float);
+char __ovld __cnfn convert_char(float);
+char __ovld __cnfn convert_char_sat(float);
+uchar __ovld __cnfn convert_uchar_rte(char);
+uchar __ovld __cnfn convert_uchar_sat_rte(char);
+uchar __ovld __cnfn convert_uchar_rtz(char);
+uchar __ovld __cnfn convert_uchar_sat_rtz(char);
+uchar __ovld __cnfn convert_uchar_rtp(char);
+uchar __ovld __cnfn convert_uchar_sat_rtp(char);
+uchar __ovld __cnfn convert_uchar_rtn(char);
+uchar __ovld __cnfn convert_uchar_sat_rtn(char);
+uchar __ovld __cnfn convert_uchar(char);
+uchar __ovld __cnfn convert_uchar_sat(char);
+uchar __ovld __cnfn convert_uchar_rte(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rte(uchar);
+uchar __ovld __cnfn convert_uchar_rtz(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rtz(uchar);
+uchar __ovld __cnfn convert_uchar_rtp(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rtp(uchar);
+uchar __ovld __cnfn convert_uchar_rtn(uchar);
+uchar __ovld __cnfn convert_uchar_sat_rtn(uchar);
+uchar __ovld __cnfn convert_uchar(uchar);
+uchar __ovld __cnfn convert_uchar_sat(uchar);
+uchar __ovld __cnfn convert_uchar_rte(short);
+uchar __ovld __cnfn convert_uchar_sat_rte(short);
+uchar __ovld __cnfn convert_uchar_rtz(short);
+uchar __ovld __cnfn convert_uchar_sat_rtz(short);
+uchar __ovld __cnfn convert_uchar_rtp(short);
+uchar __ovld __cnfn convert_uchar_sat_rtp(short);
+uchar __ovld __cnfn convert_uchar_rtn(short);
+uchar __ovld __cnfn convert_uchar_sat_rtn(short);
+uchar __ovld __cnfn convert_uchar(short);
+uchar __ovld __cnfn convert_uchar_sat(short);
+uchar __ovld __cnfn convert_uchar_rte(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rte(ushort);
+uchar __ovld __cnfn convert_uchar_rtz(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rtz(ushort);
+uchar __ovld __cnfn convert_uchar_rtp(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rtp(ushort);
+uchar __ovld __cnfn convert_uchar_rtn(ushort);
+uchar __ovld __cnfn convert_uchar_sat_rtn(ushort);
+uchar __ovld __cnfn convert_uchar(ushort);
+uchar __ovld __cnfn convert_uchar_sat(ushort);
+uchar __ovld __cnfn convert_uchar_rte(int);
+uchar __ovld __cnfn convert_uchar_sat_rte(int);
+uchar __ovld __cnfn convert_uchar_rtz(int);
+uchar __ovld __cnfn convert_uchar_sat_rtz(int);
+uchar __ovld __cnfn convert_uchar_rtp(int);
+uchar __ovld __cnfn convert_uchar_sat_rtp(int);
+uchar __ovld __cnfn convert_uchar_rtn(int);
+uchar __ovld __cnfn convert_uchar_sat_rtn(int);
+uchar __ovld __cnfn convert_uchar(int);
+uchar __ovld __cnfn convert_uchar_sat(int);
+uchar __ovld __cnfn convert_uchar_rte(uint);
+uchar __ovld __cnfn convert_uchar_sat_rte(uint);
+uchar __ovld __cnfn convert_uchar_rtz(uint);
+uchar __ovld __cnfn convert_uchar_sat_rtz(uint);
+uchar __ovld __cnfn convert_uchar_rtp(uint);
+uchar __ovld __cnfn convert_uchar_sat_rtp(uint);
+uchar __ovld __cnfn convert_uchar_rtn(uint);
+uchar __ovld __cnfn convert_uchar_sat_rtn(uint);
+uchar __ovld __cnfn convert_uchar(uint);
+uchar __ovld __cnfn convert_uchar_sat(uint);
+uchar __ovld __cnfn convert_uchar_rte(long);
+uchar __ovld __cnfn convert_uchar_sat_rte(long);
+uchar __ovld __cnfn convert_uchar_rtz(long);
+uchar __ovld __cnfn convert_uchar_sat_rtz(long);
+uchar __ovld __cnfn convert_uchar_rtp(long);
+uchar __ovld __cnfn convert_uchar_sat_rtp(long);
+uchar __ovld __cnfn convert_uchar_rtn(long);
+uchar __ovld __cnfn convert_uchar_sat_rtn(long);
+uchar __ovld __cnfn convert_uchar(long);
+uchar __ovld __cnfn convert_uchar_sat(long);
+uchar __ovld __cnfn convert_uchar_rte(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rte(ulong);
+uchar __ovld __cnfn convert_uchar_rtz(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rtz(ulong);
+uchar __ovld __cnfn convert_uchar_rtp(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rtp(ulong);
+uchar __ovld __cnfn convert_uchar_rtn(ulong);
+uchar __ovld __cnfn convert_uchar_sat_rtn(ulong);
+uchar __ovld __cnfn convert_uchar(ulong);
+uchar __ovld __cnfn convert_uchar_sat(ulong);
+uchar __ovld __cnfn convert_uchar_rte(float);
+uchar __ovld __cnfn convert_uchar_sat_rte(float);
+uchar __ovld __cnfn convert_uchar_rtz(float);
+uchar __ovld __cnfn convert_uchar_sat_rtz(float);
+uchar __ovld __cnfn convert_uchar_rtp(float);
+uchar __ovld __cnfn convert_uchar_sat_rtp(float);
+uchar __ovld __cnfn convert_uchar_rtn(float);
+uchar __ovld __cnfn convert_uchar_sat_rtn(float);
+uchar __ovld __cnfn convert_uchar(float);
+uchar __ovld __cnfn convert_uchar_sat(float);
+
+short __ovld __cnfn convert_short_rte(char);
+short __ovld __cnfn convert_short_sat_rte(char);
+short __ovld __cnfn convert_short_rtz(char);
+short __ovld __cnfn convert_short_sat_rtz(char);
+short __ovld __cnfn convert_short_rtp(char);
+short __ovld __cnfn convert_short_sat_rtp(char);
+short __ovld __cnfn convert_short_rtn(char);
+short __ovld __cnfn convert_short_sat_rtn(char);
+short __ovld __cnfn convert_short(char);
+short __ovld __cnfn convert_short_sat(char);
+short __ovld __cnfn convert_short_rte(uchar);
+short __ovld __cnfn convert_short_sat_rte(uchar);
+short __ovld __cnfn convert_short_rtz(uchar);
+short __ovld __cnfn convert_short_sat_rtz(uchar);
+short __ovld __cnfn convert_short_rtp(uchar);
+short __ovld __cnfn convert_short_sat_rtp(uchar);
+short __ovld __cnfn convert_short_rtn(uchar);
+short __ovld __cnfn convert_short_sat_rtn(uchar);
+short __ovld __cnfn convert_short(uchar);
+short __ovld __cnfn convert_short_sat(uchar);
+short __ovld __cnfn convert_short_rte(short);
+short __ovld __cnfn convert_short_sat_rte(short);
+short __ovld __cnfn convert_short_rtz(short);
+short __ovld __cnfn convert_short_sat_rtz(short);
+short __ovld __cnfn convert_short_rtp(short);
+short __ovld __cnfn convert_short_sat_rtp(short);
+short __ovld __cnfn convert_short_rtn(short);
+short __ovld __cnfn convert_short_sat_rtn(short);
+short __ovld __cnfn convert_short(short);
+short __ovld __cnfn convert_short_sat(short);
+short __ovld __cnfn convert_short_rte(ushort);
+short __ovld __cnfn convert_short_sat_rte(ushort);
+short __ovld __cnfn convert_short_rtz(ushort);
+short __ovld __cnfn convert_short_sat_rtz(ushort);
+short __ovld __cnfn convert_short_rtp(ushort);
+short __ovld __cnfn convert_short_sat_rtp(ushort);
+short __ovld __cnfn convert_short_rtn(ushort);
+short __ovld __cnfn convert_short_sat_rtn(ushort);
+short __ovld __cnfn convert_short(ushort);
+short __ovld __cnfn convert_short_sat(ushort);
+short __ovld __cnfn convert_short_rte(int);
+short __ovld __cnfn convert_short_sat_rte(int);
+short __ovld __cnfn convert_short_rtz(int);
+short __ovld __cnfn convert_short_sat_rtz(int);
+short __ovld __cnfn convert_short_rtp(int);
+short __ovld __cnfn convert_short_sat_rtp(int);
+short __ovld __cnfn convert_short_rtn(int);
+short __ovld __cnfn convert_short_sat_rtn(int);
+short __ovld __cnfn convert_short(int);
+short __ovld __cnfn convert_short_sat(int);
+short __ovld __cnfn convert_short_rte(uint);
+short __ovld __cnfn convert_short_sat_rte(uint);
+short __ovld __cnfn convert_short_rtz(uint);
+short __ovld __cnfn convert_short_sat_rtz(uint);
+short __ovld __cnfn convert_short_rtp(uint);
+short __ovld __cnfn convert_short_sat_rtp(uint);
+short __ovld __cnfn convert_short_rtn(uint);
+short __ovld __cnfn convert_short_sat_rtn(uint);
+short __ovld __cnfn convert_short(uint);
+short __ovld __cnfn convert_short_sat(uint);
+short __ovld __cnfn convert_short_rte(long);
+short __ovld __cnfn convert_short_sat_rte(long);
+short __ovld __cnfn convert_short_rtz(long);
+short __ovld __cnfn convert_short_sat_rtz(long);
+short __ovld __cnfn convert_short_rtp(long);
+short __ovld __cnfn convert_short_sat_rtp(long);
+short __ovld __cnfn convert_short_rtn(long);
+short __ovld __cnfn convert_short_sat_rtn(long);
+short __ovld __cnfn convert_short(long);
+short __ovld __cnfn convert_short_sat(long);
+short __ovld __cnfn convert_short_rte(ulong);
+short __ovld __cnfn convert_short_sat_rte(ulong);
+short __ovld __cnfn convert_short_rtz(ulong);
+short __ovld __cnfn convert_short_sat_rtz(ulong);
+short __ovld __cnfn convert_short_rtp(ulong);
+short __ovld __cnfn convert_short_sat_rtp(ulong);
+short __ovld __cnfn convert_short_rtn(ulong);
+short __ovld __cnfn convert_short_sat_rtn(ulong);
+short __ovld __cnfn convert_short(ulong);
+short __ovld __cnfn convert_short_sat(ulong);
+short __ovld __cnfn convert_short_rte(float);
+short __ovld __cnfn convert_short_sat_rte(float);
+short __ovld __cnfn convert_short_rtz(float);
+short __ovld __cnfn convert_short_sat_rtz(float);
+short __ovld __cnfn convert_short_rtp(float);
+short __ovld __cnfn convert_short_sat_rtp(float);
+short __ovld __cnfn convert_short_rtn(float);
+short __ovld __cnfn convert_short_sat_rtn(float);
+short __ovld __cnfn convert_short(float);
+short __ovld __cnfn convert_short_sat(float);
+ushort __ovld __cnfn convert_ushort_rte(char);
+ushort __ovld __cnfn convert_ushort_sat_rte(char);
+ushort __ovld __cnfn convert_ushort_rtz(char);
+ushort __ovld __cnfn convert_ushort_sat_rtz(char);
+ushort __ovld __cnfn convert_ushort_rtp(char);
+ushort __ovld __cnfn convert_ushort_sat_rtp(char);
+ushort __ovld __cnfn convert_ushort_rtn(char);
+ushort __ovld __cnfn convert_ushort_sat_rtn(char);
+ushort __ovld __cnfn convert_ushort(char);
+ushort __ovld __cnfn convert_ushort_sat(char);
+ushort __ovld __cnfn convert_ushort_rte(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rte(uchar);
+ushort __ovld __cnfn convert_ushort_rtz(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rtz(uchar);
+ushort __ovld __cnfn convert_ushort_rtp(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rtp(uchar);
+ushort __ovld __cnfn convert_ushort_rtn(uchar);
+ushort __ovld __cnfn convert_ushort_sat_rtn(uchar);
+ushort __ovld __cnfn convert_ushort(uchar);
+ushort __ovld __cnfn convert_ushort_sat(uchar);
+ushort __ovld __cnfn convert_ushort_rte(short);
+ushort __ovld __cnfn convert_ushort_sat_rte(short);
+ushort __ovld __cnfn convert_ushort_rtz(short);
+ushort __ovld __cnfn convert_ushort_sat_rtz(short);
+ushort __ovld __cnfn convert_ushort_rtp(short);
+ushort __ovld __cnfn convert_ushort_sat_rtp(short);
+ushort __ovld __cnfn convert_ushort_rtn(short);
+ushort __ovld __cnfn convert_ushort_sat_rtn(short);
+ushort __ovld __cnfn convert_ushort(short);
+ushort __ovld __cnfn convert_ushort_sat(short);
+ushort __ovld __cnfn convert_ushort_rte(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rte(ushort);
+ushort __ovld __cnfn convert_ushort_rtz(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rtz(ushort);
+ushort __ovld __cnfn convert_ushort_rtp(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rtp(ushort);
+ushort __ovld __cnfn convert_ushort_rtn(ushort);
+ushort __ovld __cnfn convert_ushort_sat_rtn(ushort);
+ushort __ovld __cnfn convert_ushort(ushort);
+ushort __ovld __cnfn convert_ushort_sat(ushort);
+ushort __ovld __cnfn convert_ushort_rte(int);
+ushort __ovld __cnfn convert_ushort_sat_rte(int);
+ushort __ovld __cnfn convert_ushort_rtz(int);
+ushort __ovld __cnfn convert_ushort_sat_rtz(int);
+ushort __ovld __cnfn convert_ushort_rtp(int);
+ushort __ovld __cnfn convert_ushort_sat_rtp(int);
+ushort __ovld __cnfn convert_ushort_rtn(int);
+ushort __ovld __cnfn convert_ushort_sat_rtn(int);
+ushort __ovld __cnfn convert_ushort(int);
+ushort __ovld __cnfn convert_ushort_sat(int);
+ushort __ovld __cnfn convert_ushort_rte(uint);
+ushort __ovld __cnfn convert_ushort_sat_rte(uint);
+ushort __ovld __cnfn convert_ushort_rtz(uint);
+ushort __ovld __cnfn convert_ushort_sat_rtz(uint);
+ushort __ovld __cnfn convert_ushort_rtp(uint);
+ushort __ovld __cnfn convert_ushort_sat_rtp(uint);
+ushort __ovld __cnfn convert_ushort_rtn(uint);
+ushort __ovld __cnfn convert_ushort_sat_rtn(uint);
+ushort __ovld __cnfn convert_ushort(uint);
+ushort __ovld __cnfn convert_ushort_sat(uint);
+ushort __ovld __cnfn convert_ushort_rte(long);
+ushort __ovld __cnfn convert_ushort_sat_rte(long);
+ushort __ovld __cnfn convert_ushort_rtz(long);
+ushort __ovld __cnfn convert_ushort_sat_rtz(long);
+ushort __ovld __cnfn convert_ushort_rtp(long);
+ushort __ovld __cnfn convert_ushort_sat_rtp(long);
+ushort __ovld __cnfn convert_ushort_rtn(long);
+ushort __ovld __cnfn convert_ushort_sat_rtn(long);
+ushort __ovld __cnfn convert_ushort(long);
+ushort __ovld __cnfn convert_ushort_sat(long);
+ushort __ovld __cnfn convert_ushort_rte(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rte(ulong);
+ushort __ovld __cnfn convert_ushort_rtz(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rtz(ulong);
+ushort __ovld __cnfn convert_ushort_rtp(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rtp(ulong);
+ushort __ovld __cnfn convert_ushort_rtn(ulong);
+ushort __ovld __cnfn convert_ushort_sat_rtn(ulong);
+ushort __ovld __cnfn convert_ushort(ulong);
+ushort __ovld __cnfn convert_ushort_sat(ulong);
+ushort __ovld __cnfn convert_ushort_rte(float);
+ushort __ovld __cnfn convert_ushort_sat_rte(float);
+ushort __ovld __cnfn convert_ushort_rtz(float);
+ushort __ovld __cnfn convert_ushort_sat_rtz(float);
+ushort __ovld __cnfn convert_ushort_rtp(float);
+ushort __ovld __cnfn convert_ushort_sat_rtp(float);
+ushort __ovld __cnfn convert_ushort_rtn(float);
+ushort __ovld __cnfn convert_ushort_sat_rtn(float);
+ushort __ovld __cnfn convert_ushort(float);
+ushort __ovld __cnfn convert_ushort_sat(float);
+int __ovld __cnfn convert_int_rte(char);
+int __ovld __cnfn convert_int_sat_rte(char);
+int __ovld __cnfn convert_int_rtz(char);
+int __ovld __cnfn convert_int_sat_rtz(char);
+int __ovld __cnfn convert_int_rtp(char);
+int __ovld __cnfn convert_int_sat_rtp(char);
+int __ovld __cnfn convert_int_rtn(char);
+int __ovld __cnfn convert_int_sat_rtn(char);
+int __ovld __cnfn convert_int(char);
+int __ovld __cnfn convert_int_sat(char);
+int __ovld __cnfn convert_int_rte(uchar);
+int __ovld __cnfn convert_int_sat_rte(uchar);
+int __ovld __cnfn convert_int_rtz(uchar);
+int __ovld __cnfn convert_int_sat_rtz(uchar);
+int __ovld __cnfn convert_int_rtp(uchar);
+int __ovld __cnfn convert_int_sat_rtp(uchar);
+int __ovld __cnfn convert_int_rtn(uchar);
+int __ovld __cnfn convert_int_sat_rtn(uchar);
+int __ovld __cnfn convert_int(uchar);
+int __ovld __cnfn convert_int_sat(uchar);
+int __ovld __cnfn convert_int_rte(short);
+int __ovld __cnfn convert_int_sat_rte(short);
+int __ovld __cnfn convert_int_rtz(short);
+int __ovld __cnfn convert_int_sat_rtz(short);
+int __ovld __cnfn convert_int_rtp(short);
+int __ovld __cnfn convert_int_sat_rtp(short);
+int __ovld __cnfn convert_int_rtn(short);
+int __ovld __cnfn convert_int_sat_rtn(short);
+int __ovld __cnfn convert_int(short);
+int __ovld __cnfn convert_int_sat(short);
+int __ovld __cnfn convert_int_rte(ushort);
+int __ovld __cnfn convert_int_sat_rte(ushort);
+int __ovld __cnfn convert_int_rtz(ushort);
+int __ovld __cnfn convert_int_sat_rtz(ushort);
+int __ovld __cnfn convert_int_rtp(ushort);
+int __ovld __cnfn convert_int_sat_rtp(ushort);
+int __ovld __cnfn convert_int_rtn(ushort);
+int __ovld __cnfn convert_int_sat_rtn(ushort);
+int __ovld __cnfn convert_int(ushort);
+int __ovld __cnfn convert_int_sat(ushort);
+int __ovld __cnfn convert_int_rte(int);
+int __ovld __cnfn convert_int_sat_rte(int);
+int __ovld __cnfn convert_int_rtz(int);
+int __ovld __cnfn convert_int_sat_rtz(int);
+int __ovld __cnfn convert_int_rtp(int);
+int __ovld __cnfn convert_int_sat_rtp(int);
+int __ovld __cnfn convert_int_rtn(int);
+int __ovld __cnfn convert_int_sat_rtn(int);
+int __ovld __cnfn convert_int(int);
+int __ovld __cnfn convert_int_sat(int);
+int __ovld __cnfn convert_int_rte(uint);
+int __ovld __cnfn convert_int_sat_rte(uint);
+int __ovld __cnfn convert_int_rtz(uint);
+int __ovld __cnfn convert_int_sat_rtz(uint);
+int __ovld __cnfn convert_int_rtp(uint);
+int __ovld __cnfn convert_int_sat_rtp(uint);
+int __ovld __cnfn convert_int_rtn(uint);
+int __ovld __cnfn convert_int_sat_rtn(uint);
+int __ovld __cnfn convert_int(uint);
+int __ovld __cnfn convert_int_sat(uint);
+int __ovld __cnfn convert_int_rte(long);
+int __ovld __cnfn convert_int_sat_rte(long);
+int __ovld __cnfn convert_int_rtz(long);
+int __ovld __cnfn convert_int_sat_rtz(long);
+int __ovld __cnfn convert_int_rtp(long);
+int __ovld __cnfn convert_int_sat_rtp(long);
+int __ovld __cnfn convert_int_rtn(long);
+int __ovld __cnfn convert_int_sat_rtn(long);
+int __ovld __cnfn convert_int(long);
+int __ovld __cnfn convert_int_sat(long);
+int __ovld __cnfn convert_int_rte(ulong);
+int __ovld __cnfn convert_int_sat_rte(ulong);
+int __ovld __cnfn convert_int_rtz(ulong);
+int __ovld __cnfn convert_int_sat_rtz(ulong);
+int __ovld __cnfn convert_int_rtp(ulong);
+int __ovld __cnfn convert_int_sat_rtp(ulong);
+int __ovld __cnfn convert_int_rtn(ulong);
+int __ovld __cnfn convert_int_sat_rtn(ulong);
+int __ovld __cnfn convert_int(ulong);
+int __ovld __cnfn convert_int_sat(ulong);
+int __ovld __cnfn convert_int_rte(float);
+int __ovld __cnfn convert_int_sat_rte(float);
+int __ovld __cnfn convert_int_rtz(float);
+int __ovld __cnfn convert_int_sat_rtz(float);
+int __ovld __cnfn convert_int_rtp(float);
+int __ovld __cnfn convert_int_sat_rtp(float);
+int __ovld __cnfn convert_int_rtn(float);
+int __ovld __cnfn convert_int_sat_rtn(float);
+int __ovld __cnfn convert_int(float);
+int __ovld __cnfn convert_int_sat(float);
+uint __ovld __cnfn convert_uint_rte(char);
+uint __ovld __cnfn convert_uint_sat_rte(char);
+uint __ovld __cnfn convert_uint_rtz(char);
+uint __ovld __cnfn convert_uint_sat_rtz(char);
+uint __ovld __cnfn convert_uint_rtp(char);
+uint __ovld __cnfn convert_uint_sat_rtp(char);
+uint __ovld __cnfn convert_uint_rtn(char);
+uint __ovld __cnfn convert_uint_sat_rtn(char);
+uint __ovld __cnfn convert_uint(char);
+uint __ovld __cnfn convert_uint_sat(char);
+uint __ovld __cnfn convert_uint_rte(uchar);
+uint __ovld __cnfn convert_uint_sat_rte(uchar);
+uint __ovld __cnfn convert_uint_rtz(uchar);
+uint __ovld __cnfn convert_uint_sat_rtz(uchar);
+uint __ovld __cnfn convert_uint_rtp(uchar);
+uint __ovld __cnfn convert_uint_sat_rtp(uchar);
+uint __ovld __cnfn convert_uint_rtn(uchar);
+uint __ovld __cnfn convert_uint_sat_rtn(uchar);
+uint __ovld __cnfn convert_uint(uchar);
+uint __ovld __cnfn convert_uint_sat(uchar);
+uint __ovld __cnfn convert_uint_rte(short);
+uint __ovld __cnfn convert_uint_sat_rte(short);
+uint __ovld __cnfn convert_uint_rtz(short);
+uint __ovld __cnfn convert_uint_sat_rtz(short);
+uint __ovld __cnfn convert_uint_rtp(short);
+uint __ovld __cnfn convert_uint_sat_rtp(short);
+uint __ovld __cnfn convert_uint_rtn(short);
+uint __ovld __cnfn convert_uint_sat_rtn(short);
+uint __ovld __cnfn convert_uint(short);
+uint __ovld __cnfn convert_uint_sat(short);
+uint __ovld __cnfn convert_uint_rte(ushort);
+uint __ovld __cnfn convert_uint_sat_rte(ushort);
+uint __ovld __cnfn convert_uint_rtz(ushort);
+uint __ovld __cnfn convert_uint_sat_rtz(ushort);
+uint __ovld __cnfn convert_uint_rtp(ushort);
+uint __ovld __cnfn convert_uint_sat_rtp(ushort);
+uint __ovld __cnfn convert_uint_rtn(ushort);
+uint __ovld __cnfn convert_uint_sat_rtn(ushort);
+uint __ovld __cnfn convert_uint(ushort);
+uint __ovld __cnfn convert_uint_sat(ushort);
+uint __ovld __cnfn convert_uint_rte(int);
+uint __ovld __cnfn convert_uint_sat_rte(int);
+uint __ovld __cnfn convert_uint_rtz(int);
+uint __ovld __cnfn convert_uint_sat_rtz(int);
+uint __ovld __cnfn convert_uint_rtp(int);
+uint __ovld __cnfn convert_uint_sat_rtp(int);
+uint __ovld __cnfn convert_uint_rtn(int);
+uint __ovld __cnfn convert_uint_sat_rtn(int);
+uint __ovld __cnfn convert_uint(int);
+uint __ovld __cnfn convert_uint_sat(int);
+uint __ovld __cnfn convert_uint_rte(uint);
+uint __ovld __cnfn convert_uint_sat_rte(uint);
+uint __ovld __cnfn convert_uint_rtz(uint);
+uint __ovld __cnfn convert_uint_sat_rtz(uint);
+uint __ovld __cnfn convert_uint_rtp(uint);
+uint __ovld __cnfn convert_uint_sat_rtp(uint);
+uint __ovld __cnfn convert_uint_rtn(uint);
+uint __ovld __cnfn convert_uint_sat_rtn(uint);
+uint __ovld __cnfn convert_uint(uint);
+uint __ovld __cnfn convert_uint_sat(uint);
+uint __ovld __cnfn convert_uint_rte(long);
+uint __ovld __cnfn convert_uint_sat_rte(long);
+uint __ovld __cnfn convert_uint_rtz(long);
+uint __ovld __cnfn convert_uint_sat_rtz(long);
+uint __ovld __cnfn convert_uint_rtp(long);
+uint __ovld __cnfn convert_uint_sat_rtp(long);
+uint __ovld __cnfn convert_uint_rtn(long);
+uint __ovld __cnfn convert_uint_sat_rtn(long);
+uint __ovld __cnfn convert_uint(long);
+uint __ovld __cnfn convert_uint_sat(long);
+uint __ovld __cnfn convert_uint_rte(ulong);
+uint __ovld __cnfn convert_uint_sat_rte(ulong);
+uint __ovld __cnfn convert_uint_rtz(ulong);
+uint __ovld __cnfn convert_uint_sat_rtz(ulong);
+uint __ovld __cnfn convert_uint_rtp(ulong);
+uint __ovld __cnfn convert_uint_sat_rtp(ulong);
+uint __ovld __cnfn convert_uint_rtn(ulong);
+uint __ovld __cnfn convert_uint_sat_rtn(ulong);
+uint __ovld __cnfn convert_uint(ulong);
+uint __ovld __cnfn convert_uint_sat(ulong);
+uint __ovld __cnfn convert_uint_rte(float);
+uint __ovld __cnfn convert_uint_sat_rte(float);
+uint __ovld __cnfn convert_uint_rtz(float);
+uint __ovld __cnfn convert_uint_sat_rtz(float);
+uint __ovld __cnfn convert_uint_rtp(float);
+uint __ovld __cnfn convert_uint_sat_rtp(float);
+uint __ovld __cnfn convert_uint_rtn(float);
+uint __ovld __cnfn convert_uint_sat_rtn(float);
+uint __ovld __cnfn convert_uint(float);
+uint __ovld __cnfn convert_uint_sat(float);
+long __ovld __cnfn convert_long_rte(char);
+long __ovld __cnfn convert_long_sat_rte(char);
+long __ovld __cnfn convert_long_rtz(char);
+long __ovld __cnfn convert_long_sat_rtz(char);
+long __ovld __cnfn convert_long_rtp(char);
+long __ovld __cnfn convert_long_sat_rtp(char);
+long __ovld __cnfn convert_long_rtn(char);
+long __ovld __cnfn convert_long_sat_rtn(char);
+long __ovld __cnfn convert_long(char);
+long __ovld __cnfn convert_long_sat(char);
+long __ovld __cnfn convert_long_rte(uchar);
+long __ovld __cnfn convert_long_sat_rte(uchar);
+long __ovld __cnfn convert_long_rtz(uchar);
+long __ovld __cnfn convert_long_sat_rtz(uchar);
+long __ovld __cnfn convert_long_rtp(uchar);
+long __ovld __cnfn convert_long_sat_rtp(uchar);
+long __ovld __cnfn convert_long_rtn(uchar);
+long __ovld __cnfn convert_long_sat_rtn(uchar);
+long __ovld __cnfn convert_long(uchar);
+long __ovld __cnfn convert_long_sat(uchar);
+long __ovld __cnfn convert_long_rte(short);
+long __ovld __cnfn convert_long_sat_rte(short);
+long __ovld __cnfn convert_long_rtz(short);
+long __ovld __cnfn convert_long_sat_rtz(short);
+long __ovld __cnfn convert_long_rtp(short);
+long __ovld __cnfn convert_long_sat_rtp(short);
+long __ovld __cnfn convert_long_rtn(short);
+long __ovld __cnfn convert_long_sat_rtn(short);
+long __ovld __cnfn convert_long(short);
+long __ovld __cnfn convert_long_sat(short);
+long __ovld __cnfn convert_long_rte(ushort);
+long __ovld __cnfn convert_long_sat_rte(ushort);
+long __ovld __cnfn convert_long_rtz(ushort);
+long __ovld __cnfn convert_long_sat_rtz(ushort);
+long __ovld __cnfn convert_long_rtp(ushort);
+long __ovld __cnfn convert_long_sat_rtp(ushort);
+long __ovld __cnfn convert_long_rtn(ushort);
+long __ovld __cnfn convert_long_sat_rtn(ushort);
+long __ovld __cnfn convert_long(ushort);
+long __ovld __cnfn convert_long_sat(ushort);
+long __ovld __cnfn convert_long_rte(int);
+long __ovld __cnfn convert_long_sat_rte(int);
+long __ovld __cnfn convert_long_rtz(int);
+long __ovld __cnfn convert_long_sat_rtz(int);
+long __ovld __cnfn convert_long_rtp(int);
+long __ovld __cnfn convert_long_sat_rtp(int);
+long __ovld __cnfn convert_long_rtn(int);
+long __ovld __cnfn convert_long_sat_rtn(int);
+long __ovld __cnfn convert_long(int);
+long __ovld __cnfn convert_long_sat(int);
+long __ovld __cnfn convert_long_rte(uint);
+long __ovld __cnfn convert_long_sat_rte(uint);
+long __ovld __cnfn convert_long_rtz(uint);
+long __ovld __cnfn convert_long_sat_rtz(uint);
+long __ovld __cnfn convert_long_rtp(uint);
+long __ovld __cnfn convert_long_sat_rtp(uint);
+long __ovld __cnfn convert_long_rtn(uint);
+long __ovld __cnfn convert_long_sat_rtn(uint);
+long __ovld __cnfn convert_long(uint);
+long __ovld __cnfn convert_long_sat(uint);
+long __ovld __cnfn convert_long_rte(long);
+long __ovld __cnfn convert_long_sat_rte(long);
+long __ovld __cnfn convert_long_rtz(long);
+long __ovld __cnfn convert_long_sat_rtz(long);
+long __ovld __cnfn convert_long_rtp(long);
+long __ovld __cnfn convert_long_sat_rtp(long);
+long __ovld __cnfn convert_long_rtn(long);
+long __ovld __cnfn convert_long_sat_rtn(long);
+long __ovld __cnfn convert_long(long);
+long __ovld __cnfn convert_long_sat(long);
+long __ovld __cnfn convert_long_rte(ulong);
+long __ovld __cnfn convert_long_sat_rte(ulong);
+long __ovld __cnfn convert_long_rtz(ulong);
+long __ovld __cnfn convert_long_sat_rtz(ulong);
+long __ovld __cnfn convert_long_rtp(ulong);
+long __ovld __cnfn convert_long_sat_rtp(ulong);
+long __ovld __cnfn convert_long_rtn(ulong);
+long __ovld __cnfn convert_long_sat_rtn(ulong);
+long __ovld __cnfn convert_long(ulong);
+long __ovld __cnfn convert_long_sat(ulong);
+long __ovld __cnfn convert_long_rte(float);
+long __ovld __cnfn convert_long_sat_rte(float);
+long __ovld __cnfn convert_long_rtz(float);
+long __ovld __cnfn convert_long_sat_rtz(float);
+long __ovld __cnfn convert_long_rtp(float);
+long __ovld __cnfn convert_long_sat_rtp(float);
+long __ovld __cnfn convert_long_rtn(float);
+long __ovld __cnfn convert_long_sat_rtn(float);
+long __ovld __cnfn convert_long(float);
+long __ovld __cnfn convert_long_sat(float);
+ulong __ovld __cnfn convert_ulong_rte(char);
+ulong __ovld __cnfn convert_ulong_sat_rte(char);
+ulong __ovld __cnfn convert_ulong_rtz(char);
+ulong __ovld __cnfn convert_ulong_sat_rtz(char);
+ulong __ovld __cnfn convert_ulong_rtp(char);
+ulong __ovld __cnfn convert_ulong_sat_rtp(char);
+ulong __ovld __cnfn convert_ulong_rtn(char);
+ulong __ovld __cnfn convert_ulong_sat_rtn(char);
+ulong __ovld __cnfn convert_ulong(char);
+ulong __ovld __cnfn convert_ulong_sat(char);
+ulong __ovld __cnfn convert_ulong_rte(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rte(uchar);
+ulong __ovld __cnfn convert_ulong_rtz(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rtz(uchar);
+ulong __ovld __cnfn convert_ulong_rtp(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rtp(uchar);
+ulong __ovld __cnfn convert_ulong_rtn(uchar);
+ulong __ovld __cnfn convert_ulong_sat_rtn(uchar);
+ulong __ovld __cnfn convert_ulong(uchar);
+ulong __ovld __cnfn convert_ulong_sat(uchar);
+ulong __ovld __cnfn convert_ulong_rte(short);
+ulong __ovld __cnfn convert_ulong_sat_rte(short);
+ulong __ovld __cnfn convert_ulong_rtz(short);
+ulong __ovld __cnfn convert_ulong_sat_rtz(short);
+ulong __ovld __cnfn convert_ulong_rtp(short);
+ulong __ovld __cnfn convert_ulong_sat_rtp(short);
+ulong __ovld __cnfn convert_ulong_rtn(short);
+ulong __ovld __cnfn convert_ulong_sat_rtn(short);
+ulong __ovld __cnfn convert_ulong(short);
+ulong __ovld __cnfn convert_ulong_sat(short);
+ulong __ovld __cnfn convert_ulong_rte(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rte(ushort);
+ulong __ovld __cnfn convert_ulong_rtz(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rtz(ushort);
+ulong __ovld __cnfn convert_ulong_rtp(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rtp(ushort);
+ulong __ovld __cnfn convert_ulong_rtn(ushort);
+ulong __ovld __cnfn convert_ulong_sat_rtn(ushort);
+ulong __ovld __cnfn convert_ulong(ushort);
+ulong __ovld __cnfn convert_ulong_sat(ushort);
+ulong __ovld __cnfn convert_ulong_rte(int);
+ulong __ovld __cnfn convert_ulong_sat_rte(int);
+ulong __ovld __cnfn convert_ulong_rtz(int);
+ulong __ovld __cnfn convert_ulong_sat_rtz(int);
+ulong __ovld __cnfn convert_ulong_rtp(int);
+ulong __ovld __cnfn convert_ulong_sat_rtp(int);
+ulong __ovld __cnfn convert_ulong_rtn(int);
+ulong __ovld __cnfn convert_ulong_sat_rtn(int);
+ulong __ovld __cnfn convert_ulong(int);
+ulong __ovld __cnfn convert_ulong_sat(int);
+ulong __ovld __cnfn convert_ulong_rte(uint);
+ulong __ovld __cnfn convert_ulong_sat_rte(uint);
+ulong __ovld __cnfn convert_ulong_rtz(uint);
+ulong __ovld __cnfn convert_ulong_sat_rtz(uint);
+ulong __ovld __cnfn convert_ulong_rtp(uint);
+ulong __ovld __cnfn convert_ulong_sat_rtp(uint);
+ulong __ovld __cnfn convert_ulong_rtn(uint);
+ulong __ovld __cnfn convert_ulong_sat_rtn(uint);
+ulong __ovld __cnfn convert_ulong(uint);
+ulong __ovld __cnfn convert_ulong_sat(uint);
+ulong __ovld __cnfn convert_ulong_rte(long);
+ulong __ovld __cnfn convert_ulong_sat_rte(long);
+ulong __ovld __cnfn convert_ulong_rtz(long);
+ulong __ovld __cnfn convert_ulong_sat_rtz(long);
+ulong __ovld __cnfn convert_ulong_rtp(long);
+ulong __ovld __cnfn convert_ulong_sat_rtp(long);
+ulong __ovld __cnfn convert_ulong_rtn(long);
+ulong __ovld __cnfn convert_ulong_sat_rtn(long);
+ulong __ovld __cnfn convert_ulong(long);
+ulong __ovld __cnfn convert_ulong_sat(long);
+ulong __ovld __cnfn convert_ulong_rte(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rte(ulong);
+ulong __ovld __cnfn convert_ulong_rtz(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rtz(ulong);
+ulong __ovld __cnfn convert_ulong_rtp(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rtp(ulong);
+ulong __ovld __cnfn convert_ulong_rtn(ulong);
+ulong __ovld __cnfn convert_ulong_sat_rtn(ulong);
+ulong __ovld __cnfn convert_ulong(ulong);
+ulong __ovld __cnfn convert_ulong_sat(ulong);
+ulong __ovld __cnfn convert_ulong_rte(float);
+ulong __ovld __cnfn convert_ulong_sat_rte(float);
+ulong __ovld __cnfn convert_ulong_rtz(float);
+ulong __ovld __cnfn convert_ulong_sat_rtz(float);
+ulong __ovld __cnfn convert_ulong_rtp(float);
+ulong __ovld __cnfn convert_ulong_sat_rtp(float);
+ulong __ovld __cnfn convert_ulong_rtn(float);
+ulong __ovld __cnfn convert_ulong_sat_rtn(float);
+ulong __ovld __cnfn convert_ulong(float);
+ulong __ovld __cnfn convert_ulong_sat(float);
+float __ovld __cnfn convert_float_rte(char);
+float __ovld __cnfn convert_float_rtz(char);
+float __ovld __cnfn convert_float_rtp(char);
+float __ovld __cnfn convert_float_rtn(char);
+float __ovld __cnfn convert_float(char);
+float __ovld __cnfn convert_float_rte(uchar);
+float __ovld __cnfn convert_float_rtz(uchar);
+float __ovld __cnfn convert_float_rtp(uchar);
+float __ovld __cnfn convert_float_rtn(uchar);
+float __ovld __cnfn convert_float(uchar);
+float __ovld __cnfn convert_float_rte(short);
+float __ovld __cnfn convert_float_rtz(short);
+float __ovld __cnfn convert_float_rtp(short);
+float __ovld __cnfn convert_float_rtn(short);
+float __ovld __cnfn convert_float(short);
+float __ovld __cnfn convert_float_rte(ushort);
+float __ovld __cnfn convert_float_rtz(ushort);
+float __ovld __cnfn convert_float_rtp(ushort);
+float __ovld __cnfn convert_float_rtn(ushort);
+float __ovld __cnfn convert_float(ushort);
+float __ovld __cnfn convert_float_rte(int);
+float __ovld __cnfn convert_float_rtz(int);
+float __ovld __cnfn convert_float_rtp(int);
+float __ovld __cnfn convert_float_rtn(int);
+float __ovld __cnfn convert_float(int);
+float __ovld __cnfn convert_float_rte(uint);
+float __ovld __cnfn convert_float_rtz(uint);
+float __ovld __cnfn convert_float_rtp(uint);
+float __ovld __cnfn convert_float_rtn(uint);
+float __ovld __cnfn convert_float(uint);
+float __ovld __cnfn convert_float_rte(long);
+float __ovld __cnfn convert_float_rtz(long);
+float __ovld __cnfn convert_float_rtp(long);
+float __ovld __cnfn convert_float_rtn(long);
+float __ovld __cnfn convert_float(long);
+float __ovld __cnfn convert_float_rte(ulong);
+float __ovld __cnfn convert_float_rtz(ulong);
+float __ovld __cnfn convert_float_rtp(ulong);
+float __ovld __cnfn convert_float_rtn(ulong);
+float __ovld __cnfn convert_float(ulong);
+float __ovld __cnfn convert_float_rte(float);
+float __ovld __cnfn convert_float_rtz(float);
+float __ovld __cnfn convert_float_rtp(float);
+float __ovld __cnfn convert_float_rtn(float);
+float __ovld __cnfn convert_float(float);
+char2 __ovld __cnfn convert_char2_rte(char2);
+char2 __ovld __cnfn convert_char2_sat_rte(char2);
+char2 __ovld __cnfn convert_char2_rtz(char2);
+char2 __ovld __cnfn convert_char2_sat_rtz(char2);
+char2 __ovld __cnfn convert_char2_rtp(char2);
+char2 __ovld __cnfn convert_char2_sat_rtp(char2);
+char2 __ovld __cnfn convert_char2_rtn(char2);
+char2 __ovld __cnfn convert_char2_sat_rtn(char2);
+char2 __ovld __cnfn convert_char2(char2);
+char2 __ovld __cnfn convert_char2_sat(char2);
+char2 __ovld __cnfn convert_char2_rte(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rte(uchar2);
+char2 __ovld __cnfn convert_char2_rtz(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rtz(uchar2);
+char2 __ovld __cnfn convert_char2_rtp(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rtp(uchar2);
+char2 __ovld __cnfn convert_char2_rtn(uchar2);
+char2 __ovld __cnfn convert_char2_sat_rtn(uchar2);
+char2 __ovld __cnfn convert_char2(uchar2);
+char2 __ovld __cnfn convert_char2_sat(uchar2);
+char2 __ovld __cnfn convert_char2_rte(short2);
+char2 __ovld __cnfn convert_char2_sat_rte(short2);
+char2 __ovld __cnfn convert_char2_rtz(short2);
+char2 __ovld __cnfn convert_char2_sat_rtz(short2);
+char2 __ovld __cnfn convert_char2_rtp(short2);
+char2 __ovld __cnfn convert_char2_sat_rtp(short2);
+char2 __ovld __cnfn convert_char2_rtn(short2);
+char2 __ovld __cnfn convert_char2_sat_rtn(short2);
+char2 __ovld __cnfn convert_char2(short2);
+char2 __ovld __cnfn convert_char2_sat(short2);
+char2 __ovld __cnfn convert_char2_rte(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rte(ushort2);
+char2 __ovld __cnfn convert_char2_rtz(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rtz(ushort2);
+char2 __ovld __cnfn convert_char2_rtp(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rtp(ushort2);
+char2 __ovld __cnfn convert_char2_rtn(ushort2);
+char2 __ovld __cnfn convert_char2_sat_rtn(ushort2);
+char2 __ovld __cnfn convert_char2(ushort2);
+char2 __ovld __cnfn convert_char2_sat(ushort2);
+char2 __ovld __cnfn convert_char2_rte(int2);
+char2 __ovld __cnfn convert_char2_sat_rte(int2);
+char2 __ovld __cnfn convert_char2_rtz(int2);
+char2 __ovld __cnfn convert_char2_sat_rtz(int2);
+char2 __ovld __cnfn convert_char2_rtp(int2);
+char2 __ovld __cnfn convert_char2_sat_rtp(int2);
+char2 __ovld __cnfn convert_char2_rtn(int2);
+char2 __ovld __cnfn convert_char2_sat_rtn(int2);
+char2 __ovld __cnfn convert_char2(int2);
+char2 __ovld __cnfn convert_char2_sat(int2);
+char2 __ovld __cnfn convert_char2_rte(uint2);
+char2 __ovld __cnfn convert_char2_sat_rte(uint2);
+char2 __ovld __cnfn convert_char2_rtz(uint2);
+char2 __ovld __cnfn convert_char2_sat_rtz(uint2);
+char2 __ovld __cnfn convert_char2_rtp(uint2);
+char2 __ovld __cnfn convert_char2_sat_rtp(uint2);
+char2 __ovld __cnfn convert_char2_rtn(uint2);
+char2 __ovld __cnfn convert_char2_sat_rtn(uint2);
+char2 __ovld __cnfn convert_char2(uint2);
+char2 __ovld __cnfn convert_char2_sat(uint2);
+char2 __ovld __cnfn convert_char2_rte(long2);
+char2 __ovld __cnfn convert_char2_sat_rte(long2);
+char2 __ovld __cnfn convert_char2_rtz(long2);
+char2 __ovld __cnfn convert_char2_sat_rtz(long2);
+char2 __ovld __cnfn convert_char2_rtp(long2);
+char2 __ovld __cnfn convert_char2_sat_rtp(long2);
+char2 __ovld __cnfn convert_char2_rtn(long2);
+char2 __ovld __cnfn convert_char2_sat_rtn(long2);
+char2 __ovld __cnfn convert_char2(long2);
+char2 __ovld __cnfn convert_char2_sat(long2);
+char2 __ovld __cnfn convert_char2_rte(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rte(ulong2);
+char2 __ovld __cnfn convert_char2_rtz(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rtz(ulong2);
+char2 __ovld __cnfn convert_char2_rtp(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rtp(ulong2);
+char2 __ovld __cnfn convert_char2_rtn(ulong2);
+char2 __ovld __cnfn convert_char2_sat_rtn(ulong2);
+char2 __ovld __cnfn convert_char2(ulong2);
+char2 __ovld __cnfn convert_char2_sat(ulong2);
+char2 __ovld __cnfn convert_char2_rte(float2);
+char2 __ovld __cnfn convert_char2_sat_rte(float2);
+char2 __ovld __cnfn convert_char2_rtz(float2);
+char2 __ovld __cnfn convert_char2_sat_rtz(float2);
+char2 __ovld __cnfn convert_char2_rtp(float2);
+char2 __ovld __cnfn convert_char2_sat_rtp(float2);
+char2 __ovld __cnfn convert_char2_rtn(float2);
+char2 __ovld __cnfn convert_char2_sat_rtn(float2);
+char2 __ovld __cnfn convert_char2(float2);
+char2 __ovld __cnfn convert_char2_sat(float2);
+uchar2 __ovld __cnfn convert_uchar2_rte(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(char2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(char2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(char2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(char2);
+uchar2 __ovld __cnfn convert_uchar2(char2);
+uchar2 __ovld __cnfn convert_uchar2_sat(char2);
+uchar2 __ovld __cnfn convert_uchar2_rte(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(uchar2);
+uchar2 __ovld __cnfn convert_uchar2(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_sat(uchar2);
+uchar2 __ovld __cnfn convert_uchar2_rte(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(short2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(short2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(short2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(short2);
+uchar2 __ovld __cnfn convert_uchar2(short2);
+uchar2 __ovld __cnfn convert_uchar2_sat(short2);
+uchar2 __ovld __cnfn convert_uchar2_rte(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(ushort2);
+uchar2 __ovld __cnfn convert_uchar2(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_sat(ushort2);
+uchar2 __ovld __cnfn convert_uchar2_rte(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(int2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(int2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(int2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(int2);
+uchar2 __ovld __cnfn convert_uchar2(int2);
+uchar2 __ovld __cnfn convert_uchar2_sat(int2);
+uchar2 __ovld __cnfn convert_uchar2_rte(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(uint2);
+uchar2 __ovld __cnfn convert_uchar2(uint2);
+uchar2 __ovld __cnfn convert_uchar2_sat(uint2);
+uchar2 __ovld __cnfn convert_uchar2_rte(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(long2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(long2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(long2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(long2);
+uchar2 __ovld __cnfn convert_uchar2(long2);
+uchar2 __ovld __cnfn convert_uchar2_sat(long2);
+uchar2 __ovld __cnfn convert_uchar2_rte(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(ulong2);
+uchar2 __ovld __cnfn convert_uchar2(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_sat(ulong2);
+uchar2 __ovld __cnfn convert_uchar2_rte(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(float2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(float2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(float2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(float2);
+uchar2 __ovld __cnfn convert_uchar2(float2);
+uchar2 __ovld __cnfn convert_uchar2_sat(float2);
+short2 __ovld __cnfn convert_short2_rte(char2);
+short2 __ovld __cnfn convert_short2_sat_rte(char2);
+short2 __ovld __cnfn convert_short2_rtz(char2);
+short2 __ovld __cnfn convert_short2_sat_rtz(char2);
+short2 __ovld __cnfn convert_short2_rtp(char2);
+short2 __ovld __cnfn convert_short2_sat_rtp(char2);
+short2 __ovld __cnfn convert_short2_rtn(char2);
+short2 __ovld __cnfn convert_short2_sat_rtn(char2);
+short2 __ovld __cnfn convert_short2(char2);
+short2 __ovld __cnfn convert_short2_sat(char2);
+short2 __ovld __cnfn convert_short2_rte(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rte(uchar2);
+short2 __ovld __cnfn convert_short2_rtz(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rtz(uchar2);
+short2 __ovld __cnfn convert_short2_rtp(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rtp(uchar2);
+short2 __ovld __cnfn convert_short2_rtn(uchar2);
+short2 __ovld __cnfn convert_short2_sat_rtn(uchar2);
+short2 __ovld __cnfn convert_short2(uchar2);
+short2 __ovld __cnfn convert_short2_sat(uchar2);
+short2 __ovld __cnfn convert_short2_rte(short2);
+short2 __ovld __cnfn convert_short2_sat_rte(short2);
+short2 __ovld __cnfn convert_short2_rtz(short2);
+short2 __ovld __cnfn convert_short2_sat_rtz(short2);
+short2 __ovld __cnfn convert_short2_rtp(short2);
+short2 __ovld __cnfn convert_short2_sat_rtp(short2);
+short2 __ovld __cnfn convert_short2_rtn(short2);
+short2 __ovld __cnfn convert_short2_sat_rtn(short2);
+short2 __ovld __cnfn convert_short2(short2);
+short2 __ovld __cnfn convert_short2_sat(short2);
+short2 __ovld __cnfn convert_short2_rte(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rte(ushort2);
+short2 __ovld __cnfn convert_short2_rtz(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rtz(ushort2);
+short2 __ovld __cnfn convert_short2_rtp(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rtp(ushort2);
+short2 __ovld __cnfn convert_short2_rtn(ushort2);
+short2 __ovld __cnfn convert_short2_sat_rtn(ushort2);
+short2 __ovld __cnfn convert_short2(ushort2);
+short2 __ovld __cnfn convert_short2_sat(ushort2);
+short2 __ovld __cnfn convert_short2_rte(int2);
+short2 __ovld __cnfn convert_short2_sat_rte(int2);
+short2 __ovld __cnfn convert_short2_rtz(int2);
+short2 __ovld __cnfn convert_short2_sat_rtz(int2);
+short2 __ovld __cnfn convert_short2_rtp(int2);
+short2 __ovld __cnfn convert_short2_sat_rtp(int2);
+short2 __ovld __cnfn convert_short2_rtn(int2);
+short2 __ovld __cnfn convert_short2_sat_rtn(int2);
+short2 __ovld __cnfn convert_short2(int2);
+short2 __ovld __cnfn convert_short2_sat(int2);
+short2 __ovld __cnfn convert_short2_rte(uint2);
+short2 __ovld __cnfn convert_short2_sat_rte(uint2);
+short2 __ovld __cnfn convert_short2_rtz(uint2);
+short2 __ovld __cnfn convert_short2_sat_rtz(uint2);
+short2 __ovld __cnfn convert_short2_rtp(uint2);
+short2 __ovld __cnfn convert_short2_sat_rtp(uint2);
+short2 __ovld __cnfn convert_short2_rtn(uint2);
+short2 __ovld __cnfn convert_short2_sat_rtn(uint2);
+short2 __ovld __cnfn convert_short2(uint2);
+short2 __ovld __cnfn convert_short2_sat(uint2);
+short2 __ovld __cnfn convert_short2_rte(long2);
+short2 __ovld __cnfn convert_short2_sat_rte(long2);
+short2 __ovld __cnfn convert_short2_rtz(long2);
+short2 __ovld __cnfn convert_short2_sat_rtz(long2);
+short2 __ovld __cnfn convert_short2_rtp(long2);
+short2 __ovld __cnfn convert_short2_sat_rtp(long2);
+short2 __ovld __cnfn convert_short2_rtn(long2);
+short2 __ovld __cnfn convert_short2_sat_rtn(long2);
+short2 __ovld __cnfn convert_short2(long2);
+short2 __ovld __cnfn convert_short2_sat(long2);
+short2 __ovld __cnfn convert_short2_rte(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rte(ulong2);
+short2 __ovld __cnfn convert_short2_rtz(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rtz(ulong2);
+short2 __ovld __cnfn convert_short2_rtp(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rtp(ulong2);
+short2 __ovld __cnfn convert_short2_rtn(ulong2);
+short2 __ovld __cnfn convert_short2_sat_rtn(ulong2);
+short2 __ovld __cnfn convert_short2(ulong2);
+short2 __ovld __cnfn convert_short2_sat(ulong2);
+short2 __ovld __cnfn convert_short2_rte(float2);
+short2 __ovld __cnfn convert_short2_sat_rte(float2);
+short2 __ovld __cnfn convert_short2_rtz(float2);
+short2 __ovld __cnfn convert_short2_sat_rtz(float2);
+short2 __ovld __cnfn convert_short2_rtp(float2);
+short2 __ovld __cnfn convert_short2_sat_rtp(float2);
+short2 __ovld __cnfn convert_short2_rtn(float2);
+short2 __ovld __cnfn convert_short2_sat_rtn(float2);
+short2 __ovld __cnfn convert_short2(float2);
+short2 __ovld __cnfn convert_short2_sat(float2);
+ushort2 __ovld __cnfn convert_ushort2_rte(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(char2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(char2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(char2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(char2);
+ushort2 __ovld __cnfn convert_ushort2(char2);
+ushort2 __ovld __cnfn convert_ushort2_sat(char2);
+ushort2 __ovld __cnfn convert_ushort2_rte(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(uchar2);
+ushort2 __ovld __cnfn convert_ushort2(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_sat(uchar2);
+ushort2 __ovld __cnfn convert_ushort2_rte(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(short2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(short2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(short2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(short2);
+ushort2 __ovld __cnfn convert_ushort2(short2);
+ushort2 __ovld __cnfn convert_ushort2_sat(short2);
+ushort2 __ovld __cnfn convert_ushort2_rte(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(ushort2);
+ushort2 __ovld __cnfn convert_ushort2(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_sat(ushort2);
+ushort2 __ovld __cnfn convert_ushort2_rte(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(int2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(int2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(int2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(int2);
+ushort2 __ovld __cnfn convert_ushort2(int2);
+ushort2 __ovld __cnfn convert_ushort2_sat(int2);
+ushort2 __ovld __cnfn convert_ushort2_rte(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(uint2);
+ushort2 __ovld __cnfn convert_ushort2(uint2);
+ushort2 __ovld __cnfn convert_ushort2_sat(uint2);
+ushort2 __ovld __cnfn convert_ushort2_rte(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(long2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(long2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(long2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(long2);
+ushort2 __ovld __cnfn convert_ushort2(long2);
+ushort2 __ovld __cnfn convert_ushort2_sat(long2);
+ushort2 __ovld __cnfn convert_ushort2_rte(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(ulong2);
+ushort2 __ovld __cnfn convert_ushort2(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_sat(ulong2);
+ushort2 __ovld __cnfn convert_ushort2_rte(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(float2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(float2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(float2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(float2);
+ushort2 __ovld __cnfn convert_ushort2(float2);
+ushort2 __ovld __cnfn convert_ushort2_sat(float2);
+int2 __ovld __cnfn convert_int2_rte(char2);
+int2 __ovld __cnfn convert_int2_sat_rte(char2);
+int2 __ovld __cnfn convert_int2_rtz(char2);
+int2 __ovld __cnfn convert_int2_sat_rtz(char2);
+int2 __ovld __cnfn convert_int2_rtp(char2);
+int2 __ovld __cnfn convert_int2_sat_rtp(char2);
+int2 __ovld __cnfn convert_int2_rtn(char2);
+int2 __ovld __cnfn convert_int2_sat_rtn(char2);
+int2 __ovld __cnfn convert_int2(char2);
+int2 __ovld __cnfn convert_int2_sat(char2);
+int2 __ovld __cnfn convert_int2_rte(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rte(uchar2);
+int2 __ovld __cnfn convert_int2_rtz(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rtz(uchar2);
+int2 __ovld __cnfn convert_int2_rtp(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rtp(uchar2);
+int2 __ovld __cnfn convert_int2_rtn(uchar2);
+int2 __ovld __cnfn convert_int2_sat_rtn(uchar2);
+int2 __ovld __cnfn convert_int2(uchar2);
+int2 __ovld __cnfn convert_int2_sat(uchar2);
+int2 __ovld __cnfn convert_int2_rte(short2);
+int2 __ovld __cnfn convert_int2_sat_rte(short2);
+int2 __ovld __cnfn convert_int2_rtz(short2);
+int2 __ovld __cnfn convert_int2_sat_rtz(short2);
+int2 __ovld __cnfn convert_int2_rtp(short2);
+int2 __ovld __cnfn convert_int2_sat_rtp(short2);
+int2 __ovld __cnfn convert_int2_rtn(short2);
+int2 __ovld __cnfn convert_int2_sat_rtn(short2);
+int2 __ovld __cnfn convert_int2(short2);
+int2 __ovld __cnfn convert_int2_sat(short2);
+int2 __ovld __cnfn convert_int2_rte(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rte(ushort2);
+int2 __ovld __cnfn convert_int2_rtz(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rtz(ushort2);
+int2 __ovld __cnfn convert_int2_rtp(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rtp(ushort2);
+int2 __ovld __cnfn convert_int2_rtn(ushort2);
+int2 __ovld __cnfn convert_int2_sat_rtn(ushort2);
+int2 __ovld __cnfn convert_int2(ushort2);
+int2 __ovld __cnfn convert_int2_sat(ushort2);
+int2 __ovld __cnfn convert_int2_rte(int2);
+int2 __ovld __cnfn convert_int2_sat_rte(int2);
+int2 __ovld __cnfn convert_int2_rtz(int2);
+int2 __ovld __cnfn convert_int2_sat_rtz(int2);
+int2 __ovld __cnfn convert_int2_rtp(int2);
+int2 __ovld __cnfn convert_int2_sat_rtp(int2);
+int2 __ovld __cnfn convert_int2_rtn(int2);
+int2 __ovld __cnfn convert_int2_sat_rtn(int2);
+int2 __ovld __cnfn convert_int2(int2);
+int2 __ovld __cnfn convert_int2_sat(int2);
+int2 __ovld __cnfn convert_int2_rte(uint2);
+int2 __ovld __cnfn convert_int2_sat_rte(uint2);
+int2 __ovld __cnfn convert_int2_rtz(uint2);
+int2 __ovld __cnfn convert_int2_sat_rtz(uint2);
+int2 __ovld __cnfn convert_int2_rtp(uint2);
+int2 __ovld __cnfn convert_int2_sat_rtp(uint2);
+int2 __ovld __cnfn convert_int2_rtn(uint2);
+int2 __ovld __cnfn convert_int2_sat_rtn(uint2);
+int2 __ovld __cnfn convert_int2(uint2);
+int2 __ovld __cnfn convert_int2_sat(uint2);
+int2 __ovld __cnfn convert_int2_rte(long2);
+int2 __ovld __cnfn convert_int2_sat_rte(long2);
+int2 __ovld __cnfn convert_int2_rtz(long2);
+int2 __ovld __cnfn convert_int2_sat_rtz(long2);
+int2 __ovld __cnfn convert_int2_rtp(long2);
+int2 __ovld __cnfn convert_int2_sat_rtp(long2);
+int2 __ovld __cnfn convert_int2_rtn(long2);
+int2 __ovld __cnfn convert_int2_sat_rtn(long2);
+int2 __ovld __cnfn convert_int2(long2);
+int2 __ovld __cnfn convert_int2_sat(long2);
+int2 __ovld __cnfn convert_int2_rte(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rte(ulong2);
+int2 __ovld __cnfn convert_int2_rtz(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rtz(ulong2);
+int2 __ovld __cnfn convert_int2_rtp(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rtp(ulong2);
+int2 __ovld __cnfn convert_int2_rtn(ulong2);
+int2 __ovld __cnfn convert_int2_sat_rtn(ulong2);
+int2 __ovld __cnfn convert_int2(ulong2);
+int2 __ovld __cnfn convert_int2_sat(ulong2);
+int2 __ovld __cnfn convert_int2_rte(float2);
+int2 __ovld __cnfn convert_int2_sat_rte(float2);
+int2 __ovld __cnfn convert_int2_rtz(float2);
+int2 __ovld __cnfn convert_int2_sat_rtz(float2);
+int2 __ovld __cnfn convert_int2_rtp(float2);
+int2 __ovld __cnfn convert_int2_sat_rtp(float2);
+int2 __ovld __cnfn convert_int2_rtn(float2);
+int2 __ovld __cnfn convert_int2_sat_rtn(float2);
+int2 __ovld __cnfn convert_int2(float2);
+int2 __ovld __cnfn convert_int2_sat(float2);
+uint2 __ovld __cnfn convert_uint2_rte(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(char2);
+uint2 __ovld __cnfn convert_uint2_rtz(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(char2);
+uint2 __ovld __cnfn convert_uint2_rtp(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(char2);
+uint2 __ovld __cnfn convert_uint2_rtn(char2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(char2);
+uint2 __ovld __cnfn convert_uint2(char2);
+uint2 __ovld __cnfn convert_uint2_sat(char2);
+uint2 __ovld __cnfn convert_uint2_rte(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(uchar2);
+uint2 __ovld __cnfn convert_uint2_rtz(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(uchar2);
+uint2 __ovld __cnfn convert_uint2_rtp(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(uchar2);
+uint2 __ovld __cnfn convert_uint2_rtn(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(uchar2);
+uint2 __ovld __cnfn convert_uint2(uchar2);
+uint2 __ovld __cnfn convert_uint2_sat(uchar2);
+uint2 __ovld __cnfn convert_uint2_rte(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(short2);
+uint2 __ovld __cnfn convert_uint2_rtz(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(short2);
+uint2 __ovld __cnfn convert_uint2_rtp(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(short2);
+uint2 __ovld __cnfn convert_uint2_rtn(short2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(short2);
+uint2 __ovld __cnfn convert_uint2(short2);
+uint2 __ovld __cnfn convert_uint2_sat(short2);
+uint2 __ovld __cnfn convert_uint2_rte(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(ushort2);
+uint2 __ovld __cnfn convert_uint2_rtz(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(ushort2);
+uint2 __ovld __cnfn convert_uint2_rtp(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(ushort2);
+uint2 __ovld __cnfn convert_uint2_rtn(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(ushort2);
+uint2 __ovld __cnfn convert_uint2(ushort2);
+uint2 __ovld __cnfn convert_uint2_sat(ushort2);
+uint2 __ovld __cnfn convert_uint2_rte(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(int2);
+uint2 __ovld __cnfn convert_uint2_rtz(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(int2);
+uint2 __ovld __cnfn convert_uint2_rtp(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(int2);
+uint2 __ovld __cnfn convert_uint2_rtn(int2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(int2);
+uint2 __ovld __cnfn convert_uint2(int2);
+uint2 __ovld __cnfn convert_uint2_sat(int2);
+uint2 __ovld __cnfn convert_uint2_rte(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(uint2);
+uint2 __ovld __cnfn convert_uint2_rtz(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(uint2);
+uint2 __ovld __cnfn convert_uint2_rtp(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(uint2);
+uint2 __ovld __cnfn convert_uint2_rtn(uint2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(uint2);
+uint2 __ovld __cnfn convert_uint2(uint2);
+uint2 __ovld __cnfn convert_uint2_sat(uint2);
+uint2 __ovld __cnfn convert_uint2_rte(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(long2);
+uint2 __ovld __cnfn convert_uint2_rtz(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(long2);
+uint2 __ovld __cnfn convert_uint2_rtp(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(long2);
+uint2 __ovld __cnfn convert_uint2_rtn(long2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(long2);
+uint2 __ovld __cnfn convert_uint2(long2);
+uint2 __ovld __cnfn convert_uint2_sat(long2);
+uint2 __ovld __cnfn convert_uint2_rte(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(ulong2);
+uint2 __ovld __cnfn convert_uint2_rtz(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(ulong2);
+uint2 __ovld __cnfn convert_uint2_rtp(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(ulong2);
+uint2 __ovld __cnfn convert_uint2_rtn(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(ulong2);
+uint2 __ovld __cnfn convert_uint2(ulong2);
+uint2 __ovld __cnfn convert_uint2_sat(ulong2);
+uint2 __ovld __cnfn convert_uint2_rte(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(float2);
+uint2 __ovld __cnfn convert_uint2_rtz(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(float2);
+uint2 __ovld __cnfn convert_uint2_rtp(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(float2);
+uint2 __ovld __cnfn convert_uint2_rtn(float2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(float2);
+uint2 __ovld __cnfn convert_uint2(float2);
+uint2 __ovld __cnfn convert_uint2_sat(float2);
+long2 __ovld __cnfn convert_long2_rte(char2);
+long2 __ovld __cnfn convert_long2_sat_rte(char2);
+long2 __ovld __cnfn convert_long2_rtz(char2);
+long2 __ovld __cnfn convert_long2_sat_rtz(char2);
+long2 __ovld __cnfn convert_long2_rtp(char2);
+long2 __ovld __cnfn convert_long2_sat_rtp(char2);
+long2 __ovld __cnfn convert_long2_rtn(char2);
+long2 __ovld __cnfn convert_long2_sat_rtn(char2);
+long2 __ovld __cnfn convert_long2(char2);
+long2 __ovld __cnfn convert_long2_sat(char2);
+long2 __ovld __cnfn convert_long2_rte(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rte(uchar2);
+long2 __ovld __cnfn convert_long2_rtz(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rtz(uchar2);
+long2 __ovld __cnfn convert_long2_rtp(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rtp(uchar2);
+long2 __ovld __cnfn convert_long2_rtn(uchar2);
+long2 __ovld __cnfn convert_long2_sat_rtn(uchar2);
+long2 __ovld __cnfn convert_long2(uchar2);
+long2 __ovld __cnfn convert_long2_sat(uchar2);
+long2 __ovld __cnfn convert_long2_rte(short2);
+long2 __ovld __cnfn convert_long2_sat_rte(short2);
+long2 __ovld __cnfn convert_long2_rtz(short2);
+long2 __ovld __cnfn convert_long2_sat_rtz(short2);
+long2 __ovld __cnfn convert_long2_rtp(short2);
+long2 __ovld __cnfn convert_long2_sat_rtp(short2);
+long2 __ovld __cnfn convert_long2_rtn(short2);
+long2 __ovld __cnfn convert_long2_sat_rtn(short2);
+long2 __ovld __cnfn convert_long2(short2);
+long2 __ovld __cnfn convert_long2_sat(short2);
+long2 __ovld __cnfn convert_long2_rte(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rte(ushort2);
+long2 __ovld __cnfn convert_long2_rtz(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rtz(ushort2);
+long2 __ovld __cnfn convert_long2_rtp(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rtp(ushort2);
+long2 __ovld __cnfn convert_long2_rtn(ushort2);
+long2 __ovld __cnfn convert_long2_sat_rtn(ushort2);
+long2 __ovld __cnfn convert_long2(ushort2);
+long2 __ovld __cnfn convert_long2_sat(ushort2);
+long2 __ovld __cnfn convert_long2_rte(int2);
+long2 __ovld __cnfn convert_long2_sat_rte(int2);
+long2 __ovld __cnfn convert_long2_rtz(int2);
+long2 __ovld __cnfn convert_long2_sat_rtz(int2);
+long2 __ovld __cnfn convert_long2_rtp(int2);
+long2 __ovld __cnfn convert_long2_sat_rtp(int2);
+long2 __ovld __cnfn convert_long2_rtn(int2);
+long2 __ovld __cnfn convert_long2_sat_rtn(int2);
+long2 __ovld __cnfn convert_long2(int2);
+long2 __ovld __cnfn convert_long2_sat(int2);
+long2 __ovld __cnfn convert_long2_rte(uint2);
+long2 __ovld __cnfn convert_long2_sat_rte(uint2);
+long2 __ovld __cnfn convert_long2_rtz(uint2);
+long2 __ovld __cnfn convert_long2_sat_rtz(uint2);
+long2 __ovld __cnfn convert_long2_rtp(uint2);
+long2 __ovld __cnfn convert_long2_sat_rtp(uint2);
+long2 __ovld __cnfn convert_long2_rtn(uint2);
+long2 __ovld __cnfn convert_long2_sat_rtn(uint2);
+long2 __ovld __cnfn convert_long2(uint2);
+long2 __ovld __cnfn convert_long2_sat(uint2);
+long2 __ovld __cnfn convert_long2_rte(long2);
+long2 __ovld __cnfn convert_long2_sat_rte(long2);
+long2 __ovld __cnfn convert_long2_rtz(long2);
+long2 __ovld __cnfn convert_long2_sat_rtz(long2);
+long2 __ovld __cnfn convert_long2_rtp(long2);
+long2 __ovld __cnfn convert_long2_sat_rtp(long2);
+long2 __ovld __cnfn convert_long2_rtn(long2);
+long2 __ovld __cnfn convert_long2_sat_rtn(long2);
+long2 __ovld __cnfn convert_long2(long2);
+long2 __ovld __cnfn convert_long2_sat(long2);
+long2 __ovld __cnfn convert_long2_rte(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rte(ulong2);
+long2 __ovld __cnfn convert_long2_rtz(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rtz(ulong2);
+long2 __ovld __cnfn convert_long2_rtp(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rtp(ulong2);
+long2 __ovld __cnfn convert_long2_rtn(ulong2);
+long2 __ovld __cnfn convert_long2_sat_rtn(ulong2);
+long2 __ovld __cnfn convert_long2(ulong2);
+long2 __ovld __cnfn convert_long2_sat(ulong2);
+long2 __ovld __cnfn convert_long2_rte(float2);
+long2 __ovld __cnfn convert_long2_sat_rte(float2);
+long2 __ovld __cnfn convert_long2_rtz(float2);
+long2 __ovld __cnfn convert_long2_sat_rtz(float2);
+long2 __ovld __cnfn convert_long2_rtp(float2);
+long2 __ovld __cnfn convert_long2_sat_rtp(float2);
+long2 __ovld __cnfn convert_long2_rtn(float2);
+long2 __ovld __cnfn convert_long2_sat_rtn(float2);
+long2 __ovld __cnfn convert_long2(float2);
+long2 __ovld __cnfn convert_long2_sat(float2);
+ulong2 __ovld __cnfn convert_ulong2_rte(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(char2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(char2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(char2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(char2);
+ulong2 __ovld __cnfn convert_ulong2(char2);
+ulong2 __ovld __cnfn convert_ulong2_sat(char2);
+ulong2 __ovld __cnfn convert_ulong2_rte(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(uchar2);
+ulong2 __ovld __cnfn convert_ulong2(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_sat(uchar2);
+ulong2 __ovld __cnfn convert_ulong2_rte(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(short2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(short2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(short2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(short2);
+ulong2 __ovld __cnfn convert_ulong2(short2);
+ulong2 __ovld __cnfn convert_ulong2_sat(short2);
+ulong2 __ovld __cnfn convert_ulong2_rte(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(ushort2);
+ulong2 __ovld __cnfn convert_ulong2(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_sat(ushort2);
+ulong2 __ovld __cnfn convert_ulong2_rte(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(int2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(int2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(int2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(int2);
+ulong2 __ovld __cnfn convert_ulong2(int2);
+ulong2 __ovld __cnfn convert_ulong2_sat(int2);
+ulong2 __ovld __cnfn convert_ulong2_rte(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(uint2);
+ulong2 __ovld __cnfn convert_ulong2(uint2);
+ulong2 __ovld __cnfn convert_ulong2_sat(uint2);
+ulong2 __ovld __cnfn convert_ulong2_rte(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(long2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(long2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(long2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(long2);
+ulong2 __ovld __cnfn convert_ulong2(long2);
+ulong2 __ovld __cnfn convert_ulong2_sat(long2);
+ulong2 __ovld __cnfn convert_ulong2_rte(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(ulong2);
+ulong2 __ovld __cnfn convert_ulong2(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_sat(ulong2);
+ulong2 __ovld __cnfn convert_ulong2_rte(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(float2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(float2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(float2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(float2);
+ulong2 __ovld __cnfn convert_ulong2(float2);
+ulong2 __ovld __cnfn convert_ulong2_sat(float2);
+float2 __ovld __cnfn convert_float2_rte(char2);
+float2 __ovld __cnfn convert_float2_rtz(char2);
+float2 __ovld __cnfn convert_float2_rtp(char2);
+float2 __ovld __cnfn convert_float2_rtn(char2);
+float2 __ovld __cnfn convert_float2(char2);
+float2 __ovld __cnfn convert_float2_rte(uchar2);
+float2 __ovld __cnfn convert_float2_rtz(uchar2);
+float2 __ovld __cnfn convert_float2_rtp(uchar2);
+float2 __ovld __cnfn convert_float2_rtn(uchar2);
+float2 __ovld __cnfn convert_float2(uchar2);
+float2 __ovld __cnfn convert_float2_rte(short2);
+float2 __ovld __cnfn convert_float2_rtz(short2);
+float2 __ovld __cnfn convert_float2_rtp(short2);
+float2 __ovld __cnfn convert_float2_rtn(short2);
+float2 __ovld __cnfn convert_float2(short2);
+float2 __ovld __cnfn convert_float2_rte(ushort2);
+float2 __ovld __cnfn convert_float2_rtz(ushort2);
+float2 __ovld __cnfn convert_float2_rtp(ushort2);
+float2 __ovld __cnfn convert_float2_rtn(ushort2);
+float2 __ovld __cnfn convert_float2(ushort2);
+float2 __ovld __cnfn convert_float2_rte(int2);
+float2 __ovld __cnfn convert_float2_rtz(int2);
+float2 __ovld __cnfn convert_float2_rtp(int2);
+float2 __ovld __cnfn convert_float2_rtn(int2);
+float2 __ovld __cnfn convert_float2(int2);
+float2 __ovld __cnfn convert_float2_rte(uint2);
+float2 __ovld __cnfn convert_float2_rtz(uint2);
+float2 __ovld __cnfn convert_float2_rtp(uint2);
+float2 __ovld __cnfn convert_float2_rtn(uint2);
+float2 __ovld __cnfn convert_float2(uint2);
+float2 __ovld __cnfn convert_float2_rte(long2);
+float2 __ovld __cnfn convert_float2_rtz(long2);
+float2 __ovld __cnfn convert_float2_rtp(long2);
+float2 __ovld __cnfn convert_float2_rtn(long2);
+float2 __ovld __cnfn convert_float2(long2);
+float2 __ovld __cnfn convert_float2_rte(ulong2);
+float2 __ovld __cnfn convert_float2_rtz(ulong2);
+float2 __ovld __cnfn convert_float2_rtp(ulong2);
+float2 __ovld __cnfn convert_float2_rtn(ulong2);
+float2 __ovld __cnfn convert_float2(ulong2);
+float2 __ovld __cnfn convert_float2_rte(float2);
+float2 __ovld __cnfn convert_float2_rtz(float2);
+float2 __ovld __cnfn convert_float2_rtp(float2);
+float2 __ovld __cnfn convert_float2_rtn(float2);
+float2 __ovld __cnfn convert_float2(float2);
+char3 __ovld __cnfn convert_char3_rte(char3);
+char3 __ovld __cnfn convert_char3_sat_rte(char3);
+char3 __ovld __cnfn convert_char3_rtz(char3);
+char3 __ovld __cnfn convert_char3_sat_rtz(char3);
+char3 __ovld __cnfn convert_char3_rtp(char3);
+char3 __ovld __cnfn convert_char3_sat_rtp(char3);
+char3 __ovld __cnfn convert_char3_rtn(char3);
+char3 __ovld __cnfn convert_char3_sat_rtn(char3);
+char3 __ovld __cnfn convert_char3(char3);
+char3 __ovld __cnfn convert_char3_sat(char3);
+char3 __ovld __cnfn convert_char3_rte(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rte(uchar3);
+char3 __ovld __cnfn convert_char3_rtz(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rtz(uchar3);
+char3 __ovld __cnfn convert_char3_rtp(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rtp(uchar3);
+char3 __ovld __cnfn convert_char3_rtn(uchar3);
+char3 __ovld __cnfn convert_char3_sat_rtn(uchar3);
+char3 __ovld __cnfn convert_char3(uchar3);
+char3 __ovld __cnfn convert_char3_sat(uchar3);
+char3 __ovld __cnfn convert_char3_rte(short3);
+char3 __ovld __cnfn convert_char3_sat_rte(short3);
+char3 __ovld __cnfn convert_char3_rtz(short3);
+char3 __ovld __cnfn convert_char3_sat_rtz(short3);
+char3 __ovld __cnfn convert_char3_rtp(short3);
+char3 __ovld __cnfn convert_char3_sat_rtp(short3);
+char3 __ovld __cnfn convert_char3_rtn(short3);
+char3 __ovld __cnfn convert_char3_sat_rtn(short3);
+char3 __ovld __cnfn convert_char3(short3);
+char3 __ovld __cnfn convert_char3_sat(short3);
+char3 __ovld __cnfn convert_char3_rte(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rte(ushort3);
+char3 __ovld __cnfn convert_char3_rtz(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rtz(ushort3);
+char3 __ovld __cnfn convert_char3_rtp(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rtp(ushort3);
+char3 __ovld __cnfn convert_char3_rtn(ushort3);
+char3 __ovld __cnfn convert_char3_sat_rtn(ushort3);
+char3 __ovld __cnfn convert_char3(ushort3);
+char3 __ovld __cnfn convert_char3_sat(ushort3);
+char3 __ovld __cnfn convert_char3_rte(int3);
+char3 __ovld __cnfn convert_char3_sat_rte(int3);
+char3 __ovld __cnfn convert_char3_rtz(int3);
+char3 __ovld __cnfn convert_char3_sat_rtz(int3);
+char3 __ovld __cnfn convert_char3_rtp(int3);
+char3 __ovld __cnfn convert_char3_sat_rtp(int3);
+char3 __ovld __cnfn convert_char3_rtn(int3);
+char3 __ovld __cnfn convert_char3_sat_rtn(int3);
+char3 __ovld __cnfn convert_char3(int3);
+char3 __ovld __cnfn convert_char3_sat(int3);
+char3 __ovld __cnfn convert_char3_rte(uint3);
+char3 __ovld __cnfn convert_char3_sat_rte(uint3);
+char3 __ovld __cnfn convert_char3_rtz(uint3);
+char3 __ovld __cnfn convert_char3_sat_rtz(uint3);
+char3 __ovld __cnfn convert_char3_rtp(uint3);
+char3 __ovld __cnfn convert_char3_sat_rtp(uint3);
+char3 __ovld __cnfn convert_char3_rtn(uint3);
+char3 __ovld __cnfn convert_char3_sat_rtn(uint3);
+char3 __ovld __cnfn convert_char3(uint3);
+char3 __ovld __cnfn convert_char3_sat(uint3);
+char3 __ovld __cnfn convert_char3_rte(long3);
+char3 __ovld __cnfn convert_char3_sat_rte(long3);
+char3 __ovld __cnfn convert_char3_rtz(long3);
+char3 __ovld __cnfn convert_char3_sat_rtz(long3);
+char3 __ovld __cnfn convert_char3_rtp(long3);
+char3 __ovld __cnfn convert_char3_sat_rtp(long3);
+char3 __ovld __cnfn convert_char3_rtn(long3);
+char3 __ovld __cnfn convert_char3_sat_rtn(long3);
+char3 __ovld __cnfn convert_char3(long3);
+char3 __ovld __cnfn convert_char3_sat(long3);
+char3 __ovld __cnfn convert_char3_rte(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rte(ulong3);
+char3 __ovld __cnfn convert_char3_rtz(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rtz(ulong3);
+char3 __ovld __cnfn convert_char3_rtp(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rtp(ulong3);
+char3 __ovld __cnfn convert_char3_rtn(ulong3);
+char3 __ovld __cnfn convert_char3_sat_rtn(ulong3);
+char3 __ovld __cnfn convert_char3(ulong3);
+char3 __ovld __cnfn convert_char3_sat(ulong3);
+char3 __ovld __cnfn convert_char3_rte(float3);
+char3 __ovld __cnfn convert_char3_sat_rte(float3);
+char3 __ovld __cnfn convert_char3_rtz(float3);
+char3 __ovld __cnfn convert_char3_sat_rtz(float3);
+char3 __ovld __cnfn convert_char3_rtp(float3);
+char3 __ovld __cnfn convert_char3_sat_rtp(float3);
+char3 __ovld __cnfn convert_char3_rtn(float3);
+char3 __ovld __cnfn convert_char3_sat_rtn(float3);
+char3 __ovld __cnfn convert_char3(float3);
+char3 __ovld __cnfn convert_char3_sat(float3);
+uchar3 __ovld __cnfn convert_uchar3_rte(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(char3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(char3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(char3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(char3);
+uchar3 __ovld __cnfn convert_uchar3(char3);
+uchar3 __ovld __cnfn convert_uchar3_sat(char3);
+uchar3 __ovld __cnfn convert_uchar3_rte(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(uchar3);
+uchar3 __ovld __cnfn convert_uchar3(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_sat(uchar3);
+uchar3 __ovld __cnfn convert_uchar3_rte(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(short3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(short3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(short3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(short3);
+uchar3 __ovld __cnfn convert_uchar3(short3);
+uchar3 __ovld __cnfn convert_uchar3_sat(short3);
+uchar3 __ovld __cnfn convert_uchar3_rte(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(ushort3);
+uchar3 __ovld __cnfn convert_uchar3(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_sat(ushort3);
+uchar3 __ovld __cnfn convert_uchar3_rte(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(int3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(int3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(int3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(int3);
+uchar3 __ovld __cnfn convert_uchar3(int3);
+uchar3 __ovld __cnfn convert_uchar3_sat(int3);
+uchar3 __ovld __cnfn convert_uchar3_rte(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(uint3);
+uchar3 __ovld __cnfn convert_uchar3(uint3);
+uchar3 __ovld __cnfn convert_uchar3_sat(uint3);
+uchar3 __ovld __cnfn convert_uchar3_rte(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(long3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(long3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(long3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(long3);
+uchar3 __ovld __cnfn convert_uchar3(long3);
+uchar3 __ovld __cnfn convert_uchar3_sat(long3);
+uchar3 __ovld __cnfn convert_uchar3_rte(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(ulong3);
+uchar3 __ovld __cnfn convert_uchar3(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_sat(ulong3);
+uchar3 __ovld __cnfn convert_uchar3_rte(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(float3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(float3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(float3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(float3);
+uchar3 __ovld __cnfn convert_uchar3(float3);
+uchar3 __ovld __cnfn convert_uchar3_sat(float3);
+short3 __ovld __cnfn convert_short3_rte(char3);
+short3 __ovld __cnfn convert_short3_sat_rte(char3);
+short3 __ovld __cnfn convert_short3_rtz(char3);
+short3 __ovld __cnfn convert_short3_sat_rtz(char3);
+short3 __ovld __cnfn convert_short3_rtp(char3);
+short3 __ovld __cnfn convert_short3_sat_rtp(char3);
+short3 __ovld __cnfn convert_short3_rtn(char3);
+short3 __ovld __cnfn convert_short3_sat_rtn(char3);
+short3 __ovld __cnfn convert_short3(char3);
+short3 __ovld __cnfn convert_short3_sat(char3);
+short3 __ovld __cnfn convert_short3_rte(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rte(uchar3);
+short3 __ovld __cnfn convert_short3_rtz(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rtz(uchar3);
+short3 __ovld __cnfn convert_short3_rtp(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rtp(uchar3);
+short3 __ovld __cnfn convert_short3_rtn(uchar3);
+short3 __ovld __cnfn convert_short3_sat_rtn(uchar3);
+short3 __ovld __cnfn convert_short3(uchar3);
+short3 __ovld __cnfn convert_short3_sat(uchar3);
+short3 __ovld __cnfn convert_short3_rte(short3);
+short3 __ovld __cnfn convert_short3_sat_rte(short3);
+short3 __ovld __cnfn convert_short3_rtz(short3);
+short3 __ovld __cnfn convert_short3_sat_rtz(short3);
+short3 __ovld __cnfn convert_short3_rtp(short3);
+short3 __ovld __cnfn convert_short3_sat_rtp(short3);
+short3 __ovld __cnfn convert_short3_rtn(short3);
+short3 __ovld __cnfn convert_short3_sat_rtn(short3);
+short3 __ovld __cnfn convert_short3(short3);
+short3 __ovld __cnfn convert_short3_sat(short3);
+short3 __ovld __cnfn convert_short3_rte(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rte(ushort3);
+short3 __ovld __cnfn convert_short3_rtz(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rtz(ushort3);
+short3 __ovld __cnfn convert_short3_rtp(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rtp(ushort3);
+short3 __ovld __cnfn convert_short3_rtn(ushort3);
+short3 __ovld __cnfn convert_short3_sat_rtn(ushort3);
+short3 __ovld __cnfn convert_short3(ushort3);
+short3 __ovld __cnfn convert_short3_sat(ushort3);
+short3 __ovld __cnfn convert_short3_rte(int3);
+short3 __ovld __cnfn convert_short3_sat_rte(int3);
+short3 __ovld __cnfn convert_short3_rtz(int3);
+short3 __ovld __cnfn convert_short3_sat_rtz(int3);
+short3 __ovld __cnfn convert_short3_rtp(int3);
+short3 __ovld __cnfn convert_short3_sat_rtp(int3);
+short3 __ovld __cnfn convert_short3_rtn(int3);
+short3 __ovld __cnfn convert_short3_sat_rtn(int3);
+short3 __ovld __cnfn convert_short3(int3);
+short3 __ovld __cnfn convert_short3_sat(int3);
+short3 __ovld __cnfn convert_short3_rte(uint3);
+short3 __ovld __cnfn convert_short3_sat_rte(uint3);
+short3 __ovld __cnfn convert_short3_rtz(uint3);
+short3 __ovld __cnfn convert_short3_sat_rtz(uint3);
+short3 __ovld __cnfn convert_short3_rtp(uint3);
+short3 __ovld __cnfn convert_short3_sat_rtp(uint3);
+short3 __ovld __cnfn convert_short3_rtn(uint3);
+short3 __ovld __cnfn convert_short3_sat_rtn(uint3);
+short3 __ovld __cnfn convert_short3(uint3);
+short3 __ovld __cnfn convert_short3_sat(uint3);
+short3 __ovld __cnfn convert_short3_rte(long3);
+short3 __ovld __cnfn convert_short3_sat_rte(long3);
+short3 __ovld __cnfn convert_short3_rtz(long3);
+short3 __ovld __cnfn convert_short3_sat_rtz(long3);
+short3 __ovld __cnfn convert_short3_rtp(long3);
+short3 __ovld __cnfn convert_short3_sat_rtp(long3);
+short3 __ovld __cnfn convert_short3_rtn(long3);
+short3 __ovld __cnfn convert_short3_sat_rtn(long3);
+short3 __ovld __cnfn convert_short3(long3);
+short3 __ovld __cnfn convert_short3_sat(long3);
+short3 __ovld __cnfn convert_short3_rte(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rte(ulong3);
+short3 __ovld __cnfn convert_short3_rtz(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rtz(ulong3);
+short3 __ovld __cnfn convert_short3_rtp(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rtp(ulong3);
+short3 __ovld __cnfn convert_short3_rtn(ulong3);
+short3 __ovld __cnfn convert_short3_sat_rtn(ulong3);
+short3 __ovld __cnfn convert_short3(ulong3);
+short3 __ovld __cnfn convert_short3_sat(ulong3);
+short3 __ovld __cnfn convert_short3_rte(float3);
+short3 __ovld __cnfn convert_short3_sat_rte(float3);
+short3 __ovld __cnfn convert_short3_rtz(float3);
+short3 __ovld __cnfn convert_short3_sat_rtz(float3);
+short3 __ovld __cnfn convert_short3_rtp(float3);
+short3 __ovld __cnfn convert_short3_sat_rtp(float3);
+short3 __ovld __cnfn convert_short3_rtn(float3);
+short3 __ovld __cnfn convert_short3_sat_rtn(float3);
+short3 __ovld __cnfn convert_short3(float3);
+short3 __ovld __cnfn convert_short3_sat(float3);
+ushort3 __ovld __cnfn convert_ushort3_rte(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(char3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(char3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(char3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(char3);
+ushort3 __ovld __cnfn convert_ushort3(char3);
+ushort3 __ovld __cnfn convert_ushort3_sat(char3);
+ushort3 __ovld __cnfn convert_ushort3_rte(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(uchar3);
+ushort3 __ovld __cnfn convert_ushort3(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_sat(uchar3);
+ushort3 __ovld __cnfn convert_ushort3_rte(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(short3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(short3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(short3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(short3);
+ushort3 __ovld __cnfn convert_ushort3(short3);
+ushort3 __ovld __cnfn convert_ushort3_sat(short3);
+ushort3 __ovld __cnfn convert_ushort3_rte(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(ushort3);
+ushort3 __ovld __cnfn convert_ushort3(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_sat(ushort3);
+ushort3 __ovld __cnfn convert_ushort3_rte(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(int3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(int3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(int3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(int3);
+ushort3 __ovld __cnfn convert_ushort3(int3);
+ushort3 __ovld __cnfn convert_ushort3_sat(int3);
+ushort3 __ovld __cnfn convert_ushort3_rte(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(uint3);
+ushort3 __ovld __cnfn convert_ushort3(uint3);
+ushort3 __ovld __cnfn convert_ushort3_sat(uint3);
+ushort3 __ovld __cnfn convert_ushort3_rte(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(long3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(long3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(long3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(long3);
+ushort3 __ovld __cnfn convert_ushort3(long3);
+ushort3 __ovld __cnfn convert_ushort3_sat(long3);
+ushort3 __ovld __cnfn convert_ushort3_rte(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(ulong3);
+ushort3 __ovld __cnfn convert_ushort3(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_sat(ulong3);
+ushort3 __ovld __cnfn convert_ushort3_rte(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(float3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(float3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(float3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(float3);
+ushort3 __ovld __cnfn convert_ushort3(float3);
+ushort3 __ovld __cnfn convert_ushort3_sat(float3);
+int3 __ovld __cnfn convert_int3_rte(char3);
+int3 __ovld __cnfn convert_int3_sat_rte(char3);
+int3 __ovld __cnfn convert_int3_rtz(char3);
+int3 __ovld __cnfn convert_int3_sat_rtz(char3);
+int3 __ovld __cnfn convert_int3_rtp(char3);
+int3 __ovld __cnfn convert_int3_sat_rtp(char3);
+int3 __ovld __cnfn convert_int3_rtn(char3);
+int3 __ovld __cnfn convert_int3_sat_rtn(char3);
+int3 __ovld __cnfn convert_int3(char3);
+int3 __ovld __cnfn convert_int3_sat(char3);
+int3 __ovld __cnfn convert_int3_rte(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rte(uchar3);
+int3 __ovld __cnfn convert_int3_rtz(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rtz(uchar3);
+int3 __ovld __cnfn convert_int3_rtp(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rtp(uchar3);
+int3 __ovld __cnfn convert_int3_rtn(uchar3);
+int3 __ovld __cnfn convert_int3_sat_rtn(uchar3);
+int3 __ovld __cnfn convert_int3(uchar3);
+int3 __ovld __cnfn convert_int3_sat(uchar3);
+int3 __ovld __cnfn convert_int3_rte(short3);
+int3 __ovld __cnfn convert_int3_sat_rte(short3);
+int3 __ovld __cnfn convert_int3_rtz(short3);
+int3 __ovld __cnfn convert_int3_sat_rtz(short3);
+int3 __ovld __cnfn convert_int3_rtp(short3);
+int3 __ovld __cnfn convert_int3_sat_rtp(short3);
+int3 __ovld __cnfn convert_int3_rtn(short3);
+int3 __ovld __cnfn convert_int3_sat_rtn(short3);
+int3 __ovld __cnfn convert_int3(short3);
+int3 __ovld __cnfn convert_int3_sat(short3);
+int3 __ovld __cnfn convert_int3_rte(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rte(ushort3);
+int3 __ovld __cnfn convert_int3_rtz(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rtz(ushort3);
+int3 __ovld __cnfn convert_int3_rtp(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rtp(ushort3);
+int3 __ovld __cnfn convert_int3_rtn(ushort3);
+int3 __ovld __cnfn convert_int3_sat_rtn(ushort3);
+int3 __ovld __cnfn convert_int3(ushort3);
+int3 __ovld __cnfn convert_int3_sat(ushort3);
+int3 __ovld __cnfn convert_int3_rte(int3);
+int3 __ovld __cnfn convert_int3_sat_rte(int3);
+int3 __ovld __cnfn convert_int3_rtz(int3);
+int3 __ovld __cnfn convert_int3_sat_rtz(int3);
+int3 __ovld __cnfn convert_int3_rtp(int3);
+int3 __ovld __cnfn convert_int3_sat_rtp(int3);
+int3 __ovld __cnfn convert_int3_rtn(int3);
+int3 __ovld __cnfn convert_int3_sat_rtn(int3);
+int3 __ovld __cnfn convert_int3(int3);
+int3 __ovld __cnfn convert_int3_sat(int3);
+int3 __ovld __cnfn convert_int3_rte(uint3);
+int3 __ovld __cnfn convert_int3_sat_rte(uint3);
+int3 __ovld __cnfn convert_int3_rtz(uint3);
+int3 __ovld __cnfn convert_int3_sat_rtz(uint3);
+int3 __ovld __cnfn convert_int3_rtp(uint3);
+int3 __ovld __cnfn convert_int3_sat_rtp(uint3);
+int3 __ovld __cnfn convert_int3_rtn(uint3);
+int3 __ovld __cnfn convert_int3_sat_rtn(uint3);
+int3 __ovld __cnfn convert_int3(uint3);
+int3 __ovld __cnfn convert_int3_sat(uint3);
+int3 __ovld __cnfn convert_int3_rte(long3);
+int3 __ovld __cnfn convert_int3_sat_rte(long3);
+int3 __ovld __cnfn convert_int3_rtz(long3);
+int3 __ovld __cnfn convert_int3_sat_rtz(long3);
+int3 __ovld __cnfn convert_int3_rtp(long3);
+int3 __ovld __cnfn convert_int3_sat_rtp(long3);
+int3 __ovld __cnfn convert_int3_rtn(long3);
+int3 __ovld __cnfn convert_int3_sat_rtn(long3);
+int3 __ovld __cnfn convert_int3(long3);
+int3 __ovld __cnfn convert_int3_sat(long3);
+int3 __ovld __cnfn convert_int3_rte(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rte(ulong3);
+int3 __ovld __cnfn convert_int3_rtz(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rtz(ulong3);
+int3 __ovld __cnfn convert_int3_rtp(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rtp(ulong3);
+int3 __ovld __cnfn convert_int3_rtn(ulong3);
+int3 __ovld __cnfn convert_int3_sat_rtn(ulong3);
+int3 __ovld __cnfn convert_int3(ulong3);
+int3 __ovld __cnfn convert_int3_sat(ulong3);
+int3 __ovld __cnfn convert_int3_rte(float3);
+int3 __ovld __cnfn convert_int3_sat_rte(float3);
+int3 __ovld __cnfn convert_int3_rtz(float3);
+int3 __ovld __cnfn convert_int3_sat_rtz(float3);
+int3 __ovld __cnfn convert_int3_rtp(float3);
+int3 __ovld __cnfn convert_int3_sat_rtp(float3);
+int3 __ovld __cnfn convert_int3_rtn(float3);
+int3 __ovld __cnfn convert_int3_sat_rtn(float3);
+int3 __ovld __cnfn convert_int3(float3);
+int3 __ovld __cnfn convert_int3_sat(float3);
+uint3 __ovld __cnfn convert_uint3_rte(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(char3);
+uint3 __ovld __cnfn convert_uint3_rtz(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(char3);
+uint3 __ovld __cnfn convert_uint3_rtp(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(char3);
+uint3 __ovld __cnfn convert_uint3_rtn(char3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(char3);
+uint3 __ovld __cnfn convert_uint3(char3);
+uint3 __ovld __cnfn convert_uint3_sat(char3);
+uint3 __ovld __cnfn convert_uint3_rte(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(uchar3);
+uint3 __ovld __cnfn convert_uint3_rtz(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(uchar3);
+uint3 __ovld __cnfn convert_uint3_rtp(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(uchar3);
+uint3 __ovld __cnfn convert_uint3_rtn(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(uchar3);
+uint3 __ovld __cnfn convert_uint3(uchar3);
+uint3 __ovld __cnfn convert_uint3_sat(uchar3);
+uint3 __ovld __cnfn convert_uint3_rte(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(short3);
+uint3 __ovld __cnfn convert_uint3_rtz(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(short3);
+uint3 __ovld __cnfn convert_uint3_rtp(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(short3);
+uint3 __ovld __cnfn convert_uint3_rtn(short3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(short3);
+uint3 __ovld __cnfn convert_uint3(short3);
+uint3 __ovld __cnfn convert_uint3_sat(short3);
+uint3 __ovld __cnfn convert_uint3_rte(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(ushort3);
+uint3 __ovld __cnfn convert_uint3_rtz(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(ushort3);
+uint3 __ovld __cnfn convert_uint3_rtp(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(ushort3);
+uint3 __ovld __cnfn convert_uint3_rtn(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(ushort3);
+uint3 __ovld __cnfn convert_uint3(ushort3);
+uint3 __ovld __cnfn convert_uint3_sat(ushort3);
+uint3 __ovld __cnfn convert_uint3_rte(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(int3);
+uint3 __ovld __cnfn convert_uint3_rtz(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(int3);
+uint3 __ovld __cnfn convert_uint3_rtp(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(int3);
+uint3 __ovld __cnfn convert_uint3_rtn(int3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(int3);
+uint3 __ovld __cnfn convert_uint3(int3);
+uint3 __ovld __cnfn convert_uint3_sat(int3);
+uint3 __ovld __cnfn convert_uint3_rte(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(uint3);
+uint3 __ovld __cnfn convert_uint3_rtz(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(uint3);
+uint3 __ovld __cnfn convert_uint3_rtp(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(uint3);
+uint3 __ovld __cnfn convert_uint3_rtn(uint3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(uint3);
+uint3 __ovld __cnfn convert_uint3(uint3);
+uint3 __ovld __cnfn convert_uint3_sat(uint3);
+uint3 __ovld __cnfn convert_uint3_rte(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(long3);
+uint3 __ovld __cnfn convert_uint3_rtz(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(long3);
+uint3 __ovld __cnfn convert_uint3_rtp(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(long3);
+uint3 __ovld __cnfn convert_uint3_rtn(long3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(long3);
+uint3 __ovld __cnfn convert_uint3(long3);
+uint3 __ovld __cnfn convert_uint3_sat(long3);
+uint3 __ovld __cnfn convert_uint3_rte(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(ulong3);
+uint3 __ovld __cnfn convert_uint3_rtz(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(ulong3);
+uint3 __ovld __cnfn convert_uint3_rtp(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(ulong3);
+uint3 __ovld __cnfn convert_uint3_rtn(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(ulong3);
+uint3 __ovld __cnfn convert_uint3(ulong3);
+uint3 __ovld __cnfn convert_uint3_sat(ulong3);
+uint3 __ovld __cnfn convert_uint3_rte(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(float3);
+uint3 __ovld __cnfn convert_uint3_rtz(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(float3);
+uint3 __ovld __cnfn convert_uint3_rtp(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(float3);
+uint3 __ovld __cnfn convert_uint3_rtn(float3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(float3);
+uint3 __ovld __cnfn convert_uint3(float3);
+uint3 __ovld __cnfn convert_uint3_sat(float3);
+long3 __ovld __cnfn convert_long3_rte(char3);
+long3 __ovld __cnfn convert_long3_sat_rte(char3);
+long3 __ovld __cnfn convert_long3_rtz(char3);
+long3 __ovld __cnfn convert_long3_sat_rtz(char3);
+long3 __ovld __cnfn convert_long3_rtp(char3);
+long3 __ovld __cnfn convert_long3_sat_rtp(char3);
+long3 __ovld __cnfn convert_long3_rtn(char3);
+long3 __ovld __cnfn convert_long3_sat_rtn(char3);
+long3 __ovld __cnfn convert_long3(char3);
+long3 __ovld __cnfn convert_long3_sat(char3);
+long3 __ovld __cnfn convert_long3_rte(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rte(uchar3);
+long3 __ovld __cnfn convert_long3_rtz(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rtz(uchar3);
+long3 __ovld __cnfn convert_long3_rtp(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rtp(uchar3);
+long3 __ovld __cnfn convert_long3_rtn(uchar3);
+long3 __ovld __cnfn convert_long3_sat_rtn(uchar3);
+long3 __ovld __cnfn convert_long3(uchar3);
+long3 __ovld __cnfn convert_long3_sat(uchar3);
+long3 __ovld __cnfn convert_long3_rte(short3);
+long3 __ovld __cnfn convert_long3_sat_rte(short3);
+long3 __ovld __cnfn convert_long3_rtz(short3);
+long3 __ovld __cnfn convert_long3_sat_rtz(short3);
+long3 __ovld __cnfn convert_long3_rtp(short3);
+long3 __ovld __cnfn convert_long3_sat_rtp(short3);
+long3 __ovld __cnfn convert_long3_rtn(short3);
+long3 __ovld __cnfn convert_long3_sat_rtn(short3);
+long3 __ovld __cnfn convert_long3(short3);
+long3 __ovld __cnfn convert_long3_sat(short3);
+long3 __ovld __cnfn convert_long3_rte(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rte(ushort3);
+long3 __ovld __cnfn convert_long3_rtz(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rtz(ushort3);
+long3 __ovld __cnfn convert_long3_rtp(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rtp(ushort3);
+long3 __ovld __cnfn convert_long3_rtn(ushort3);
+long3 __ovld __cnfn convert_long3_sat_rtn(ushort3);
+long3 __ovld __cnfn convert_long3(ushort3);
+long3 __ovld __cnfn convert_long3_sat(ushort3);
+long3 __ovld __cnfn convert_long3_rte(int3);
+long3 __ovld __cnfn convert_long3_sat_rte(int3);
+long3 __ovld __cnfn convert_long3_rtz(int3);
+long3 __ovld __cnfn convert_long3_sat_rtz(int3);
+long3 __ovld __cnfn convert_long3_rtp(int3);
+long3 __ovld __cnfn convert_long3_sat_rtp(int3);
+long3 __ovld __cnfn convert_long3_rtn(int3);
+long3 __ovld __cnfn convert_long3_sat_rtn(int3);
+long3 __ovld __cnfn convert_long3(int3);
+long3 __ovld __cnfn convert_long3_sat(int3);
+long3 __ovld __cnfn convert_long3_rte(uint3);
+long3 __ovld __cnfn convert_long3_sat_rte(uint3);
+long3 __ovld __cnfn convert_long3_rtz(uint3);
+long3 __ovld __cnfn convert_long3_sat_rtz(uint3);
+long3 __ovld __cnfn convert_long3_rtp(uint3);
+long3 __ovld __cnfn convert_long3_sat_rtp(uint3);
+long3 __ovld __cnfn convert_long3_rtn(uint3);
+long3 __ovld __cnfn convert_long3_sat_rtn(uint3);
+long3 __ovld __cnfn convert_long3(uint3);
+long3 __ovld __cnfn convert_long3_sat(uint3);
+long3 __ovld __cnfn convert_long3_rte(long3);
+long3 __ovld __cnfn convert_long3_sat_rte(long3);
+long3 __ovld __cnfn convert_long3_rtz(long3);
+long3 __ovld __cnfn convert_long3_sat_rtz(long3);
+long3 __ovld __cnfn convert_long3_rtp(long3);
+long3 __ovld __cnfn convert_long3_sat_rtp(long3);
+long3 __ovld __cnfn convert_long3_rtn(long3);
+long3 __ovld __cnfn convert_long3_sat_rtn(long3);
+long3 __ovld __cnfn convert_long3(long3);
+long3 __ovld __cnfn convert_long3_sat(long3);
+long3 __ovld __cnfn convert_long3_rte(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rte(ulong3);
+long3 __ovld __cnfn convert_long3_rtz(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rtz(ulong3);
+long3 __ovld __cnfn convert_long3_rtp(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rtp(ulong3);
+long3 __ovld __cnfn convert_long3_rtn(ulong3);
+long3 __ovld __cnfn convert_long3_sat_rtn(ulong3);
+long3 __ovld __cnfn convert_long3(ulong3);
+long3 __ovld __cnfn convert_long3_sat(ulong3);
+long3 __ovld __cnfn convert_long3_rte(float3);
+long3 __ovld __cnfn convert_long3_sat_rte(float3);
+long3 __ovld __cnfn convert_long3_rtz(float3);
+long3 __ovld __cnfn convert_long3_sat_rtz(float3);
+long3 __ovld __cnfn convert_long3_rtp(float3);
+long3 __ovld __cnfn convert_long3_sat_rtp(float3);
+long3 __ovld __cnfn convert_long3_rtn(float3);
+long3 __ovld __cnfn convert_long3_sat_rtn(float3);
+long3 __ovld __cnfn convert_long3(float3);
+long3 __ovld __cnfn convert_long3_sat(float3);
+ulong3 __ovld __cnfn convert_ulong3_rte(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(char3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(char3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(char3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(char3);
+ulong3 __ovld __cnfn convert_ulong3(char3);
+ulong3 __ovld __cnfn convert_ulong3_sat(char3);
+ulong3 __ovld __cnfn convert_ulong3_rte(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(uchar3);
+ulong3 __ovld __cnfn convert_ulong3(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_sat(uchar3);
+ulong3 __ovld __cnfn convert_ulong3_rte(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(short3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(short3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(short3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(short3);
+ulong3 __ovld __cnfn convert_ulong3(short3);
+ulong3 __ovld __cnfn convert_ulong3_sat(short3);
+ulong3 __ovld __cnfn convert_ulong3_rte(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(ushort3);
+ulong3 __ovld __cnfn convert_ulong3(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_sat(ushort3);
+ulong3 __ovld __cnfn convert_ulong3_rte(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(int3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(int3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(int3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(int3);
+ulong3 __ovld __cnfn convert_ulong3(int3);
+ulong3 __ovld __cnfn convert_ulong3_sat(int3);
+ulong3 __ovld __cnfn convert_ulong3_rte(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(uint3);
+ulong3 __ovld __cnfn convert_ulong3(uint3);
+ulong3 __ovld __cnfn convert_ulong3_sat(uint3);
+ulong3 __ovld __cnfn convert_ulong3_rte(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(long3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(long3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(long3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(long3);
+ulong3 __ovld __cnfn convert_ulong3(long3);
+ulong3 __ovld __cnfn convert_ulong3_sat(long3);
+ulong3 __ovld __cnfn convert_ulong3_rte(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(ulong3);
+ulong3 __ovld __cnfn convert_ulong3(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_sat(ulong3);
+ulong3 __ovld __cnfn convert_ulong3_rte(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(float3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(float3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(float3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(float3);
+ulong3 __ovld __cnfn convert_ulong3(float3);
+ulong3 __ovld __cnfn convert_ulong3_sat(float3);
+float3 __ovld __cnfn convert_float3_rte(char3);
+float3 __ovld __cnfn convert_float3_rtz(char3);
+float3 __ovld __cnfn convert_float3_rtp(char3);
+float3 __ovld __cnfn convert_float3_rtn(char3);
+float3 __ovld __cnfn convert_float3(char3);
+float3 __ovld __cnfn convert_float3_rte(uchar3);
+float3 __ovld __cnfn convert_float3_rtz(uchar3);
+float3 __ovld __cnfn convert_float3_rtp(uchar3);
+float3 __ovld __cnfn convert_float3_rtn(uchar3);
+float3 __ovld __cnfn convert_float3(uchar3);
+float3 __ovld __cnfn convert_float3_rte(short3);
+float3 __ovld __cnfn convert_float3_rtz(short3);
+float3 __ovld __cnfn convert_float3_rtp(short3);
+float3 __ovld __cnfn convert_float3_rtn(short3);
+float3 __ovld __cnfn convert_float3(short3);
+float3 __ovld __cnfn convert_float3_rte(ushort3);
+float3 __ovld __cnfn convert_float3_rtz(ushort3);
+float3 __ovld __cnfn convert_float3_rtp(ushort3);
+float3 __ovld __cnfn convert_float3_rtn(ushort3);
+float3 __ovld __cnfn convert_float3(ushort3);
+float3 __ovld __cnfn convert_float3_rte(int3);
+float3 __ovld __cnfn convert_float3_rtz(int3);
+float3 __ovld __cnfn convert_float3_rtp(int3);
+float3 __ovld __cnfn convert_float3_rtn(int3);
+float3 __ovld __cnfn convert_float3(int3);
+float3 __ovld __cnfn convert_float3_rte(uint3);
+float3 __ovld __cnfn convert_float3_rtz(uint3);
+float3 __ovld __cnfn convert_float3_rtp(uint3);
+float3 __ovld __cnfn convert_float3_rtn(uint3);
+float3 __ovld __cnfn convert_float3(uint3);
+float3 __ovld __cnfn convert_float3_rte(long3);
+float3 __ovld __cnfn convert_float3_rtz(long3);
+float3 __ovld __cnfn convert_float3_rtp(long3);
+float3 __ovld __cnfn convert_float3_rtn(long3);
+float3 __ovld __cnfn convert_float3(long3);
+float3 __ovld __cnfn convert_float3_rte(ulong3);
+float3 __ovld __cnfn convert_float3_rtz(ulong3);
+float3 __ovld __cnfn convert_float3_rtp(ulong3);
+float3 __ovld __cnfn convert_float3_rtn(ulong3);
+float3 __ovld __cnfn convert_float3(ulong3);
+float3 __ovld __cnfn convert_float3_rte(float3);
+float3 __ovld __cnfn convert_float3_rtz(float3);
+float3 __ovld __cnfn convert_float3_rtp(float3);
+float3 __ovld __cnfn convert_float3_rtn(float3);
+float3 __ovld __cnfn convert_float3(float3);
+char4 __ovld __cnfn convert_char4_rte(char4);
+char4 __ovld __cnfn convert_char4_sat_rte(char4);
+char4 __ovld __cnfn convert_char4_rtz(char4);
+char4 __ovld __cnfn convert_char4_sat_rtz(char4);
+char4 __ovld __cnfn convert_char4_rtp(char4);
+char4 __ovld __cnfn convert_char4_sat_rtp(char4);
+char4 __ovld __cnfn convert_char4_rtn(char4);
+char4 __ovld __cnfn convert_char4_sat_rtn(char4);
+char4 __ovld __cnfn convert_char4(char4);
+char4 __ovld __cnfn convert_char4_sat(char4);
+char4 __ovld __cnfn convert_char4_rte(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rte(uchar4);
+char4 __ovld __cnfn convert_char4_rtz(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rtz(uchar4);
+char4 __ovld __cnfn convert_char4_rtp(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rtp(uchar4);
+char4 __ovld __cnfn convert_char4_rtn(uchar4);
+char4 __ovld __cnfn convert_char4_sat_rtn(uchar4);
+char4 __ovld __cnfn convert_char4(uchar4);
+char4 __ovld __cnfn convert_char4_sat(uchar4);
+char4 __ovld __cnfn convert_char4_rte(short4);
+char4 __ovld __cnfn convert_char4_sat_rte(short4);
+char4 __ovld __cnfn convert_char4_rtz(short4);
+char4 __ovld __cnfn convert_char4_sat_rtz(short4);
+char4 __ovld __cnfn convert_char4_rtp(short4);
+char4 __ovld __cnfn convert_char4_sat_rtp(short4);
+char4 __ovld __cnfn convert_char4_rtn(short4);
+char4 __ovld __cnfn convert_char4_sat_rtn(short4);
+char4 __ovld __cnfn convert_char4(short4);
+char4 __ovld __cnfn convert_char4_sat(short4);
+char4 __ovld __cnfn convert_char4_rte(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rte(ushort4);
+char4 __ovld __cnfn convert_char4_rtz(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rtz(ushort4);
+char4 __ovld __cnfn convert_char4_rtp(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rtp(ushort4);
+char4 __ovld __cnfn convert_char4_rtn(ushort4);
+char4 __ovld __cnfn convert_char4_sat_rtn(ushort4);
+char4 __ovld __cnfn convert_char4(ushort4);
+char4 __ovld __cnfn convert_char4_sat(ushort4);
+char4 __ovld __cnfn convert_char4_rte(int4);
+char4 __ovld __cnfn convert_char4_sat_rte(int4);
+char4 __ovld __cnfn convert_char4_rtz(int4);
+char4 __ovld __cnfn convert_char4_sat_rtz(int4);
+char4 __ovld __cnfn convert_char4_rtp(int4);
+char4 __ovld __cnfn convert_char4_sat_rtp(int4);
+char4 __ovld __cnfn convert_char4_rtn(int4);
+char4 __ovld __cnfn convert_char4_sat_rtn(int4);
+char4 __ovld __cnfn convert_char4(int4);
+char4 __ovld __cnfn convert_char4_sat(int4);
+char4 __ovld __cnfn convert_char4_rte(uint4);
+char4 __ovld __cnfn convert_char4_sat_rte(uint4);
+char4 __ovld __cnfn convert_char4_rtz(uint4);
+char4 __ovld __cnfn convert_char4_sat_rtz(uint4);
+char4 __ovld __cnfn convert_char4_rtp(uint4);
+char4 __ovld __cnfn convert_char4_sat_rtp(uint4);
+char4 __ovld __cnfn convert_char4_rtn(uint4);
+char4 __ovld __cnfn convert_char4_sat_rtn(uint4);
+char4 __ovld __cnfn convert_char4(uint4);
+char4 __ovld __cnfn convert_char4_sat(uint4);
+char4 __ovld __cnfn convert_char4_rte(long4);
+char4 __ovld __cnfn convert_char4_sat_rte(long4);
+char4 __ovld __cnfn convert_char4_rtz(long4);
+char4 __ovld __cnfn convert_char4_sat_rtz(long4);
+char4 __ovld __cnfn convert_char4_rtp(long4);
+char4 __ovld __cnfn convert_char4_sat_rtp(long4);
+char4 __ovld __cnfn convert_char4_rtn(long4);
+char4 __ovld __cnfn convert_char4_sat_rtn(long4);
+char4 __ovld __cnfn convert_char4(long4);
+char4 __ovld __cnfn convert_char4_sat(long4);
+char4 __ovld __cnfn convert_char4_rte(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rte(ulong4);
+char4 __ovld __cnfn convert_char4_rtz(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rtz(ulong4);
+char4 __ovld __cnfn convert_char4_rtp(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rtp(ulong4);
+char4 __ovld __cnfn convert_char4_rtn(ulong4);
+char4 __ovld __cnfn convert_char4_sat_rtn(ulong4);
+char4 __ovld __cnfn convert_char4(ulong4);
+char4 __ovld __cnfn convert_char4_sat(ulong4);
+char4 __ovld __cnfn convert_char4_rte(float4);
+char4 __ovld __cnfn convert_char4_sat_rte(float4);
+char4 __ovld __cnfn convert_char4_rtz(float4);
+char4 __ovld __cnfn convert_char4_sat_rtz(float4);
+char4 __ovld __cnfn convert_char4_rtp(float4);
+char4 __ovld __cnfn convert_char4_sat_rtp(float4);
+char4 __ovld __cnfn convert_char4_rtn(float4);
+char4 __ovld __cnfn convert_char4_sat_rtn(float4);
+char4 __ovld __cnfn convert_char4(float4);
+char4 __ovld __cnfn convert_char4_sat(float4);
+uchar4 __ovld __cnfn convert_uchar4_rte(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(char4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(char4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(char4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(char4);
+uchar4 __ovld __cnfn convert_uchar4(char4);
+uchar4 __ovld __cnfn convert_uchar4_sat(char4);
+uchar4 __ovld __cnfn convert_uchar4_rte(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(uchar4);
+uchar4 __ovld __cnfn convert_uchar4(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_sat(uchar4);
+uchar4 __ovld __cnfn convert_uchar4_rte(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(short4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(short4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(short4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(short4);
+uchar4 __ovld __cnfn convert_uchar4(short4);
+uchar4 __ovld __cnfn convert_uchar4_sat(short4);
+uchar4 __ovld __cnfn convert_uchar4_rte(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(ushort4);
+uchar4 __ovld __cnfn convert_uchar4(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_sat(ushort4);
+uchar4 __ovld __cnfn convert_uchar4_rte(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(int4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(int4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(int4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(int4);
+uchar4 __ovld __cnfn convert_uchar4(int4);
+uchar4 __ovld __cnfn convert_uchar4_sat(int4);
+uchar4 __ovld __cnfn convert_uchar4_rte(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(uint4);
+uchar4 __ovld __cnfn convert_uchar4(uint4);
+uchar4 __ovld __cnfn convert_uchar4_sat(uint4);
+uchar4 __ovld __cnfn convert_uchar4_rte(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(long4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(long4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(long4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(long4);
+uchar4 __ovld __cnfn convert_uchar4(long4);
+uchar4 __ovld __cnfn convert_uchar4_sat(long4);
+uchar4 __ovld __cnfn convert_uchar4_rte(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(ulong4);
+uchar4 __ovld __cnfn convert_uchar4(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_sat(ulong4);
+uchar4 __ovld __cnfn convert_uchar4_rte(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(float4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(float4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(float4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(float4);
+uchar4 __ovld __cnfn convert_uchar4(float4);
+uchar4 __ovld __cnfn convert_uchar4_sat(float4);
+short4 __ovld __cnfn convert_short4_rte(char4);
+short4 __ovld __cnfn convert_short4_sat_rte(char4);
+short4 __ovld __cnfn convert_short4_rtz(char4);
+short4 __ovld __cnfn convert_short4_sat_rtz(char4);
+short4 __ovld __cnfn convert_short4_rtp(char4);
+short4 __ovld __cnfn convert_short4_sat_rtp(char4);
+short4 __ovld __cnfn convert_short4_rtn(char4);
+short4 __ovld __cnfn convert_short4_sat_rtn(char4);
+short4 __ovld __cnfn convert_short4(char4);
+short4 __ovld __cnfn convert_short4_sat(char4);
+short4 __ovld __cnfn convert_short4_rte(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rte(uchar4);
+short4 __ovld __cnfn convert_short4_rtz(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rtz(uchar4);
+short4 __ovld __cnfn convert_short4_rtp(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rtp(uchar4);
+short4 __ovld __cnfn convert_short4_rtn(uchar4);
+short4 __ovld __cnfn convert_short4_sat_rtn(uchar4);
+short4 __ovld __cnfn convert_short4(uchar4);
+short4 __ovld __cnfn convert_short4_sat(uchar4);
+short4 __ovld __cnfn convert_short4_rte(short4);
+short4 __ovld __cnfn convert_short4_sat_rte(short4);
+short4 __ovld __cnfn convert_short4_rtz(short4);
+short4 __ovld __cnfn convert_short4_sat_rtz(short4);
+short4 __ovld __cnfn convert_short4_rtp(short4);
+short4 __ovld __cnfn convert_short4_sat_rtp(short4);
+short4 __ovld __cnfn convert_short4_rtn(short4);
+short4 __ovld __cnfn convert_short4_sat_rtn(short4);
+short4 __ovld __cnfn convert_short4(short4);
+short4 __ovld __cnfn convert_short4_sat(short4);
+short4 __ovld __cnfn convert_short4_rte(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rte(ushort4);
+short4 __ovld __cnfn convert_short4_rtz(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rtz(ushort4);
+short4 __ovld __cnfn convert_short4_rtp(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rtp(ushort4);
+short4 __ovld __cnfn convert_short4_rtn(ushort4);
+short4 __ovld __cnfn convert_short4_sat_rtn(ushort4);
+short4 __ovld __cnfn convert_short4(ushort4);
+short4 __ovld __cnfn convert_short4_sat(ushort4);
+short4 __ovld __cnfn convert_short4_rte(int4);
+short4 __ovld __cnfn convert_short4_sat_rte(int4);
+short4 __ovld __cnfn convert_short4_rtz(int4);
+short4 __ovld __cnfn convert_short4_sat_rtz(int4);
+short4 __ovld __cnfn convert_short4_rtp(int4);
+short4 __ovld __cnfn convert_short4_sat_rtp(int4);
+short4 __ovld __cnfn convert_short4_rtn(int4);
+short4 __ovld __cnfn convert_short4_sat_rtn(int4);
+short4 __ovld __cnfn convert_short4(int4);
+short4 __ovld __cnfn convert_short4_sat(int4);
+short4 __ovld __cnfn convert_short4_rte(uint4);
+short4 __ovld __cnfn convert_short4_sat_rte(uint4);
+short4 __ovld __cnfn convert_short4_rtz(uint4);
+short4 __ovld __cnfn convert_short4_sat_rtz(uint4);
+short4 __ovld __cnfn convert_short4_rtp(uint4);
+short4 __ovld __cnfn convert_short4_sat_rtp(uint4);
+short4 __ovld __cnfn convert_short4_rtn(uint4);
+short4 __ovld __cnfn convert_short4_sat_rtn(uint4);
+short4 __ovld __cnfn convert_short4(uint4);
+short4 __ovld __cnfn convert_short4_sat(uint4);
+short4 __ovld __cnfn convert_short4_rte(long4);
+short4 __ovld __cnfn convert_short4_sat_rte(long4);
+short4 __ovld __cnfn convert_short4_rtz(long4);
+short4 __ovld __cnfn convert_short4_sat_rtz(long4);
+short4 __ovld __cnfn convert_short4_rtp(long4);
+short4 __ovld __cnfn convert_short4_sat_rtp(long4);
+short4 __ovld __cnfn convert_short4_rtn(long4);
+short4 __ovld __cnfn convert_short4_sat_rtn(long4);
+short4 __ovld __cnfn convert_short4(long4);
+short4 __ovld __cnfn convert_short4_sat(long4);
+short4 __ovld __cnfn convert_short4_rte(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rte(ulong4);
+short4 __ovld __cnfn convert_short4_rtz(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rtz(ulong4);
+short4 __ovld __cnfn convert_short4_rtp(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rtp(ulong4);
+short4 __ovld __cnfn convert_short4_rtn(ulong4);
+short4 __ovld __cnfn convert_short4_sat_rtn(ulong4);
+short4 __ovld __cnfn convert_short4(ulong4);
+short4 __ovld __cnfn convert_short4_sat(ulong4);
+short4 __ovld __cnfn convert_short4_rte(float4);
+short4 __ovld __cnfn convert_short4_sat_rte(float4);
+short4 __ovld __cnfn convert_short4_rtz(float4);
+short4 __ovld __cnfn convert_short4_sat_rtz(float4);
+short4 __ovld __cnfn convert_short4_rtp(float4);
+short4 __ovld __cnfn convert_short4_sat_rtp(float4);
+short4 __ovld __cnfn convert_short4_rtn(float4);
+short4 __ovld __cnfn convert_short4_sat_rtn(float4);
+short4 __ovld __cnfn convert_short4(float4);
+short4 __ovld __cnfn convert_short4_sat(float4);
+ushort4 __ovld __cnfn convert_ushort4_rte(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(char4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(char4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(char4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(char4);
+ushort4 __ovld __cnfn convert_ushort4(char4);
+ushort4 __ovld __cnfn convert_ushort4_sat(char4);
+ushort4 __ovld __cnfn convert_ushort4_rte(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(uchar4);
+ushort4 __ovld __cnfn convert_ushort4(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_sat(uchar4);
+ushort4 __ovld __cnfn convert_ushort4_rte(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(short4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(short4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(short4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(short4);
+ushort4 __ovld __cnfn convert_ushort4(short4);
+ushort4 __ovld __cnfn convert_ushort4_sat(short4);
+ushort4 __ovld __cnfn convert_ushort4_rte(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(ushort4);
+ushort4 __ovld __cnfn convert_ushort4(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_sat(ushort4);
+ushort4 __ovld __cnfn convert_ushort4_rte(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(int4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(int4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(int4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(int4);
+ushort4 __ovld __cnfn convert_ushort4(int4);
+ushort4 __ovld __cnfn convert_ushort4_sat(int4);
+ushort4 __ovld __cnfn convert_ushort4_rte(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(uint4);
+ushort4 __ovld __cnfn convert_ushort4(uint4);
+ushort4 __ovld __cnfn convert_ushort4_sat(uint4);
+ushort4 __ovld __cnfn convert_ushort4_rte(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(long4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(long4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(long4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(long4);
+ushort4 __ovld __cnfn convert_ushort4(long4);
+ushort4 __ovld __cnfn convert_ushort4_sat(long4);
+ushort4 __ovld __cnfn convert_ushort4_rte(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(ulong4);
+ushort4 __ovld __cnfn convert_ushort4(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_sat(ulong4);
+ushort4 __ovld __cnfn convert_ushort4_rte(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(float4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(float4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(float4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(float4);
+ushort4 __ovld __cnfn convert_ushort4(float4);
+ushort4 __ovld __cnfn convert_ushort4_sat(float4);
+int4 __ovld __cnfn convert_int4_rte(char4);
+int4 __ovld __cnfn convert_int4_sat_rte(char4);
+int4 __ovld __cnfn convert_int4_rtz(char4);
+int4 __ovld __cnfn convert_int4_sat_rtz(char4);
+int4 __ovld __cnfn convert_int4_rtp(char4);
+int4 __ovld __cnfn convert_int4_sat_rtp(char4);
+int4 __ovld __cnfn convert_int4_rtn(char4);
+int4 __ovld __cnfn convert_int4_sat_rtn(char4);
+int4 __ovld __cnfn convert_int4(char4);
+int4 __ovld __cnfn convert_int4_sat(char4);
+int4 __ovld __cnfn convert_int4_rte(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rte(uchar4);
+int4 __ovld __cnfn convert_int4_rtz(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rtz(uchar4);
+int4 __ovld __cnfn convert_int4_rtp(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rtp(uchar4);
+int4 __ovld __cnfn convert_int4_rtn(uchar4);
+int4 __ovld __cnfn convert_int4_sat_rtn(uchar4);
+int4 __ovld __cnfn convert_int4(uchar4);
+int4 __ovld __cnfn convert_int4_sat(uchar4);
+int4 __ovld __cnfn convert_int4_rte(short4);
+int4 __ovld __cnfn convert_int4_sat_rte(short4);
+int4 __ovld __cnfn convert_int4_rtz(short4);
+int4 __ovld __cnfn convert_int4_sat_rtz(short4);
+int4 __ovld __cnfn convert_int4_rtp(short4);
+int4 __ovld __cnfn convert_int4_sat_rtp(short4);
+int4 __ovld __cnfn convert_int4_rtn(short4);
+int4 __ovld __cnfn convert_int4_sat_rtn(short4);
+int4 __ovld __cnfn convert_int4(short4);
+int4 __ovld __cnfn convert_int4_sat(short4);
+int4 __ovld __cnfn convert_int4_rte(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rte(ushort4);
+int4 __ovld __cnfn convert_int4_rtz(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rtz(ushort4);
+int4 __ovld __cnfn convert_int4_rtp(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rtp(ushort4);
+int4 __ovld __cnfn convert_int4_rtn(ushort4);
+int4 __ovld __cnfn convert_int4_sat_rtn(ushort4);
+int4 __ovld __cnfn convert_int4(ushort4);
+int4 __ovld __cnfn convert_int4_sat(ushort4);
+int4 __ovld __cnfn convert_int4_rte(int4);
+int4 __ovld __cnfn convert_int4_sat_rte(int4);
+int4 __ovld __cnfn convert_int4_rtz(int4);
+int4 __ovld __cnfn convert_int4_sat_rtz(int4);
+int4 __ovld __cnfn convert_int4_rtp(int4);
+int4 __ovld __cnfn convert_int4_sat_rtp(int4);
+int4 __ovld __cnfn convert_int4_rtn(int4);
+int4 __ovld __cnfn convert_int4_sat_rtn(int4);
+int4 __ovld __cnfn convert_int4(int4);
+int4 __ovld __cnfn convert_int4_sat(int4);
+int4 __ovld __cnfn convert_int4_rte(uint4);
+int4 __ovld __cnfn convert_int4_sat_rte(uint4);
+int4 __ovld __cnfn convert_int4_rtz(uint4);
+int4 __ovld __cnfn convert_int4_sat_rtz(uint4);
+int4 __ovld __cnfn convert_int4_rtp(uint4);
+int4 __ovld __cnfn convert_int4_sat_rtp(uint4);
+int4 __ovld __cnfn convert_int4_rtn(uint4);
+int4 __ovld __cnfn convert_int4_sat_rtn(uint4);
+int4 __ovld __cnfn convert_int4(uint4);
+int4 __ovld __cnfn convert_int4_sat(uint4);
+int4 __ovld __cnfn convert_int4_rte(long4);
+int4 __ovld __cnfn convert_int4_sat_rte(long4);
+int4 __ovld __cnfn convert_int4_rtz(long4);
+int4 __ovld __cnfn convert_int4_sat_rtz(long4);
+int4 __ovld __cnfn convert_int4_rtp(long4);
+int4 __ovld __cnfn convert_int4_sat_rtp(long4);
+int4 __ovld __cnfn convert_int4_rtn(long4);
+int4 __ovld __cnfn convert_int4_sat_rtn(long4);
+int4 __ovld __cnfn convert_int4(long4);
+int4 __ovld __cnfn convert_int4_sat(long4);
+int4 __ovld __cnfn convert_int4_rte(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rte(ulong4);
+int4 __ovld __cnfn convert_int4_rtz(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rtz(ulong4);
+int4 __ovld __cnfn convert_int4_rtp(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rtp(ulong4);
+int4 __ovld __cnfn convert_int4_rtn(ulong4);
+int4 __ovld __cnfn convert_int4_sat_rtn(ulong4);
+int4 __ovld __cnfn convert_int4(ulong4);
+int4 __ovld __cnfn convert_int4_sat(ulong4);
+int4 __ovld __cnfn convert_int4_rte(float4);
+int4 __ovld __cnfn convert_int4_sat_rte(float4);
+int4 __ovld __cnfn convert_int4_rtz(float4);
+int4 __ovld __cnfn convert_int4_sat_rtz(float4);
+int4 __ovld __cnfn convert_int4_rtp(float4);
+int4 __ovld __cnfn convert_int4_sat_rtp(float4);
+int4 __ovld __cnfn convert_int4_rtn(float4);
+int4 __ovld __cnfn convert_int4_sat_rtn(float4);
+int4 __ovld __cnfn convert_int4(float4);
+int4 __ovld __cnfn convert_int4_sat(float4);
+uint4 __ovld __cnfn convert_uint4_rte(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(char4);
+uint4 __ovld __cnfn convert_uint4_rtz(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(char4);
+uint4 __ovld __cnfn convert_uint4_rtp(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(char4);
+uint4 __ovld __cnfn convert_uint4_rtn(char4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(char4);
+uint4 __ovld __cnfn convert_uint4(char4);
+uint4 __ovld __cnfn convert_uint4_sat(char4);
+uint4 __ovld __cnfn convert_uint4_rte(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(uchar4);
+uint4 __ovld __cnfn convert_uint4_rtz(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(uchar4);
+uint4 __ovld __cnfn convert_uint4_rtp(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(uchar4);
+uint4 __ovld __cnfn convert_uint4_rtn(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(uchar4);
+uint4 __ovld __cnfn convert_uint4(uchar4);
+uint4 __ovld __cnfn convert_uint4_sat(uchar4);
+uint4 __ovld __cnfn convert_uint4_rte(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(short4);
+uint4 __ovld __cnfn convert_uint4_rtz(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(short4);
+uint4 __ovld __cnfn convert_uint4_rtp(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(short4);
+uint4 __ovld __cnfn convert_uint4_rtn(short4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(short4);
+uint4 __ovld __cnfn convert_uint4(short4);
+uint4 __ovld __cnfn convert_uint4_sat(short4);
+uint4 __ovld __cnfn convert_uint4_rte(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(ushort4);
+uint4 __ovld __cnfn convert_uint4_rtz(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(ushort4);
+uint4 __ovld __cnfn convert_uint4_rtp(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(ushort4);
+uint4 __ovld __cnfn convert_uint4_rtn(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(ushort4);
+uint4 __ovld __cnfn convert_uint4(ushort4);
+uint4 __ovld __cnfn convert_uint4_sat(ushort4);
+uint4 __ovld __cnfn convert_uint4_rte(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(int4);
+uint4 __ovld __cnfn convert_uint4_rtz(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(int4);
+uint4 __ovld __cnfn convert_uint4_rtp(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(int4);
+uint4 __ovld __cnfn convert_uint4_rtn(int4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(int4);
+uint4 __ovld __cnfn convert_uint4(int4);
+uint4 __ovld __cnfn convert_uint4_sat(int4);
+uint4 __ovld __cnfn convert_uint4_rte(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(uint4);
+uint4 __ovld __cnfn convert_uint4_rtz(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(uint4);
+uint4 __ovld __cnfn convert_uint4_rtp(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(uint4);
+uint4 __ovld __cnfn convert_uint4_rtn(uint4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(uint4);
+uint4 __ovld __cnfn convert_uint4(uint4);
+uint4 __ovld __cnfn convert_uint4_sat(uint4);
+uint4 __ovld __cnfn convert_uint4_rte(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(long4);
+uint4 __ovld __cnfn convert_uint4_rtz(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(long4);
+uint4 __ovld __cnfn convert_uint4_rtp(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(long4);
+uint4 __ovld __cnfn convert_uint4_rtn(long4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(long4);
+uint4 __ovld __cnfn convert_uint4(long4);
+uint4 __ovld __cnfn convert_uint4_sat(long4);
+uint4 __ovld __cnfn convert_uint4_rte(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(ulong4);
+uint4 __ovld __cnfn convert_uint4_rtz(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(ulong4);
+uint4 __ovld __cnfn convert_uint4_rtp(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(ulong4);
+uint4 __ovld __cnfn convert_uint4_rtn(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(ulong4);
+uint4 __ovld __cnfn convert_uint4(ulong4);
+uint4 __ovld __cnfn convert_uint4_sat(ulong4);
+uint4 __ovld __cnfn convert_uint4_rte(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(float4);
+uint4 __ovld __cnfn convert_uint4_rtz(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(float4);
+uint4 __ovld __cnfn convert_uint4_rtp(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(float4);
+uint4 __ovld __cnfn convert_uint4_rtn(float4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(float4);
+uint4 __ovld __cnfn convert_uint4(float4);
+uint4 __ovld __cnfn convert_uint4_sat(float4);
+long4 __ovld __cnfn convert_long4_rte(char4);
+long4 __ovld __cnfn convert_long4_sat_rte(char4);
+long4 __ovld __cnfn convert_long4_rtz(char4);
+long4 __ovld __cnfn convert_long4_sat_rtz(char4);
+long4 __ovld __cnfn convert_long4_rtp(char4);
+long4 __ovld __cnfn convert_long4_sat_rtp(char4);
+long4 __ovld __cnfn convert_long4_rtn(char4);
+long4 __ovld __cnfn convert_long4_sat_rtn(char4);
+long4 __ovld __cnfn convert_long4(char4);
+long4 __ovld __cnfn convert_long4_sat(char4);
+long4 __ovld __cnfn convert_long4_rte(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rte(uchar4);
+long4 __ovld __cnfn convert_long4_rtz(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rtz(uchar4);
+long4 __ovld __cnfn convert_long4_rtp(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rtp(uchar4);
+long4 __ovld __cnfn convert_long4_rtn(uchar4);
+long4 __ovld __cnfn convert_long4_sat_rtn(uchar4);
+long4 __ovld __cnfn convert_long4(uchar4);
+long4 __ovld __cnfn convert_long4_sat(uchar4);
+long4 __ovld __cnfn convert_long4_rte(short4);
+long4 __ovld __cnfn convert_long4_sat_rte(short4);
+long4 __ovld __cnfn convert_long4_rtz(short4);
+long4 __ovld __cnfn convert_long4_sat_rtz(short4);
+long4 __ovld __cnfn convert_long4_rtp(short4);
+long4 __ovld __cnfn convert_long4_sat_rtp(short4);
+long4 __ovld __cnfn convert_long4_rtn(short4);
+long4 __ovld __cnfn convert_long4_sat_rtn(short4);
+long4 __ovld __cnfn convert_long4(short4);
+long4 __ovld __cnfn convert_long4_sat(short4);
+long4 __ovld __cnfn convert_long4_rte(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rte(ushort4);
+long4 __ovld __cnfn convert_long4_rtz(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rtz(ushort4);
+long4 __ovld __cnfn convert_long4_rtp(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rtp(ushort4);
+long4 __ovld __cnfn convert_long4_rtn(ushort4);
+long4 __ovld __cnfn convert_long4_sat_rtn(ushort4);
+long4 __ovld __cnfn convert_long4(ushort4);
+long4 __ovld __cnfn convert_long4_sat(ushort4);
+long4 __ovld __cnfn convert_long4_rte(int4);
+long4 __ovld __cnfn convert_long4_sat_rte(int4);
+long4 __ovld __cnfn convert_long4_rtz(int4);
+long4 __ovld __cnfn convert_long4_sat_rtz(int4);
+long4 __ovld __cnfn convert_long4_rtp(int4);
+long4 __ovld __cnfn convert_long4_sat_rtp(int4);
+long4 __ovld __cnfn convert_long4_rtn(int4);
+long4 __ovld __cnfn convert_long4_sat_rtn(int4);
+long4 __ovld __cnfn convert_long4(int4);
+long4 __ovld __cnfn convert_long4_sat(int4);
+long4 __ovld __cnfn convert_long4_rte(uint4);
+long4 __ovld __cnfn convert_long4_sat_rte(uint4);
+long4 __ovld __cnfn convert_long4_rtz(uint4);
+long4 __ovld __cnfn convert_long4_sat_rtz(uint4);
+long4 __ovld __cnfn convert_long4_rtp(uint4);
+long4 __ovld __cnfn convert_long4_sat_rtp(uint4);
+long4 __ovld __cnfn convert_long4_rtn(uint4);
+long4 __ovld __cnfn convert_long4_sat_rtn(uint4);
+long4 __ovld __cnfn convert_long4(uint4);
+long4 __ovld __cnfn convert_long4_sat(uint4);
+long4 __ovld __cnfn convert_long4_rte(long4);
+long4 __ovld __cnfn convert_long4_sat_rte(long4);
+long4 __ovld __cnfn convert_long4_rtz(long4);
+long4 __ovld __cnfn convert_long4_sat_rtz(long4);
+long4 __ovld __cnfn convert_long4_rtp(long4);
+long4 __ovld __cnfn convert_long4_sat_rtp(long4);
+long4 __ovld __cnfn convert_long4_rtn(long4);
+long4 __ovld __cnfn convert_long4_sat_rtn(long4);
+long4 __ovld __cnfn convert_long4(long4);
+long4 __ovld __cnfn convert_long4_sat(long4);
+long4 __ovld __cnfn convert_long4_rte(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rte(ulong4);
+long4 __ovld __cnfn convert_long4_rtz(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rtz(ulong4);
+long4 __ovld __cnfn convert_long4_rtp(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rtp(ulong4);
+long4 __ovld __cnfn convert_long4_rtn(ulong4);
+long4 __ovld __cnfn convert_long4_sat_rtn(ulong4);
+long4 __ovld __cnfn convert_long4(ulong4);
+long4 __ovld __cnfn convert_long4_sat(ulong4);
+long4 __ovld __cnfn convert_long4_rte(float4);
+long4 __ovld __cnfn convert_long4_sat_rte(float4);
+long4 __ovld __cnfn convert_long4_rtz(float4);
+long4 __ovld __cnfn convert_long4_sat_rtz(float4);
+long4 __ovld __cnfn convert_long4_rtp(float4);
+long4 __ovld __cnfn convert_long4_sat_rtp(float4);
+long4 __ovld __cnfn convert_long4_rtn(float4);
+long4 __ovld __cnfn convert_long4_sat_rtn(float4);
+long4 __ovld __cnfn convert_long4(float4);
+long4 __ovld __cnfn convert_long4_sat(float4);
+ulong4 __ovld __cnfn convert_ulong4_rte(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(char4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(char4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(char4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(char4);
+ulong4 __ovld __cnfn convert_ulong4(char4);
+ulong4 __ovld __cnfn convert_ulong4_sat(char4);
+ulong4 __ovld __cnfn convert_ulong4_rte(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(uchar4);
+ulong4 __ovld __cnfn convert_ulong4(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_sat(uchar4);
+ulong4 __ovld __cnfn convert_ulong4_rte(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(short4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(short4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(short4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(short4);
+ulong4 __ovld __cnfn convert_ulong4(short4);
+ulong4 __ovld __cnfn convert_ulong4_sat(short4);
+ulong4 __ovld __cnfn convert_ulong4_rte(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(ushort4);
+ulong4 __ovld __cnfn convert_ulong4(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_sat(ushort4);
+ulong4 __ovld __cnfn convert_ulong4_rte(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(int4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(int4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(int4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(int4);
+ulong4 __ovld __cnfn convert_ulong4(int4);
+ulong4 __ovld __cnfn convert_ulong4_sat(int4);
+ulong4 __ovld __cnfn convert_ulong4_rte(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(uint4);
+ulong4 __ovld __cnfn convert_ulong4(uint4);
+ulong4 __ovld __cnfn convert_ulong4_sat(uint4);
+ulong4 __ovld __cnfn convert_ulong4_rte(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(long4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(long4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(long4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(long4);
+ulong4 __ovld __cnfn convert_ulong4(long4);
+ulong4 __ovld __cnfn convert_ulong4_sat(long4);
+ulong4 __ovld __cnfn convert_ulong4_rte(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(ulong4);
+ulong4 __ovld __cnfn convert_ulong4(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_sat(ulong4);
+ulong4 __ovld __cnfn convert_ulong4_rte(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(float4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(float4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(float4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(float4);
+ulong4 __ovld __cnfn convert_ulong4(float4);
+ulong4 __ovld __cnfn convert_ulong4_sat(float4);
+float4 __ovld __cnfn convert_float4_rte(char4);
+float4 __ovld __cnfn convert_float4_rtz(char4);
+float4 __ovld __cnfn convert_float4_rtp(char4);
+float4 __ovld __cnfn convert_float4_rtn(char4);
+float4 __ovld __cnfn convert_float4(char4);
+float4 __ovld __cnfn convert_float4_rte(uchar4);
+float4 __ovld __cnfn convert_float4_rtz(uchar4);
+float4 __ovld __cnfn convert_float4_rtp(uchar4);
+float4 __ovld __cnfn convert_float4_rtn(uchar4);
+float4 __ovld __cnfn convert_float4(uchar4);
+float4 __ovld __cnfn convert_float4_rte(short4);
+float4 __ovld __cnfn convert_float4_rtz(short4);
+float4 __ovld __cnfn convert_float4_rtp(short4);
+float4 __ovld __cnfn convert_float4_rtn(short4);
+float4 __ovld __cnfn convert_float4(short4);
+float4 __ovld __cnfn convert_float4_rte(ushort4);
+float4 __ovld __cnfn convert_float4_rtz(ushort4);
+float4 __ovld __cnfn convert_float4_rtp(ushort4);
+float4 __ovld __cnfn convert_float4_rtn(ushort4);
+float4 __ovld __cnfn convert_float4(ushort4);
+float4 __ovld __cnfn convert_float4_rte(int4);
+float4 __ovld __cnfn convert_float4_rtz(int4);
+float4 __ovld __cnfn convert_float4_rtp(int4);
+float4 __ovld __cnfn convert_float4_rtn(int4);
+float4 __ovld __cnfn convert_float4(int4);
+float4 __ovld __cnfn convert_float4_rte(uint4);
+float4 __ovld __cnfn convert_float4_rtz(uint4);
+float4 __ovld __cnfn convert_float4_rtp(uint4);
+float4 __ovld __cnfn convert_float4_rtn(uint4);
+float4 __ovld __cnfn convert_float4(uint4);
+float4 __ovld __cnfn convert_float4_rte(long4);
+float4 __ovld __cnfn convert_float4_rtz(long4);
+float4 __ovld __cnfn convert_float4_rtp(long4);
+float4 __ovld __cnfn convert_float4_rtn(long4);
+float4 __ovld __cnfn convert_float4(long4);
+float4 __ovld __cnfn convert_float4_rte(ulong4);
+float4 __ovld __cnfn convert_float4_rtz(ulong4);
+float4 __ovld __cnfn convert_float4_rtp(ulong4);
+float4 __ovld __cnfn convert_float4_rtn(ulong4);
+float4 __ovld __cnfn convert_float4(ulong4);
+float4 __ovld __cnfn convert_float4_rte(float4);
+float4 __ovld __cnfn convert_float4_rtz(float4);
+float4 __ovld __cnfn convert_float4_rtp(float4);
+float4 __ovld __cnfn convert_float4_rtn(float4);
+float4 __ovld __cnfn convert_float4(float4);
+char8 __ovld __cnfn convert_char8_rte(char8);
+char8 __ovld __cnfn convert_char8_sat_rte(char8);
+char8 __ovld __cnfn convert_char8_rtz(char8);
+char8 __ovld __cnfn convert_char8_sat_rtz(char8);
+char8 __ovld __cnfn convert_char8_rtp(char8);
+char8 __ovld __cnfn convert_char8_sat_rtp(char8);
+char8 __ovld __cnfn convert_char8_rtn(char8);
+char8 __ovld __cnfn convert_char8_sat_rtn(char8);
+char8 __ovld __cnfn convert_char8(char8);
+char8 __ovld __cnfn convert_char8_sat(char8);
+char8 __ovld __cnfn convert_char8_rte(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rte(uchar8);
+char8 __ovld __cnfn convert_char8_rtz(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rtz(uchar8);
+char8 __ovld __cnfn convert_char8_rtp(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rtp(uchar8);
+char8 __ovld __cnfn convert_char8_rtn(uchar8);
+char8 __ovld __cnfn convert_char8_sat_rtn(uchar8);
+char8 __ovld __cnfn convert_char8(uchar8);
+char8 __ovld __cnfn convert_char8_sat(uchar8);
+char8 __ovld __cnfn convert_char8_rte(short8);
+char8 __ovld __cnfn convert_char8_sat_rte(short8);
+char8 __ovld __cnfn convert_char8_rtz(short8);
+char8 __ovld __cnfn convert_char8_sat_rtz(short8);
+char8 __ovld __cnfn convert_char8_rtp(short8);
+char8 __ovld __cnfn convert_char8_sat_rtp(short8);
+char8 __ovld __cnfn convert_char8_rtn(short8);
+char8 __ovld __cnfn convert_char8_sat_rtn(short8);
+char8 __ovld __cnfn convert_char8(short8);
+char8 __ovld __cnfn convert_char8_sat(short8);
+char8 __ovld __cnfn convert_char8_rte(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rte(ushort8);
+char8 __ovld __cnfn convert_char8_rtz(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rtz(ushort8);
+char8 __ovld __cnfn convert_char8_rtp(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rtp(ushort8);
+char8 __ovld __cnfn convert_char8_rtn(ushort8);
+char8 __ovld __cnfn convert_char8_sat_rtn(ushort8);
+char8 __ovld __cnfn convert_char8(ushort8);
+char8 __ovld __cnfn convert_char8_sat(ushort8);
+char8 __ovld __cnfn convert_char8_rte(int8);
+char8 __ovld __cnfn convert_char8_sat_rte(int8);
+char8 __ovld __cnfn convert_char8_rtz(int8);
+char8 __ovld __cnfn convert_char8_sat_rtz(int8);
+char8 __ovld __cnfn convert_char8_rtp(int8);
+char8 __ovld __cnfn convert_char8_sat_rtp(int8);
+char8 __ovld __cnfn convert_char8_rtn(int8);
+char8 __ovld __cnfn convert_char8_sat_rtn(int8);
+char8 __ovld __cnfn convert_char8(int8);
+char8 __ovld __cnfn convert_char8_sat(int8);
+char8 __ovld __cnfn convert_char8_rte(uint8);
+char8 __ovld __cnfn convert_char8_sat_rte(uint8);
+char8 __ovld __cnfn convert_char8_rtz(uint8);
+char8 __ovld __cnfn convert_char8_sat_rtz(uint8);
+char8 __ovld __cnfn convert_char8_rtp(uint8);
+char8 __ovld __cnfn convert_char8_sat_rtp(uint8);
+char8 __ovld __cnfn convert_char8_rtn(uint8);
+char8 __ovld __cnfn convert_char8_sat_rtn(uint8);
+char8 __ovld __cnfn convert_char8(uint8);
+char8 __ovld __cnfn convert_char8_sat(uint8);
+char8 __ovld __cnfn convert_char8_rte(long8);
+char8 __ovld __cnfn convert_char8_sat_rte(long8);
+char8 __ovld __cnfn convert_char8_rtz(long8);
+char8 __ovld __cnfn convert_char8_sat_rtz(long8);
+char8 __ovld __cnfn convert_char8_rtp(long8);
+char8 __ovld __cnfn convert_char8_sat_rtp(long8);
+char8 __ovld __cnfn convert_char8_rtn(long8);
+char8 __ovld __cnfn convert_char8_sat_rtn(long8);
+char8 __ovld __cnfn convert_char8(long8);
+char8 __ovld __cnfn convert_char8_sat(long8);
+char8 __ovld __cnfn convert_char8_rte(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rte(ulong8);
+char8 __ovld __cnfn convert_char8_rtz(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rtz(ulong8);
+char8 __ovld __cnfn convert_char8_rtp(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rtp(ulong8);
+char8 __ovld __cnfn convert_char8_rtn(ulong8);
+char8 __ovld __cnfn convert_char8_sat_rtn(ulong8);
+char8 __ovld __cnfn convert_char8(ulong8);
+char8 __ovld __cnfn convert_char8_sat(ulong8);
+char8 __ovld __cnfn convert_char8_rte(float8);
+char8 __ovld __cnfn convert_char8_sat_rte(float8);
+char8 __ovld __cnfn convert_char8_rtz(float8);
+char8 __ovld __cnfn convert_char8_sat_rtz(float8);
+char8 __ovld __cnfn convert_char8_rtp(float8);
+char8 __ovld __cnfn convert_char8_sat_rtp(float8);
+char8 __ovld __cnfn convert_char8_rtn(float8);
+char8 __ovld __cnfn convert_char8_sat_rtn(float8);
+char8 __ovld __cnfn convert_char8(float8);
+char8 __ovld __cnfn convert_char8_sat(float8);
+uchar8 __ovld __cnfn convert_uchar8_rte(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(char8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(char8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(char8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(char8);
+uchar8 __ovld __cnfn convert_uchar8(char8);
+uchar8 __ovld __cnfn convert_uchar8_sat(char8);
+uchar8 __ovld __cnfn convert_uchar8_rte(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(uchar8);
+uchar8 __ovld __cnfn convert_uchar8(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_sat(uchar8);
+uchar8 __ovld __cnfn convert_uchar8_rte(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(short8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(short8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(short8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(short8);
+uchar8 __ovld __cnfn convert_uchar8(short8);
+uchar8 __ovld __cnfn convert_uchar8_sat(short8);
+uchar8 __ovld __cnfn convert_uchar8_rte(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(ushort8);
+uchar8 __ovld __cnfn convert_uchar8(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_sat(ushort8);
+uchar8 __ovld __cnfn convert_uchar8_rte(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(int8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(int8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(int8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(int8);
+uchar8 __ovld __cnfn convert_uchar8(int8);
+uchar8 __ovld __cnfn convert_uchar8_sat(int8);
+uchar8 __ovld __cnfn convert_uchar8_rte(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(uint8);
+uchar8 __ovld __cnfn convert_uchar8(uint8);
+uchar8 __ovld __cnfn convert_uchar8_sat(uint8);
+uchar8 __ovld __cnfn convert_uchar8_rte(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(long8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(long8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(long8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(long8);
+uchar8 __ovld __cnfn convert_uchar8(long8);
+uchar8 __ovld __cnfn convert_uchar8_sat(long8);
+uchar8 __ovld __cnfn convert_uchar8_rte(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(ulong8);
+uchar8 __ovld __cnfn convert_uchar8(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_sat(ulong8);
+uchar8 __ovld __cnfn convert_uchar8_rte(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(float8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(float8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(float8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(float8);
+uchar8 __ovld __cnfn convert_uchar8(float8);
+uchar8 __ovld __cnfn convert_uchar8_sat(float8);
+short8 __ovld __cnfn convert_short8_rte(char8);
+short8 __ovld __cnfn convert_short8_sat_rte(char8);
+short8 __ovld __cnfn convert_short8_rtz(char8);
+short8 __ovld __cnfn convert_short8_sat_rtz(char8);
+short8 __ovld __cnfn convert_short8_rtp(char8);
+short8 __ovld __cnfn convert_short8_sat_rtp(char8);
+short8 __ovld __cnfn convert_short8_rtn(char8);
+short8 __ovld __cnfn convert_short8_sat_rtn(char8);
+short8 __ovld __cnfn convert_short8(char8);
+short8 __ovld __cnfn convert_short8_sat(char8);
+short8 __ovld __cnfn convert_short8_rte(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rte(uchar8);
+short8 __ovld __cnfn convert_short8_rtz(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rtz(uchar8);
+short8 __ovld __cnfn convert_short8_rtp(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rtp(uchar8);
+short8 __ovld __cnfn convert_short8_rtn(uchar8);
+short8 __ovld __cnfn convert_short8_sat_rtn(uchar8);
+short8 __ovld __cnfn convert_short8(uchar8);
+short8 __ovld __cnfn convert_short8_sat(uchar8);
+short8 __ovld __cnfn convert_short8_rte(short8);
+short8 __ovld __cnfn convert_short8_sat_rte(short8);
+short8 __ovld __cnfn convert_short8_rtz(short8);
+short8 __ovld __cnfn convert_short8_sat_rtz(short8);
+short8 __ovld __cnfn convert_short8_rtp(short8);
+short8 __ovld __cnfn convert_short8_sat_rtp(short8);
+short8 __ovld __cnfn convert_short8_rtn(short8);
+short8 __ovld __cnfn convert_short8_sat_rtn(short8);
+short8 __ovld __cnfn convert_short8(short8);
+short8 __ovld __cnfn convert_short8_sat(short8);
+short8 __ovld __cnfn convert_short8_rte(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rte(ushort8);
+short8 __ovld __cnfn convert_short8_rtz(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rtz(ushort8);
+short8 __ovld __cnfn convert_short8_rtp(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rtp(ushort8);
+short8 __ovld __cnfn convert_short8_rtn(ushort8);
+short8 __ovld __cnfn convert_short8_sat_rtn(ushort8);
+short8 __ovld __cnfn convert_short8(ushort8);
+short8 __ovld __cnfn convert_short8_sat(ushort8);
+short8 __ovld __cnfn convert_short8_rte(int8);
+short8 __ovld __cnfn convert_short8_sat_rte(int8);
+short8 __ovld __cnfn convert_short8_rtz(int8);
+short8 __ovld __cnfn convert_short8_sat_rtz(int8);
+short8 __ovld __cnfn convert_short8_rtp(int8);
+short8 __ovld __cnfn convert_short8_sat_rtp(int8);
+short8 __ovld __cnfn convert_short8_rtn(int8);
+short8 __ovld __cnfn convert_short8_sat_rtn(int8);
+short8 __ovld __cnfn convert_short8(int8);
+short8 __ovld __cnfn convert_short8_sat(int8);
+short8 __ovld __cnfn convert_short8_rte(uint8);
+short8 __ovld __cnfn convert_short8_sat_rte(uint8);
+short8 __ovld __cnfn convert_short8_rtz(uint8);
+short8 __ovld __cnfn convert_short8_sat_rtz(uint8);
+short8 __ovld __cnfn convert_short8_rtp(uint8);
+short8 __ovld __cnfn convert_short8_sat_rtp(uint8);
+short8 __ovld __cnfn convert_short8_rtn(uint8);
+short8 __ovld __cnfn convert_short8_sat_rtn(uint8);
+short8 __ovld __cnfn convert_short8(uint8);
+short8 __ovld __cnfn convert_short8_sat(uint8);
+short8 __ovld __cnfn convert_short8_rte(long8);
+short8 __ovld __cnfn convert_short8_sat_rte(long8);
+short8 __ovld __cnfn convert_short8_rtz(long8);
+short8 __ovld __cnfn convert_short8_sat_rtz(long8);
+short8 __ovld __cnfn convert_short8_rtp(long8);
+short8 __ovld __cnfn convert_short8_sat_rtp(long8);
+short8 __ovld __cnfn convert_short8_rtn(long8);
+short8 __ovld __cnfn convert_short8_sat_rtn(long8);
+short8 __ovld __cnfn convert_short8(long8);
+short8 __ovld __cnfn convert_short8_sat(long8);
+short8 __ovld __cnfn convert_short8_rte(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rte(ulong8);
+short8 __ovld __cnfn convert_short8_rtz(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rtz(ulong8);
+short8 __ovld __cnfn convert_short8_rtp(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rtp(ulong8);
+short8 __ovld __cnfn convert_short8_rtn(ulong8);
+short8 __ovld __cnfn convert_short8_sat_rtn(ulong8);
+short8 __ovld __cnfn convert_short8(ulong8);
+short8 __ovld __cnfn convert_short8_sat(ulong8);
+short8 __ovld __cnfn convert_short8_rte(float8);
+short8 __ovld __cnfn convert_short8_sat_rte(float8);
+short8 __ovld __cnfn convert_short8_rtz(float8);
+short8 __ovld __cnfn convert_short8_sat_rtz(float8);
+short8 __ovld __cnfn convert_short8_rtp(float8);
+short8 __ovld __cnfn convert_short8_sat_rtp(float8);
+short8 __ovld __cnfn convert_short8_rtn(float8);
+short8 __ovld __cnfn convert_short8_sat_rtn(float8);
+short8 __ovld __cnfn convert_short8(float8);
+short8 __ovld __cnfn convert_short8_sat(float8);
+ushort8 __ovld __cnfn convert_ushort8_rte(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(char8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(char8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(char8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(char8);
+ushort8 __ovld __cnfn convert_ushort8(char8);
+ushort8 __ovld __cnfn convert_ushort8_sat(char8);
+ushort8 __ovld __cnfn convert_ushort8_rte(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(uchar8);
+ushort8 __ovld __cnfn convert_ushort8(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_sat(uchar8);
+ushort8 __ovld __cnfn convert_ushort8_rte(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(short8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(short8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(short8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(short8);
+ushort8 __ovld __cnfn convert_ushort8(short8);
+ushort8 __ovld __cnfn convert_ushort8_sat(short8);
+ushort8 __ovld __cnfn convert_ushort8_rte(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(ushort8);
+ushort8 __ovld __cnfn convert_ushort8(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_sat(ushort8);
+ushort8 __ovld __cnfn convert_ushort8_rte(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(int8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(int8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(int8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(int8);
+ushort8 __ovld __cnfn convert_ushort8(int8);
+ushort8 __ovld __cnfn convert_ushort8_sat(int8);
+ushort8 __ovld __cnfn convert_ushort8_rte(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(uint8);
+ushort8 __ovld __cnfn convert_ushort8(uint8);
+ushort8 __ovld __cnfn convert_ushort8_sat(uint8);
+ushort8 __ovld __cnfn convert_ushort8_rte(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(long8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(long8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(long8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(long8);
+ushort8 __ovld __cnfn convert_ushort8(long8);
+ushort8 __ovld __cnfn convert_ushort8_sat(long8);
+ushort8 __ovld __cnfn convert_ushort8_rte(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(ulong8);
+ushort8 __ovld __cnfn convert_ushort8(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_sat(ulong8);
+ushort8 __ovld __cnfn convert_ushort8_rte(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(float8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(float8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(float8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(float8);
+ushort8 __ovld __cnfn convert_ushort8(float8);
+ushort8 __ovld __cnfn convert_ushort8_sat(float8);
+int8 __ovld __cnfn convert_int8_rte(char8);
+int8 __ovld __cnfn convert_int8_sat_rte(char8);
+int8 __ovld __cnfn convert_int8_rtz(char8);
+int8 __ovld __cnfn convert_int8_sat_rtz(char8);
+int8 __ovld __cnfn convert_int8_rtp(char8);
+int8 __ovld __cnfn convert_int8_sat_rtp(char8);
+int8 __ovld __cnfn convert_int8_rtn(char8);
+int8 __ovld __cnfn convert_int8_sat_rtn(char8);
+int8 __ovld __cnfn convert_int8(char8);
+int8 __ovld __cnfn convert_int8_sat(char8);
+int8 __ovld __cnfn convert_int8_rte(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rte(uchar8);
+int8 __ovld __cnfn convert_int8_rtz(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rtz(uchar8);
+int8 __ovld __cnfn convert_int8_rtp(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rtp(uchar8);
+int8 __ovld __cnfn convert_int8_rtn(uchar8);
+int8 __ovld __cnfn convert_int8_sat_rtn(uchar8);
+int8 __ovld __cnfn convert_int8(uchar8);
+int8 __ovld __cnfn convert_int8_sat(uchar8);
+int8 __ovld __cnfn convert_int8_rte(short8);
+int8 __ovld __cnfn convert_int8_sat_rte(short8);
+int8 __ovld __cnfn convert_int8_rtz(short8);
+int8 __ovld __cnfn convert_int8_sat_rtz(short8);
+int8 __ovld __cnfn convert_int8_rtp(short8);
+int8 __ovld __cnfn convert_int8_sat_rtp(short8);
+int8 __ovld __cnfn convert_int8_rtn(short8);
+int8 __ovld __cnfn convert_int8_sat_rtn(short8);
+int8 __ovld __cnfn convert_int8(short8);
+int8 __ovld __cnfn convert_int8_sat(short8);
+int8 __ovld __cnfn convert_int8_rte(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rte(ushort8);
+int8 __ovld __cnfn convert_int8_rtz(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rtz(ushort8);
+int8 __ovld __cnfn convert_int8_rtp(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rtp(ushort8);
+int8 __ovld __cnfn convert_int8_rtn(ushort8);
+int8 __ovld __cnfn convert_int8_sat_rtn(ushort8);
+int8 __ovld __cnfn convert_int8(ushort8);
+int8 __ovld __cnfn convert_int8_sat(ushort8);
+int8 __ovld __cnfn convert_int8_rte(int8);
+int8 __ovld __cnfn convert_int8_sat_rte(int8);
+int8 __ovld __cnfn convert_int8_rtz(int8);
+int8 __ovld __cnfn convert_int8_sat_rtz(int8);
+int8 __ovld __cnfn convert_int8_rtp(int8);
+int8 __ovld __cnfn convert_int8_sat_rtp(int8);
+int8 __ovld __cnfn convert_int8_rtn(int8);
+int8 __ovld __cnfn convert_int8_sat_rtn(int8);
+int8 __ovld __cnfn convert_int8(int8);
+int8 __ovld __cnfn convert_int8_sat(int8);
+int8 __ovld __cnfn convert_int8_rte(uint8);
+int8 __ovld __cnfn convert_int8_sat_rte(uint8);
+int8 __ovld __cnfn convert_int8_rtz(uint8);
+int8 __ovld __cnfn convert_int8_sat_rtz(uint8);
+int8 __ovld __cnfn convert_int8_rtp(uint8);
+int8 __ovld __cnfn convert_int8_sat_rtp(uint8);
+int8 __ovld __cnfn convert_int8_rtn(uint8);
+int8 __ovld __cnfn convert_int8_sat_rtn(uint8);
+int8 __ovld __cnfn convert_int8(uint8);
+int8 __ovld __cnfn convert_int8_sat(uint8);
+int8 __ovld __cnfn convert_int8_rte(long8);
+int8 __ovld __cnfn convert_int8_sat_rte(long8);
+int8 __ovld __cnfn convert_int8_rtz(long8);
+int8 __ovld __cnfn convert_int8_sat_rtz(long8);
+int8 __ovld __cnfn convert_int8_rtp(long8);
+int8 __ovld __cnfn convert_int8_sat_rtp(long8);
+int8 __ovld __cnfn convert_int8_rtn(long8);
+int8 __ovld __cnfn convert_int8_sat_rtn(long8);
+int8 __ovld __cnfn convert_int8(long8);
+int8 __ovld __cnfn convert_int8_sat(long8);
+int8 __ovld __cnfn convert_int8_rte(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rte(ulong8);
+int8 __ovld __cnfn convert_int8_rtz(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rtz(ulong8);
+int8 __ovld __cnfn convert_int8_rtp(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rtp(ulong8);
+int8 __ovld __cnfn convert_int8_rtn(ulong8);
+int8 __ovld __cnfn convert_int8_sat_rtn(ulong8);
+int8 __ovld __cnfn convert_int8(ulong8);
+int8 __ovld __cnfn convert_int8_sat(ulong8);
+int8 __ovld __cnfn convert_int8_rte(float8);
+int8 __ovld __cnfn convert_int8_sat_rte(float8);
+int8 __ovld __cnfn convert_int8_rtz(float8);
+int8 __ovld __cnfn convert_int8_sat_rtz(float8);
+int8 __ovld __cnfn convert_int8_rtp(float8);
+int8 __ovld __cnfn convert_int8_sat_rtp(float8);
+int8 __ovld __cnfn convert_int8_rtn(float8);
+int8 __ovld __cnfn convert_int8_sat_rtn(float8);
+int8 __ovld __cnfn convert_int8(float8);
+int8 __ovld __cnfn convert_int8_sat(float8);
+uint8 __ovld __cnfn convert_uint8_rte(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(char8);
+uint8 __ovld __cnfn convert_uint8_rtz(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(char8);
+uint8 __ovld __cnfn convert_uint8_rtp(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(char8);
+uint8 __ovld __cnfn convert_uint8_rtn(char8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(char8);
+uint8 __ovld __cnfn convert_uint8(char8);
+uint8 __ovld __cnfn convert_uint8_sat(char8);
+uint8 __ovld __cnfn convert_uint8_rte(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(uchar8);
+uint8 __ovld __cnfn convert_uint8_rtz(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(uchar8);
+uint8 __ovld __cnfn convert_uint8_rtp(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(uchar8);
+uint8 __ovld __cnfn convert_uint8_rtn(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(uchar8);
+uint8 __ovld __cnfn convert_uint8(uchar8);
+uint8 __ovld __cnfn convert_uint8_sat(uchar8);
+uint8 __ovld __cnfn convert_uint8_rte(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(short8);
+uint8 __ovld __cnfn convert_uint8_rtz(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(short8);
+uint8 __ovld __cnfn convert_uint8_rtp(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(short8);
+uint8 __ovld __cnfn convert_uint8_rtn(short8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(short8);
+uint8 __ovld __cnfn convert_uint8(short8);
+uint8 __ovld __cnfn convert_uint8_sat(short8);
+uint8 __ovld __cnfn convert_uint8_rte(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(ushort8);
+uint8 __ovld __cnfn convert_uint8_rtz(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(ushort8);
+uint8 __ovld __cnfn convert_uint8_rtp(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(ushort8);
+uint8 __ovld __cnfn convert_uint8_rtn(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(ushort8);
+uint8 __ovld __cnfn convert_uint8(ushort8);
+uint8 __ovld __cnfn convert_uint8_sat(ushort8);
+uint8 __ovld __cnfn convert_uint8_rte(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(int8);
+uint8 __ovld __cnfn convert_uint8_rtz(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(int8);
+uint8 __ovld __cnfn convert_uint8_rtp(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(int8);
+uint8 __ovld __cnfn convert_uint8_rtn(int8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(int8);
+uint8 __ovld __cnfn convert_uint8(int8);
+uint8 __ovld __cnfn convert_uint8_sat(int8);
+uint8 __ovld __cnfn convert_uint8_rte(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(uint8);
+uint8 __ovld __cnfn convert_uint8_rtz(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(uint8);
+uint8 __ovld __cnfn convert_uint8_rtp(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(uint8);
+uint8 __ovld __cnfn convert_uint8_rtn(uint8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(uint8);
+uint8 __ovld __cnfn convert_uint8(uint8);
+uint8 __ovld __cnfn convert_uint8_sat(uint8);
+uint8 __ovld __cnfn convert_uint8_rte(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(long8);
+uint8 __ovld __cnfn convert_uint8_rtz(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(long8);
+uint8 __ovld __cnfn convert_uint8_rtp(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(long8);
+uint8 __ovld __cnfn convert_uint8_rtn(long8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(long8);
+uint8 __ovld __cnfn convert_uint8(long8);
+uint8 __ovld __cnfn convert_uint8_sat(long8);
+uint8 __ovld __cnfn convert_uint8_rte(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(ulong8);
+uint8 __ovld __cnfn convert_uint8_rtz(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(ulong8);
+uint8 __ovld __cnfn convert_uint8_rtp(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(ulong8);
+uint8 __ovld __cnfn convert_uint8_rtn(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(ulong8);
+uint8 __ovld __cnfn convert_uint8(ulong8);
+uint8 __ovld __cnfn convert_uint8_sat(ulong8);
+uint8 __ovld __cnfn convert_uint8_rte(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(float8);
+uint8 __ovld __cnfn convert_uint8_rtz(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(float8);
+uint8 __ovld __cnfn convert_uint8_rtp(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(float8);
+uint8 __ovld __cnfn convert_uint8_rtn(float8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(float8);
+uint8 __ovld __cnfn convert_uint8(float8);
+uint8 __ovld __cnfn convert_uint8_sat(float8);
+long8 __ovld __cnfn convert_long8_rte(char8);
+long8 __ovld __cnfn convert_long8_sat_rte(char8);
+long8 __ovld __cnfn convert_long8_rtz(char8);
+long8 __ovld __cnfn convert_long8_sat_rtz(char8);
+long8 __ovld __cnfn convert_long8_rtp(char8);
+long8 __ovld __cnfn convert_long8_sat_rtp(char8);
+long8 __ovld __cnfn convert_long8_rtn(char8);
+long8 __ovld __cnfn convert_long8_sat_rtn(char8);
+long8 __ovld __cnfn convert_long8(char8);
+long8 __ovld __cnfn convert_long8_sat(char8);
+long8 __ovld __cnfn convert_long8_rte(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rte(uchar8);
+long8 __ovld __cnfn convert_long8_rtz(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rtz(uchar8);
+long8 __ovld __cnfn convert_long8_rtp(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rtp(uchar8);
+long8 __ovld __cnfn convert_long8_rtn(uchar8);
+long8 __ovld __cnfn convert_long8_sat_rtn(uchar8);
+long8 __ovld __cnfn convert_long8(uchar8);
+long8 __ovld __cnfn convert_long8_sat(uchar8);
+long8 __ovld __cnfn convert_long8_rte(short8);
+long8 __ovld __cnfn convert_long8_sat_rte(short8);
+long8 __ovld __cnfn convert_long8_rtz(short8);
+long8 __ovld __cnfn convert_long8_sat_rtz(short8);
+long8 __ovld __cnfn convert_long8_rtp(short8);
+long8 __ovld __cnfn convert_long8_sat_rtp(short8);
+long8 __ovld __cnfn convert_long8_rtn(short8);
+long8 __ovld __cnfn convert_long8_sat_rtn(short8);
+long8 __ovld __cnfn convert_long8(short8);
+long8 __ovld __cnfn convert_long8_sat(short8);
+long8 __ovld __cnfn convert_long8_rte(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rte(ushort8);
+long8 __ovld __cnfn convert_long8_rtz(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rtz(ushort8);
+long8 __ovld __cnfn convert_long8_rtp(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rtp(ushort8);
+long8 __ovld __cnfn convert_long8_rtn(ushort8);
+long8 __ovld __cnfn convert_long8_sat_rtn(ushort8);
+long8 __ovld __cnfn convert_long8(ushort8);
+long8 __ovld __cnfn convert_long8_sat(ushort8);
+long8 __ovld __cnfn convert_long8_rte(int8);
+long8 __ovld __cnfn convert_long8_sat_rte(int8);
+long8 __ovld __cnfn convert_long8_rtz(int8);
+long8 __ovld __cnfn convert_long8_sat_rtz(int8);
+long8 __ovld __cnfn convert_long8_rtp(int8);
+long8 __ovld __cnfn convert_long8_sat_rtp(int8);
+long8 __ovld __cnfn convert_long8_rtn(int8);
+long8 __ovld __cnfn convert_long8_sat_rtn(int8);
+long8 __ovld __cnfn convert_long8(int8);
+long8 __ovld __cnfn convert_long8_sat(int8);
+long8 __ovld __cnfn convert_long8_rte(uint8);
+long8 __ovld __cnfn convert_long8_sat_rte(uint8);
+long8 __ovld __cnfn convert_long8_rtz(uint8);
+long8 __ovld __cnfn convert_long8_sat_rtz(uint8);
+long8 __ovld __cnfn convert_long8_rtp(uint8);
+long8 __ovld __cnfn convert_long8_sat_rtp(uint8);
+long8 __ovld __cnfn convert_long8_rtn(uint8);
+long8 __ovld __cnfn convert_long8_sat_rtn(uint8);
+long8 __ovld __cnfn convert_long8(uint8);
+long8 __ovld __cnfn convert_long8_sat(uint8);
+long8 __ovld __cnfn convert_long8_rte(long8);
+long8 __ovld __cnfn convert_long8_sat_rte(long8);
+long8 __ovld __cnfn convert_long8_rtz(long8);
+long8 __ovld __cnfn convert_long8_sat_rtz(long8);
+long8 __ovld __cnfn convert_long8_rtp(long8);
+long8 __ovld __cnfn convert_long8_sat_rtp(long8);
+long8 __ovld __cnfn convert_long8_rtn(long8);
+long8 __ovld __cnfn convert_long8_sat_rtn(long8);
+long8 __ovld __cnfn convert_long8(long8);
+long8 __ovld __cnfn convert_long8_sat(long8);
+long8 __ovld __cnfn convert_long8_rte(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rte(ulong8);
+long8 __ovld __cnfn convert_long8_rtz(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rtz(ulong8);
+long8 __ovld __cnfn convert_long8_rtp(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rtp(ulong8);
+long8 __ovld __cnfn convert_long8_rtn(ulong8);
+long8 __ovld __cnfn convert_long8_sat_rtn(ulong8);
+long8 __ovld __cnfn convert_long8(ulong8);
+long8 __ovld __cnfn convert_long8_sat(ulong8);
+long8 __ovld __cnfn convert_long8_rte(float8);
+long8 __ovld __cnfn convert_long8_sat_rte(float8);
+long8 __ovld __cnfn convert_long8_rtz(float8);
+long8 __ovld __cnfn convert_long8_sat_rtz(float8);
+long8 __ovld __cnfn convert_long8_rtp(float8);
+long8 __ovld __cnfn convert_long8_sat_rtp(float8);
+long8 __ovld __cnfn convert_long8_rtn(float8);
+long8 __ovld __cnfn convert_long8_sat_rtn(float8);
+long8 __ovld __cnfn convert_long8(float8);
+long8 __ovld __cnfn convert_long8_sat(float8);
+ulong8 __ovld __cnfn convert_ulong8_rte(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(char8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(char8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(char8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(char8);
+ulong8 __ovld __cnfn convert_ulong8(char8);
+ulong8 __ovld __cnfn convert_ulong8_sat(char8);
+ulong8 __ovld __cnfn convert_ulong8_rte(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(uchar8);
+ulong8 __ovld __cnfn convert_ulong8(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_sat(uchar8);
+ulong8 __ovld __cnfn convert_ulong8_rte(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(short8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(short8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(short8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(short8);
+ulong8 __ovld __cnfn convert_ulong8(short8);
+ulong8 __ovld __cnfn convert_ulong8_sat(short8);
+ulong8 __ovld __cnfn convert_ulong8_rte(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(ushort8);
+ulong8 __ovld __cnfn convert_ulong8(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_sat(ushort8);
+ulong8 __ovld __cnfn convert_ulong8_rte(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(int8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(int8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(int8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(int8);
+ulong8 __ovld __cnfn convert_ulong8(int8);
+ulong8 __ovld __cnfn convert_ulong8_sat(int8);
+ulong8 __ovld __cnfn convert_ulong8_rte(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(uint8);
+ulong8 __ovld __cnfn convert_ulong8(uint8);
+ulong8 __ovld __cnfn convert_ulong8_sat(uint8);
+ulong8 __ovld __cnfn convert_ulong8_rte(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(long8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(long8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(long8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(long8);
+ulong8 __ovld __cnfn convert_ulong8(long8);
+ulong8 __ovld __cnfn convert_ulong8_sat(long8);
+ulong8 __ovld __cnfn convert_ulong8_rte(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(ulong8);
+ulong8 __ovld __cnfn convert_ulong8(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_sat(ulong8);
+ulong8 __ovld __cnfn convert_ulong8_rte(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(float8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(float8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(float8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(float8);
+ulong8 __ovld __cnfn convert_ulong8(float8);
+ulong8 __ovld __cnfn convert_ulong8_sat(float8);
+float8 __ovld __cnfn convert_float8_rte(char8);
+float8 __ovld __cnfn convert_float8_rtz(char8);
+float8 __ovld __cnfn convert_float8_rtp(char8);
+float8 __ovld __cnfn convert_float8_rtn(char8);
+float8 __ovld __cnfn convert_float8(char8);
+float8 __ovld __cnfn convert_float8_rte(uchar8);
+float8 __ovld __cnfn convert_float8_rtz(uchar8);
+float8 __ovld __cnfn convert_float8_rtp(uchar8);
+float8 __ovld __cnfn convert_float8_rtn(uchar8);
+float8 __ovld __cnfn convert_float8(uchar8);
+float8 __ovld __cnfn convert_float8_rte(short8);
+float8 __ovld __cnfn convert_float8_rtz(short8);
+float8 __ovld __cnfn convert_float8_rtp(short8);
+float8 __ovld __cnfn convert_float8_rtn(short8);
+float8 __ovld __cnfn convert_float8(short8);
+float8 __ovld __cnfn convert_float8_rte(ushort8);
+float8 __ovld __cnfn convert_float8_rtz(ushort8);
+float8 __ovld __cnfn convert_float8_rtp(ushort8);
+float8 __ovld __cnfn convert_float8_rtn(ushort8);
+float8 __ovld __cnfn convert_float8(ushort8);
+float8 __ovld __cnfn convert_float8_rte(int8);
+float8 __ovld __cnfn convert_float8_rtz(int8);
+float8 __ovld __cnfn convert_float8_rtp(int8);
+float8 __ovld __cnfn convert_float8_rtn(int8);
+float8 __ovld __cnfn convert_float8(int8);
+float8 __ovld __cnfn convert_float8_rte(uint8);
+float8 __ovld __cnfn convert_float8_rtz(uint8);
+float8 __ovld __cnfn convert_float8_rtp(uint8);
+float8 __ovld __cnfn convert_float8_rtn(uint8);
+float8 __ovld __cnfn convert_float8(uint8);
+float8 __ovld __cnfn convert_float8_rte(long8);
+float8 __ovld __cnfn convert_float8_rtz(long8);
+float8 __ovld __cnfn convert_float8_rtp(long8);
+float8 __ovld __cnfn convert_float8_rtn(long8);
+float8 __ovld __cnfn convert_float8(long8);
+float8 __ovld __cnfn convert_float8_rte(ulong8);
+float8 __ovld __cnfn convert_float8_rtz(ulong8);
+float8 __ovld __cnfn convert_float8_rtp(ulong8);
+float8 __ovld __cnfn convert_float8_rtn(ulong8);
+float8 __ovld __cnfn convert_float8(ulong8);
+float8 __ovld __cnfn convert_float8_rte(float8);
+float8 __ovld __cnfn convert_float8_rtz(float8);
+float8 __ovld __cnfn convert_float8_rtp(float8);
+float8 __ovld __cnfn convert_float8_rtn(float8);
+float8 __ovld __cnfn convert_float8(float8);
+char16 __ovld __cnfn convert_char16_rte(char16);
+char16 __ovld __cnfn convert_char16_sat_rte(char16);
+char16 __ovld __cnfn convert_char16_rtz(char16);
+char16 __ovld __cnfn convert_char16_sat_rtz(char16);
+char16 __ovld __cnfn convert_char16_rtp(char16);
+char16 __ovld __cnfn convert_char16_sat_rtp(char16);
+char16 __ovld __cnfn convert_char16_rtn(char16);
+char16 __ovld __cnfn convert_char16_sat_rtn(char16);
+char16 __ovld __cnfn convert_char16(char16);
+char16 __ovld __cnfn convert_char16_sat(char16);
+char16 __ovld __cnfn convert_char16_rte(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rte(uchar16);
+char16 __ovld __cnfn convert_char16_rtz(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rtz(uchar16);
+char16 __ovld __cnfn convert_char16_rtp(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rtp(uchar16);
+char16 __ovld __cnfn convert_char16_rtn(uchar16);
+char16 __ovld __cnfn convert_char16_sat_rtn(uchar16);
+char16 __ovld __cnfn convert_char16(uchar16);
+char16 __ovld __cnfn convert_char16_sat(uchar16);
+char16 __ovld __cnfn convert_char16_rte(short16);
+char16 __ovld __cnfn convert_char16_sat_rte(short16);
+char16 __ovld __cnfn convert_char16_rtz(short16);
+char16 __ovld __cnfn convert_char16_sat_rtz(short16);
+char16 __ovld __cnfn convert_char16_rtp(short16);
+char16 __ovld __cnfn convert_char16_sat_rtp(short16);
+char16 __ovld __cnfn convert_char16_rtn(short16);
+char16 __ovld __cnfn convert_char16_sat_rtn(short16);
+char16 __ovld __cnfn convert_char16(short16);
+char16 __ovld __cnfn convert_char16_sat(short16);
+char16 __ovld __cnfn convert_char16_rte(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rte(ushort16);
+char16 __ovld __cnfn convert_char16_rtz(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rtz(ushort16);
+char16 __ovld __cnfn convert_char16_rtp(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rtp(ushort16);
+char16 __ovld __cnfn convert_char16_rtn(ushort16);
+char16 __ovld __cnfn convert_char16_sat_rtn(ushort16);
+char16 __ovld __cnfn convert_char16(ushort16);
+char16 __ovld __cnfn convert_char16_sat(ushort16);
+char16 __ovld __cnfn convert_char16_rte(int16);
+char16 __ovld __cnfn convert_char16_sat_rte(int16);
+char16 __ovld __cnfn convert_char16_rtz(int16);
+char16 __ovld __cnfn convert_char16_sat_rtz(int16);
+char16 __ovld __cnfn convert_char16_rtp(int16);
+char16 __ovld __cnfn convert_char16_sat_rtp(int16);
+char16 __ovld __cnfn convert_char16_rtn(int16);
+char16 __ovld __cnfn convert_char16_sat_rtn(int16);
+char16 __ovld __cnfn convert_char16(int16);
+char16 __ovld __cnfn convert_char16_sat(int16);
+char16 __ovld __cnfn convert_char16_rte(uint16);
+char16 __ovld __cnfn convert_char16_sat_rte(uint16);
+char16 __ovld __cnfn convert_char16_rtz(uint16);
+char16 __ovld __cnfn convert_char16_sat_rtz(uint16);
+char16 __ovld __cnfn convert_char16_rtp(uint16);
+char16 __ovld __cnfn convert_char16_sat_rtp(uint16);
+char16 __ovld __cnfn convert_char16_rtn(uint16);
+char16 __ovld __cnfn convert_char16_sat_rtn(uint16);
+char16 __ovld __cnfn convert_char16(uint16);
+char16 __ovld __cnfn convert_char16_sat(uint16);
+char16 __ovld __cnfn convert_char16_rte(long16);
+char16 __ovld __cnfn convert_char16_sat_rte(long16);
+char16 __ovld __cnfn convert_char16_rtz(long16);
+char16 __ovld __cnfn convert_char16_sat_rtz(long16);
+char16 __ovld __cnfn convert_char16_rtp(long16);
+char16 __ovld __cnfn convert_char16_sat_rtp(long16);
+char16 __ovld __cnfn convert_char16_rtn(long16);
+char16 __ovld __cnfn convert_char16_sat_rtn(long16);
+char16 __ovld __cnfn convert_char16(long16);
+char16 __ovld __cnfn convert_char16_sat(long16);
+char16 __ovld __cnfn convert_char16_rte(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rte(ulong16);
+char16 __ovld __cnfn convert_char16_rtz(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rtz(ulong16);
+char16 __ovld __cnfn convert_char16_rtp(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rtp(ulong16);
+char16 __ovld __cnfn convert_char16_rtn(ulong16);
+char16 __ovld __cnfn convert_char16_sat_rtn(ulong16);
+char16 __ovld __cnfn convert_char16(ulong16);
+char16 __ovld __cnfn convert_char16_sat(ulong16);
+char16 __ovld __cnfn convert_char16_rte(float16);
+char16 __ovld __cnfn convert_char16_sat_rte(float16);
+char16 __ovld __cnfn convert_char16_rtz(float16);
+char16 __ovld __cnfn convert_char16_sat_rtz(float16);
+char16 __ovld __cnfn convert_char16_rtp(float16);
+char16 __ovld __cnfn convert_char16_sat_rtp(float16);
+char16 __ovld __cnfn convert_char16_rtn(float16);
+char16 __ovld __cnfn convert_char16_sat_rtn(float16);
+char16 __ovld __cnfn convert_char16(float16);
+char16 __ovld __cnfn convert_char16_sat(float16);
+uchar16 __ovld __cnfn convert_uchar16_rte(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(char16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(char16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(char16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(char16);
+uchar16 __ovld __cnfn convert_uchar16(char16);
+uchar16 __ovld __cnfn convert_uchar16_sat(char16);
+uchar16 __ovld __cnfn convert_uchar16_rte(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(uchar16);
+uchar16 __ovld __cnfn convert_uchar16(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_sat(uchar16);
+uchar16 __ovld __cnfn convert_uchar16_rte(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(short16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(short16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(short16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(short16);
+uchar16 __ovld __cnfn convert_uchar16(short16);
+uchar16 __ovld __cnfn convert_uchar16_sat(short16);
+uchar16 __ovld __cnfn convert_uchar16_rte(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(ushort16);
+uchar16 __ovld __cnfn convert_uchar16(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_sat(ushort16);
+uchar16 __ovld __cnfn convert_uchar16_rte(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(int16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(int16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(int16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(int16);
+uchar16 __ovld __cnfn convert_uchar16(int16);
+uchar16 __ovld __cnfn convert_uchar16_sat(int16);
+uchar16 __ovld __cnfn convert_uchar16_rte(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(uint16);
+uchar16 __ovld __cnfn convert_uchar16(uint16);
+uchar16 __ovld __cnfn convert_uchar16_sat(uint16);
+uchar16 __ovld __cnfn convert_uchar16_rte(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(long16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(long16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(long16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(long16);
+uchar16 __ovld __cnfn convert_uchar16(long16);
+uchar16 __ovld __cnfn convert_uchar16_sat(long16);
+uchar16 __ovld __cnfn convert_uchar16_rte(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(ulong16);
+uchar16 __ovld __cnfn convert_uchar16(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_sat(ulong16);
+uchar16 __ovld __cnfn convert_uchar16_rte(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(float16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(float16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(float16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(float16);
+uchar16 __ovld __cnfn convert_uchar16(float16);
+uchar16 __ovld __cnfn convert_uchar16_sat(float16);
+short16 __ovld __cnfn convert_short16_rte(char16);
+short16 __ovld __cnfn convert_short16_sat_rte(char16);
+short16 __ovld __cnfn convert_short16_rtz(char16);
+short16 __ovld __cnfn convert_short16_sat_rtz(char16);
+short16 __ovld __cnfn convert_short16_rtp(char16);
+short16 __ovld __cnfn convert_short16_sat_rtp(char16);
+short16 __ovld __cnfn convert_short16_rtn(char16);
+short16 __ovld __cnfn convert_short16_sat_rtn(char16);
+short16 __ovld __cnfn convert_short16(char16);
+short16 __ovld __cnfn convert_short16_sat(char16);
+short16 __ovld __cnfn convert_short16_rte(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rte(uchar16);
+short16 __ovld __cnfn convert_short16_rtz(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rtz(uchar16);
+short16 __ovld __cnfn convert_short16_rtp(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rtp(uchar16);
+short16 __ovld __cnfn convert_short16_rtn(uchar16);
+short16 __ovld __cnfn convert_short16_sat_rtn(uchar16);
+short16 __ovld __cnfn convert_short16(uchar16);
+short16 __ovld __cnfn convert_short16_sat(uchar16);
+short16 __ovld __cnfn convert_short16_rte(short16);
+short16 __ovld __cnfn convert_short16_sat_rte(short16);
+short16 __ovld __cnfn convert_short16_rtz(short16);
+short16 __ovld __cnfn convert_short16_sat_rtz(short16);
+short16 __ovld __cnfn convert_short16_rtp(short16);
+short16 __ovld __cnfn convert_short16_sat_rtp(short16);
+short16 __ovld __cnfn convert_short16_rtn(short16);
+short16 __ovld __cnfn convert_short16_sat_rtn(short16);
+short16 __ovld __cnfn convert_short16(short16);
+short16 __ovld __cnfn convert_short16_sat(short16);
+short16 __ovld __cnfn convert_short16_rte(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rte(ushort16);
+short16 __ovld __cnfn convert_short16_rtz(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rtz(ushort16);
+short16 __ovld __cnfn convert_short16_rtp(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rtp(ushort16);
+short16 __ovld __cnfn convert_short16_rtn(ushort16);
+short16 __ovld __cnfn convert_short16_sat_rtn(ushort16);
+short16 __ovld __cnfn convert_short16(ushort16);
+short16 __ovld __cnfn convert_short16_sat(ushort16);
+short16 __ovld __cnfn convert_short16_rte(int16);
+short16 __ovld __cnfn convert_short16_sat_rte(int16);
+short16 __ovld __cnfn convert_short16_rtz(int16);
+short16 __ovld __cnfn convert_short16_sat_rtz(int16);
+short16 __ovld __cnfn convert_short16_rtp(int16);
+short16 __ovld __cnfn convert_short16_sat_rtp(int16);
+short16 __ovld __cnfn convert_short16_rtn(int16);
+short16 __ovld __cnfn convert_short16_sat_rtn(int16);
+short16 __ovld __cnfn convert_short16(int16);
+short16 __ovld __cnfn convert_short16_sat(int16);
+short16 __ovld __cnfn convert_short16_rte(uint16);
+short16 __ovld __cnfn convert_short16_sat_rte(uint16);
+short16 __ovld __cnfn convert_short16_rtz(uint16);
+short16 __ovld __cnfn convert_short16_sat_rtz(uint16);
+short16 __ovld __cnfn convert_short16_rtp(uint16);
+short16 __ovld __cnfn convert_short16_sat_rtp(uint16);
+short16 __ovld __cnfn convert_short16_rtn(uint16);
+short16 __ovld __cnfn convert_short16_sat_rtn(uint16);
+short16 __ovld __cnfn convert_short16(uint16);
+short16 __ovld __cnfn convert_short16_sat(uint16);
+short16 __ovld __cnfn convert_short16_rte(long16);
+short16 __ovld __cnfn convert_short16_sat_rte(long16);
+short16 __ovld __cnfn convert_short16_rtz(long16);
+short16 __ovld __cnfn convert_short16_sat_rtz(long16);
+short16 __ovld __cnfn convert_short16_rtp(long16);
+short16 __ovld __cnfn convert_short16_sat_rtp(long16);
+short16 __ovld __cnfn convert_short16_rtn(long16);
+short16 __ovld __cnfn convert_short16_sat_rtn(long16);
+short16 __ovld __cnfn convert_short16(long16);
+short16 __ovld __cnfn convert_short16_sat(long16);
+short16 __ovld __cnfn convert_short16_rte(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rte(ulong16);
+short16 __ovld __cnfn convert_short16_rtz(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rtz(ulong16);
+short16 __ovld __cnfn convert_short16_rtp(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rtp(ulong16);
+short16 __ovld __cnfn convert_short16_rtn(ulong16);
+short16 __ovld __cnfn convert_short16_sat_rtn(ulong16);
+short16 __ovld __cnfn convert_short16(ulong16);
+short16 __ovld __cnfn convert_short16_sat(ulong16);
+short16 __ovld __cnfn convert_short16_rte(float16);
+short16 __ovld __cnfn convert_short16_sat_rte(float16);
+short16 __ovld __cnfn convert_short16_rtz(float16);
+short16 __ovld __cnfn convert_short16_sat_rtz(float16);
+short16 __ovld __cnfn convert_short16_rtp(float16);
+short16 __ovld __cnfn convert_short16_sat_rtp(float16);
+short16 __ovld __cnfn convert_short16_rtn(float16);
+short16 __ovld __cnfn convert_short16_sat_rtn(float16);
+short16 __ovld __cnfn convert_short16(float16);
+short16 __ovld __cnfn convert_short16_sat(float16);
+ushort16 __ovld __cnfn convert_ushort16_rte(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(char16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(char16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(char16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(char16);
+ushort16 __ovld __cnfn convert_ushort16(char16);
+ushort16 __ovld __cnfn convert_ushort16_sat(char16);
+ushort16 __ovld __cnfn convert_ushort16_rte(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(uchar16);
+ushort16 __ovld __cnfn convert_ushort16(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_sat(uchar16);
+ushort16 __ovld __cnfn convert_ushort16_rte(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(short16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(short16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(short16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(short16);
+ushort16 __ovld __cnfn convert_ushort16(short16);
+ushort16 __ovld __cnfn convert_ushort16_sat(short16);
+ushort16 __ovld __cnfn convert_ushort16_rte(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(ushort16);
+ushort16 __ovld __cnfn convert_ushort16(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_sat(ushort16);
+ushort16 __ovld __cnfn convert_ushort16_rte(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(int16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(int16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(int16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(int16);
+ushort16 __ovld __cnfn convert_ushort16(int16);
+ushort16 __ovld __cnfn convert_ushort16_sat(int16);
+ushort16 __ovld __cnfn convert_ushort16_rte(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(uint16);
+ushort16 __ovld __cnfn convert_ushort16(uint16);
+ushort16 __ovld __cnfn convert_ushort16_sat(uint16);
+ushort16 __ovld __cnfn convert_ushort16_rte(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(long16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(long16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(long16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(long16);
+ushort16 __ovld __cnfn convert_ushort16(long16);
+ushort16 __ovld __cnfn convert_ushort16_sat(long16);
+ushort16 __ovld __cnfn convert_ushort16_rte(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(ulong16);
+ushort16 __ovld __cnfn convert_ushort16(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_sat(ulong16);
+ushort16 __ovld __cnfn convert_ushort16_rte(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(float16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(float16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(float16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(float16);
+ushort16 __ovld __cnfn convert_ushort16(float16);
+ushort16 __ovld __cnfn convert_ushort16_sat(float16);
+int16 __ovld __cnfn convert_int16_rte(char16);
+int16 __ovld __cnfn convert_int16_sat_rte(char16);
+int16 __ovld __cnfn convert_int16_rtz(char16);
+int16 __ovld __cnfn convert_int16_sat_rtz(char16);
+int16 __ovld __cnfn convert_int16_rtp(char16);
+int16 __ovld __cnfn convert_int16_sat_rtp(char16);
+int16 __ovld __cnfn convert_int16_rtn(char16);
+int16 __ovld __cnfn convert_int16_sat_rtn(char16);
+int16 __ovld __cnfn convert_int16(char16);
+int16 __ovld __cnfn convert_int16_sat(char16);
+int16 __ovld __cnfn convert_int16_rte(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rte(uchar16);
+int16 __ovld __cnfn convert_int16_rtz(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rtz(uchar16);
+int16 __ovld __cnfn convert_int16_rtp(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rtp(uchar16);
+int16 __ovld __cnfn convert_int16_rtn(uchar16);
+int16 __ovld __cnfn convert_int16_sat_rtn(uchar16);
+int16 __ovld __cnfn convert_int16(uchar16);
+int16 __ovld __cnfn convert_int16_sat(uchar16);
+int16 __ovld __cnfn convert_int16_rte(short16);
+int16 __ovld __cnfn convert_int16_sat_rte(short16);
+int16 __ovld __cnfn convert_int16_rtz(short16);
+int16 __ovld __cnfn convert_int16_sat_rtz(short16);
+int16 __ovld __cnfn convert_int16_rtp(short16);
+int16 __ovld __cnfn convert_int16_sat_rtp(short16);
+int16 __ovld __cnfn convert_int16_rtn(short16);
+int16 __ovld __cnfn convert_int16_sat_rtn(short16);
+int16 __ovld __cnfn convert_int16(short16);
+int16 __ovld __cnfn convert_int16_sat(short16);
+int16 __ovld __cnfn convert_int16_rte(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rte(ushort16);
+int16 __ovld __cnfn convert_int16_rtz(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rtz(ushort16);
+int16 __ovld __cnfn convert_int16_rtp(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rtp(ushort16);
+int16 __ovld __cnfn convert_int16_rtn(ushort16);
+int16 __ovld __cnfn convert_int16_sat_rtn(ushort16);
+int16 __ovld __cnfn convert_int16(ushort16);
+int16 __ovld __cnfn convert_int16_sat(ushort16);
+int16 __ovld __cnfn convert_int16_rte(int16);
+int16 __ovld __cnfn convert_int16_sat_rte(int16);
+int16 __ovld __cnfn convert_int16_rtz(int16);
+int16 __ovld __cnfn convert_int16_sat_rtz(int16);
+int16 __ovld __cnfn convert_int16_rtp(int16);
+int16 __ovld __cnfn convert_int16_sat_rtp(int16);
+int16 __ovld __cnfn convert_int16_rtn(int16);
+int16 __ovld __cnfn convert_int16_sat_rtn(int16);
+int16 __ovld __cnfn convert_int16(int16);
+int16 __ovld __cnfn convert_int16_sat(int16);
+int16 __ovld __cnfn convert_int16_rte(uint16);
+int16 __ovld __cnfn convert_int16_sat_rte(uint16);
+int16 __ovld __cnfn convert_int16_rtz(uint16);
+int16 __ovld __cnfn convert_int16_sat_rtz(uint16);
+int16 __ovld __cnfn convert_int16_rtp(uint16);
+int16 __ovld __cnfn convert_int16_sat_rtp(uint16);
+int16 __ovld __cnfn convert_int16_rtn(uint16);
+int16 __ovld __cnfn convert_int16_sat_rtn(uint16);
+int16 __ovld __cnfn convert_int16(uint16);
+int16 __ovld __cnfn convert_int16_sat(uint16);
+int16 __ovld __cnfn convert_int16_rte(long16);
+int16 __ovld __cnfn convert_int16_sat_rte(long16);
+int16 __ovld __cnfn convert_int16_rtz(long16);
+int16 __ovld __cnfn convert_int16_sat_rtz(long16);
+int16 __ovld __cnfn convert_int16_rtp(long16);
+int16 __ovld __cnfn convert_int16_sat_rtp(long16);
+int16 __ovld __cnfn convert_int16_rtn(long16);
+int16 __ovld __cnfn convert_int16_sat_rtn(long16);
+int16 __ovld __cnfn convert_int16(long16);
+int16 __ovld __cnfn convert_int16_sat(long16);
+int16 __ovld __cnfn convert_int16_rte(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rte(ulong16);
+int16 __ovld __cnfn convert_int16_rtz(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rtz(ulong16);
+int16 __ovld __cnfn convert_int16_rtp(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rtp(ulong16);
+int16 __ovld __cnfn convert_int16_rtn(ulong16);
+int16 __ovld __cnfn convert_int16_sat_rtn(ulong16);
+int16 __ovld __cnfn convert_int16(ulong16);
+int16 __ovld __cnfn convert_int16_sat(ulong16);
+int16 __ovld __cnfn convert_int16_rte(float16);
+int16 __ovld __cnfn convert_int16_sat_rte(float16);
+int16 __ovld __cnfn convert_int16_rtz(float16);
+int16 __ovld __cnfn convert_int16_sat_rtz(float16);
+int16 __ovld __cnfn convert_int16_rtp(float16);
+int16 __ovld __cnfn convert_int16_sat_rtp(float16);
+int16 __ovld __cnfn convert_int16_rtn(float16);
+int16 __ovld __cnfn convert_int16_sat_rtn(float16);
+int16 __ovld __cnfn convert_int16(float16);
+int16 __ovld __cnfn convert_int16_sat(float16);
+uint16 __ovld __cnfn convert_uint16_rte(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(char16);
+uint16 __ovld __cnfn convert_uint16_rtz(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(char16);
+uint16 __ovld __cnfn convert_uint16_rtp(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(char16);
+uint16 __ovld __cnfn convert_uint16_rtn(char16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(char16);
+uint16 __ovld __cnfn convert_uint16(char16);
+uint16 __ovld __cnfn convert_uint16_sat(char16);
+uint16 __ovld __cnfn convert_uint16_rte(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(uchar16);
+uint16 __ovld __cnfn convert_uint16_rtz(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(uchar16);
+uint16 __ovld __cnfn convert_uint16_rtp(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(uchar16);
+uint16 __ovld __cnfn convert_uint16_rtn(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(uchar16);
+uint16 __ovld __cnfn convert_uint16(uchar16);
+uint16 __ovld __cnfn convert_uint16_sat(uchar16);
+uint16 __ovld __cnfn convert_uint16_rte(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(short16);
+uint16 __ovld __cnfn convert_uint16_rtz(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(short16);
+uint16 __ovld __cnfn convert_uint16_rtp(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(short16);
+uint16 __ovld __cnfn convert_uint16_rtn(short16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(short16);
+uint16 __ovld __cnfn convert_uint16(short16);
+uint16 __ovld __cnfn convert_uint16_sat(short16);
+uint16 __ovld __cnfn convert_uint16_rte(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(ushort16);
+uint16 __ovld __cnfn convert_uint16_rtz(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(ushort16);
+uint16 __ovld __cnfn convert_uint16_rtp(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(ushort16);
+uint16 __ovld __cnfn convert_uint16_rtn(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(ushort16);
+uint16 __ovld __cnfn convert_uint16(ushort16);
+uint16 __ovld __cnfn convert_uint16_sat(ushort16);
+uint16 __ovld __cnfn convert_uint16_rte(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(int16);
+uint16 __ovld __cnfn convert_uint16_rtz(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(int16);
+uint16 __ovld __cnfn convert_uint16_rtp(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(int16);
+uint16 __ovld __cnfn convert_uint16_rtn(int16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(int16);
+uint16 __ovld __cnfn convert_uint16(int16);
+uint16 __ovld __cnfn convert_uint16_sat(int16);
+uint16 __ovld __cnfn convert_uint16_rte(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(uint16);
+uint16 __ovld __cnfn convert_uint16_rtz(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(uint16);
+uint16 __ovld __cnfn convert_uint16_rtp(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(uint16);
+uint16 __ovld __cnfn convert_uint16_rtn(uint16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(uint16);
+uint16 __ovld __cnfn convert_uint16(uint16);
+uint16 __ovld __cnfn convert_uint16_sat(uint16);
+uint16 __ovld __cnfn convert_uint16_rte(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(long16);
+uint16 __ovld __cnfn convert_uint16_rtz(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(long16);
+uint16 __ovld __cnfn convert_uint16_rtp(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(long16);
+uint16 __ovld __cnfn convert_uint16_rtn(long16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(long16);
+uint16 __ovld __cnfn convert_uint16(long16);
+uint16 __ovld __cnfn convert_uint16_sat(long16);
+uint16 __ovld __cnfn convert_uint16_rte(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(ulong16);
+uint16 __ovld __cnfn convert_uint16_rtz(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(ulong16);
+uint16 __ovld __cnfn convert_uint16_rtp(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(ulong16);
+uint16 __ovld __cnfn convert_uint16_rtn(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(ulong16);
+uint16 __ovld __cnfn convert_uint16(ulong16);
+uint16 __ovld __cnfn convert_uint16_sat(ulong16);
+uint16 __ovld __cnfn convert_uint16_rte(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(float16);
+uint16 __ovld __cnfn convert_uint16_rtz(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(float16);
+uint16 __ovld __cnfn convert_uint16_rtp(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(float16);
+uint16 __ovld __cnfn convert_uint16_rtn(float16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(float16);
+uint16 __ovld __cnfn convert_uint16(float16);
+uint16 __ovld __cnfn convert_uint16_sat(float16);
+long16 __ovld __cnfn convert_long16_rte(char16);
+long16 __ovld __cnfn convert_long16_sat_rte(char16);
+long16 __ovld __cnfn convert_long16_rtz(char16);
+long16 __ovld __cnfn convert_long16_sat_rtz(char16);
+long16 __ovld __cnfn convert_long16_rtp(char16);
+long16 __ovld __cnfn convert_long16_sat_rtp(char16);
+long16 __ovld __cnfn convert_long16_rtn(char16);
+long16 __ovld __cnfn convert_long16_sat_rtn(char16);
+long16 __ovld __cnfn convert_long16(char16);
+long16 __ovld __cnfn convert_long16_sat(char16);
+long16 __ovld __cnfn convert_long16_rte(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rte(uchar16);
+long16 __ovld __cnfn convert_long16_rtz(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rtz(uchar16);
+long16 __ovld __cnfn convert_long16_rtp(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rtp(uchar16);
+long16 __ovld __cnfn convert_long16_rtn(uchar16);
+long16 __ovld __cnfn convert_long16_sat_rtn(uchar16);
+long16 __ovld __cnfn convert_long16(uchar16);
+long16 __ovld __cnfn convert_long16_sat(uchar16);
+long16 __ovld __cnfn convert_long16_rte(short16);
+long16 __ovld __cnfn convert_long16_sat_rte(short16);
+long16 __ovld __cnfn convert_long16_rtz(short16);
+long16 __ovld __cnfn convert_long16_sat_rtz(short16);
+long16 __ovld __cnfn convert_long16_rtp(short16);
+long16 __ovld __cnfn convert_long16_sat_rtp(short16);
+long16 __ovld __cnfn convert_long16_rtn(short16);
+long16 __ovld __cnfn convert_long16_sat_rtn(short16);
+long16 __ovld __cnfn convert_long16(short16);
+long16 __ovld __cnfn convert_long16_sat(short16);
+long16 __ovld __cnfn convert_long16_rte(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rte(ushort16);
+long16 __ovld __cnfn convert_long16_rtz(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rtz(ushort16);
+long16 __ovld __cnfn convert_long16_rtp(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rtp(ushort16);
+long16 __ovld __cnfn convert_long16_rtn(ushort16);
+long16 __ovld __cnfn convert_long16_sat_rtn(ushort16);
+long16 __ovld __cnfn convert_long16(ushort16);
+long16 __ovld __cnfn convert_long16_sat(ushort16);
+long16 __ovld __cnfn convert_long16_rte(int16);
+long16 __ovld __cnfn convert_long16_sat_rte(int16);
+long16 __ovld __cnfn convert_long16_rtz(int16);
+long16 __ovld __cnfn convert_long16_sat_rtz(int16);
+long16 __ovld __cnfn convert_long16_rtp(int16);
+long16 __ovld __cnfn convert_long16_sat_rtp(int16);
+long16 __ovld __cnfn convert_long16_rtn(int16);
+long16 __ovld __cnfn convert_long16_sat_rtn(int16);
+long16 __ovld __cnfn convert_long16(int16);
+long16 __ovld __cnfn convert_long16_sat(int16);
+long16 __ovld __cnfn convert_long16_rte(uint16);
+long16 __ovld __cnfn convert_long16_sat_rte(uint16);
+long16 __ovld __cnfn convert_long16_rtz(uint16);
+long16 __ovld __cnfn convert_long16_sat_rtz(uint16);
+long16 __ovld __cnfn convert_long16_rtp(uint16);
+long16 __ovld __cnfn convert_long16_sat_rtp(uint16);
+long16 __ovld __cnfn convert_long16_rtn(uint16);
+long16 __ovld __cnfn convert_long16_sat_rtn(uint16);
+long16 __ovld __cnfn convert_long16(uint16);
+long16 __ovld __cnfn convert_long16_sat(uint16);
+long16 __ovld __cnfn convert_long16_rte(long16);
+long16 __ovld __cnfn convert_long16_sat_rte(long16);
+long16 __ovld __cnfn convert_long16_rtz(long16);
+long16 __ovld __cnfn convert_long16_sat_rtz(long16);
+long16 __ovld __cnfn convert_long16_rtp(long16);
+long16 __ovld __cnfn convert_long16_sat_rtp(long16);
+long16 __ovld __cnfn convert_long16_rtn(long16);
+long16 __ovld __cnfn convert_long16_sat_rtn(long16);
+long16 __ovld __cnfn convert_long16(long16);
+long16 __ovld __cnfn convert_long16_sat(long16);
+long16 __ovld __cnfn convert_long16_rte(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rte(ulong16);
+long16 __ovld __cnfn convert_long16_rtz(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rtz(ulong16);
+long16 __ovld __cnfn convert_long16_rtp(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rtp(ulong16);
+long16 __ovld __cnfn convert_long16_rtn(ulong16);
+long16 __ovld __cnfn convert_long16_sat_rtn(ulong16);
+long16 __ovld __cnfn convert_long16(ulong16);
+long16 __ovld __cnfn convert_long16_sat(ulong16);
+long16 __ovld __cnfn convert_long16_rte(float16);
+long16 __ovld __cnfn convert_long16_sat_rte(float16);
+long16 __ovld __cnfn convert_long16_rtz(float16);
+long16 __ovld __cnfn convert_long16_sat_rtz(float16);
+long16 __ovld __cnfn convert_long16_rtp(float16);
+long16 __ovld __cnfn convert_long16_sat_rtp(float16);
+long16 __ovld __cnfn convert_long16_rtn(float16);
+long16 __ovld __cnfn convert_long16_sat_rtn(float16);
+long16 __ovld __cnfn convert_long16(float16);
+long16 __ovld __cnfn convert_long16_sat(float16);
+ulong16 __ovld __cnfn convert_ulong16_rte(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(char16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(char16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(char16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(char16);
+ulong16 __ovld __cnfn convert_ulong16(char16);
+ulong16 __ovld __cnfn convert_ulong16_sat(char16);
+ulong16 __ovld __cnfn convert_ulong16_rte(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(uchar16);
+ulong16 __ovld __cnfn convert_ulong16(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_sat(uchar16);
+ulong16 __ovld __cnfn convert_ulong16_rte(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(short16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(short16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(short16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(short16);
+ulong16 __ovld __cnfn convert_ulong16(short16);
+ulong16 __ovld __cnfn convert_ulong16_sat(short16);
+ulong16 __ovld __cnfn convert_ulong16_rte(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(ushort16);
+ulong16 __ovld __cnfn convert_ulong16(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_sat(ushort16);
+ulong16 __ovld __cnfn convert_ulong16_rte(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(int16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(int16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(int16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(int16);
+ulong16 __ovld __cnfn convert_ulong16(int16);
+ulong16 __ovld __cnfn convert_ulong16_sat(int16);
+ulong16 __ovld __cnfn convert_ulong16_rte(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(uint16);
+ulong16 __ovld __cnfn convert_ulong16(uint16);
+ulong16 __ovld __cnfn convert_ulong16_sat(uint16);
+ulong16 __ovld __cnfn convert_ulong16_rte(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(long16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(long16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(long16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(long16);
+ulong16 __ovld __cnfn convert_ulong16(long16);
+ulong16 __ovld __cnfn convert_ulong16_sat(long16);
+ulong16 __ovld __cnfn convert_ulong16_rte(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(ulong16);
+ulong16 __ovld __cnfn convert_ulong16(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_sat(ulong16);
+ulong16 __ovld __cnfn convert_ulong16_rte(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(float16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(float16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(float16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(float16);
+ulong16 __ovld __cnfn convert_ulong16(float16);
+ulong16 __ovld __cnfn convert_ulong16_sat(float16);
+float16 __ovld __cnfn convert_float16_rte(char16);
+float16 __ovld __cnfn convert_float16_rtz(char16);
+float16 __ovld __cnfn convert_float16_rtp(char16);
+float16 __ovld __cnfn convert_float16_rtn(char16);
+float16 __ovld __cnfn convert_float16(char16);
+float16 __ovld __cnfn convert_float16_rte(uchar16);
+float16 __ovld __cnfn convert_float16_rtz(uchar16);
+float16 __ovld __cnfn convert_float16_rtp(uchar16);
+float16 __ovld __cnfn convert_float16_rtn(uchar16);
+float16 __ovld __cnfn convert_float16(uchar16);
+float16 __ovld __cnfn convert_float16_rte(short16);
+float16 __ovld __cnfn convert_float16_rtz(short16);
+float16 __ovld __cnfn convert_float16_rtp(short16);
+float16 __ovld __cnfn convert_float16_rtn(short16);
+float16 __ovld __cnfn convert_float16(short16);
+float16 __ovld __cnfn convert_float16_rte(ushort16);
+float16 __ovld __cnfn convert_float16_rtz(ushort16);
+float16 __ovld __cnfn convert_float16_rtp(ushort16);
+float16 __ovld __cnfn convert_float16_rtn(ushort16);
+float16 __ovld __cnfn convert_float16(ushort16);
+float16 __ovld __cnfn convert_float16_rte(int16);
+float16 __ovld __cnfn convert_float16_rtz(int16);
+float16 __ovld __cnfn convert_float16_rtp(int16);
+float16 __ovld __cnfn convert_float16_rtn(int16);
+float16 __ovld __cnfn convert_float16(int16);
+float16 __ovld __cnfn convert_float16_rte(uint16);
+float16 __ovld __cnfn convert_float16_rtz(uint16);
+float16 __ovld __cnfn convert_float16_rtp(uint16);
+float16 __ovld __cnfn convert_float16_rtn(uint16);
+float16 __ovld __cnfn convert_float16(uint16);
+float16 __ovld __cnfn convert_float16_rte(long16);
+float16 __ovld __cnfn convert_float16_rtz(long16);
+float16 __ovld __cnfn convert_float16_rtp(long16);
+float16 __ovld __cnfn convert_float16_rtn(long16);
+float16 __ovld __cnfn convert_float16(long16);
+float16 __ovld __cnfn convert_float16_rte(ulong16);
+float16 __ovld __cnfn convert_float16_rtz(ulong16);
+float16 __ovld __cnfn convert_float16_rtp(ulong16);
+float16 __ovld __cnfn convert_float16_rtn(ulong16);
+float16 __ovld __cnfn convert_float16(ulong16);
+float16 __ovld __cnfn convert_float16_rte(float16);
+float16 __ovld __cnfn convert_float16_rtz(float16);
+float16 __ovld __cnfn convert_float16_rtp(float16);
+float16 __ovld __cnfn convert_float16_rtn(float16);
+float16 __ovld __cnfn convert_float16(float16);
+
+// Conversions with double data type parameters or return value.
+
+#ifdef cl_khr_fp64
+char __ovld __cnfn convert_char(double);
+char __ovld __cnfn convert_char_rte(double);
+char __ovld __cnfn convert_char_rtn(double);
+char __ovld __cnfn convert_char_rtp(double);
+char __ovld __cnfn convert_char_rtz(double);
+char __ovld __cnfn convert_char_sat(double);
+char __ovld __cnfn convert_char_sat_rte(double);
+char __ovld __cnfn convert_char_sat_rtn(double);
+char __ovld __cnfn convert_char_sat_rtp(double);
+char __ovld __cnfn convert_char_sat_rtz(double);
+char2 __ovld __cnfn convert_char2(double2);
+char2 __ovld __cnfn convert_char2_rte(double2);
+char2 __ovld __cnfn convert_char2_rtn(double2);
+char2 __ovld __cnfn convert_char2_rtp(double2);
+char2 __ovld __cnfn convert_char2_rtz(double2);
+char2 __ovld __cnfn convert_char2_sat(double2);
+char2 __ovld __cnfn convert_char2_sat_rte(double2);
+char2 __ovld __cnfn convert_char2_sat_rtn(double2);
+char2 __ovld __cnfn convert_char2_sat_rtp(double2);
+char2 __ovld __cnfn convert_char2_sat_rtz(double2);
+char3 __ovld __cnfn convert_char3(double3);
+char3 __ovld __cnfn convert_char3_rte(double3);
+char3 __ovld __cnfn convert_char3_rtn(double3);
+char3 __ovld __cnfn convert_char3_rtp(double3);
+char3 __ovld __cnfn convert_char3_rtz(double3);
+char3 __ovld __cnfn convert_char3_sat(double3);
+char3 __ovld __cnfn convert_char3_sat_rte(double3);
+char3 __ovld __cnfn convert_char3_sat_rtn(double3);
+char3 __ovld __cnfn convert_char3_sat_rtp(double3);
+char3 __ovld __cnfn convert_char3_sat_rtz(double3);
+char4 __ovld __cnfn convert_char4(double4);
+char4 __ovld __cnfn convert_char4_rte(double4);
+char4 __ovld __cnfn convert_char4_rtn(double4);
+char4 __ovld __cnfn convert_char4_rtp(double4);
+char4 __ovld __cnfn convert_char4_rtz(double4);
+char4 __ovld __cnfn convert_char4_sat(double4);
+char4 __ovld __cnfn convert_char4_sat_rte(double4);
+char4 __ovld __cnfn convert_char4_sat_rtn(double4);
+char4 __ovld __cnfn convert_char4_sat_rtp(double4);
+char4 __ovld __cnfn convert_char4_sat_rtz(double4);
+char8 __ovld __cnfn convert_char8(double8);
+char8 __ovld __cnfn convert_char8_rte(double8);
+char8 __ovld __cnfn convert_char8_rtn(double8);
+char8 __ovld __cnfn convert_char8_rtp(double8);
+char8 __ovld __cnfn convert_char8_rtz(double8);
+char8 __ovld __cnfn convert_char8_sat(double8);
+char8 __ovld __cnfn convert_char8_sat_rte(double8);
+char8 __ovld __cnfn convert_char8_sat_rtn(double8);
+char8 __ovld __cnfn convert_char8_sat_rtp(double8);
+char8 __ovld __cnfn convert_char8_sat_rtz(double8);
+char16 __ovld __cnfn convert_char16(double16);
+char16 __ovld __cnfn convert_char16_rte(double16);
+char16 __ovld __cnfn convert_char16_rtn(double16);
+char16 __ovld __cnfn convert_char16_rtp(double16);
+char16 __ovld __cnfn convert_char16_rtz(double16);
+char16 __ovld __cnfn convert_char16_sat(double16);
+char16 __ovld __cnfn convert_char16_sat_rte(double16);
+char16 __ovld __cnfn convert_char16_sat_rtn(double16);
+char16 __ovld __cnfn convert_char16_sat_rtp(double16);
+char16 __ovld __cnfn convert_char16_sat_rtz(double16);
+
+uchar __ovld __cnfn convert_uchar(double);
+uchar __ovld __cnfn convert_uchar_rte(double);
+uchar __ovld __cnfn convert_uchar_rtn(double);
+uchar __ovld __cnfn convert_uchar_rtp(double);
+uchar __ovld __cnfn convert_uchar_rtz(double);
+uchar __ovld __cnfn convert_uchar_sat(double);
+uchar __ovld __cnfn convert_uchar_sat_rte(double);
+uchar __ovld __cnfn convert_uchar_sat_rtn(double);
+uchar __ovld __cnfn convert_uchar_sat_rtp(double);
+uchar __ovld __cnfn convert_uchar_sat_rtz(double);
+uchar2 __ovld __cnfn convert_uchar2(double2);
+uchar2 __ovld __cnfn convert_uchar2_rte(double2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(double2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(double2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(double2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(double2);
+uchar3 __ovld __cnfn convert_uchar3(double3);
+uchar3 __ovld __cnfn convert_uchar3_rte(double3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(double3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(double3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(double3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(double3);
+uchar4 __ovld __cnfn convert_uchar4(double4);
+uchar4 __ovld __cnfn convert_uchar4_rte(double4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(double4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(double4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(double4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(double4);
+uchar8 __ovld __cnfn convert_uchar8(double8);
+uchar8 __ovld __cnfn convert_uchar8_rte(double8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(double8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(double8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(double8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(double8);
+uchar16 __ovld __cnfn convert_uchar16(double16);
+uchar16 __ovld __cnfn convert_uchar16_rte(double16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(double16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(double16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(double16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(double16);
+
+short __ovld __cnfn convert_short(double);
+short __ovld __cnfn convert_short_rte(double);
+short __ovld __cnfn convert_short_rtn(double);
+short __ovld __cnfn convert_short_rtp(double);
+short __ovld __cnfn convert_short_rtz(double);
+short __ovld __cnfn convert_short_sat(double);
+short __ovld __cnfn convert_short_sat_rte(double);
+short __ovld __cnfn convert_short_sat_rtn(double);
+short __ovld __cnfn convert_short_sat_rtp(double);
+short __ovld __cnfn convert_short_sat_rtz(double);
+short2 __ovld __cnfn convert_short2(double2);
+short2 __ovld __cnfn convert_short2_rte(double2);
+short2 __ovld __cnfn convert_short2_rtn(double2);
+short2 __ovld __cnfn convert_short2_rtp(double2);
+short2 __ovld __cnfn convert_short2_rtz(double2);
+short2 __ovld __cnfn convert_short2_sat(double2);
+short2 __ovld __cnfn convert_short2_sat_rte(double2);
+short2 __ovld __cnfn convert_short2_sat_rtn(double2);
+short2 __ovld __cnfn convert_short2_sat_rtp(double2);
+short2 __ovld __cnfn convert_short2_sat_rtz(double2);
+short3 __ovld __cnfn convert_short3(double3);
+short3 __ovld __cnfn convert_short3_rte(double3);
+short3 __ovld __cnfn convert_short3_rtn(double3);
+short3 __ovld __cnfn convert_short3_rtp(double3);
+short3 __ovld __cnfn convert_short3_rtz(double3);
+short3 __ovld __cnfn convert_short3_sat(double3);
+short3 __ovld __cnfn convert_short3_sat_rte(double3);
+short3 __ovld __cnfn convert_short3_sat_rtn(double3);
+short3 __ovld __cnfn convert_short3_sat_rtp(double3);
+short3 __ovld __cnfn convert_short3_sat_rtz(double3);
+short4 __ovld __cnfn convert_short4(double4);
+short4 __ovld __cnfn convert_short4_rte(double4);
+short4 __ovld __cnfn convert_short4_rtn(double4);
+short4 __ovld __cnfn convert_short4_rtp(double4);
+short4 __ovld __cnfn convert_short4_rtz(double4);
+short4 __ovld __cnfn convert_short4_sat(double4);
+short4 __ovld __cnfn convert_short4_sat_rte(double4);
+short4 __ovld __cnfn convert_short4_sat_rtn(double4);
+short4 __ovld __cnfn convert_short4_sat_rtp(double4);
+short4 __ovld __cnfn convert_short4_sat_rtz(double4);
+short8 __ovld __cnfn convert_short8(double8);
+short8 __ovld __cnfn convert_short8_rte(double8);
+short8 __ovld __cnfn convert_short8_rtn(double8);
+short8 __ovld __cnfn convert_short8_rtp(double8);
+short8 __ovld __cnfn convert_short8_rtz(double8);
+short8 __ovld __cnfn convert_short8_sat(double8);
+short8 __ovld __cnfn convert_short8_sat_rte(double8);
+short8 __ovld __cnfn convert_short8_sat_rtn(double8);
+short8 __ovld __cnfn convert_short8_sat_rtp(double8);
+short8 __ovld __cnfn convert_short8_sat_rtz(double8);
+short16 __ovld __cnfn convert_short16(double16);
+short16 __ovld __cnfn convert_short16_rte(double16);
+short16 __ovld __cnfn convert_short16_rtn(double16);
+short16 __ovld __cnfn convert_short16_rtp(double16);
+short16 __ovld __cnfn convert_short16_rtz(double16);
+short16 __ovld __cnfn convert_short16_sat(double16);
+short16 __ovld __cnfn convert_short16_sat_rte(double16);
+short16 __ovld __cnfn convert_short16_sat_rtn(double16);
+short16 __ovld __cnfn convert_short16_sat_rtp(double16);
+short16 __ovld __cnfn convert_short16_sat_rtz(double16);
+
+ushort __ovld __cnfn convert_ushort(double);
+ushort __ovld __cnfn convert_ushort_rte(double);
+ushort __ovld __cnfn convert_ushort_rtn(double);
+ushort __ovld __cnfn convert_ushort_rtp(double);
+ushort __ovld __cnfn convert_ushort_rtz(double);
+ushort __ovld __cnfn convert_ushort_sat(double);
+ushort __ovld __cnfn convert_ushort_sat_rte(double);
+ushort __ovld __cnfn convert_ushort_sat_rtn(double);
+ushort __ovld __cnfn convert_ushort_sat_rtp(double);
+ushort __ovld __cnfn convert_ushort_sat_rtz(double);
+ushort2 __ovld __cnfn convert_ushort2(double2);
+ushort2 __ovld __cnfn convert_ushort2_rte(double2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(double2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(double2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(double2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(double2);
+ushort3 __ovld __cnfn convert_ushort3(double3);
+ushort3 __ovld __cnfn convert_ushort3_rte(double3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(double3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(double3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(double3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(double3);
+ushort4 __ovld __cnfn convert_ushort4(double4);
+ushort4 __ovld __cnfn convert_ushort4_rte(double4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(double4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(double4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(double4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(double4);
+ushort8 __ovld __cnfn convert_ushort8(double8);
+ushort8 __ovld __cnfn convert_ushort8_rte(double8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(double8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(double8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(double8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(double8);
+ushort16 __ovld __cnfn convert_ushort16(double16);
+ushort16 __ovld __cnfn convert_ushort16_rte(double16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(double16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(double16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(double16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(double16);
+
+int __ovld __cnfn convert_int(double);
+int __ovld __cnfn convert_int_rte(double);
+int __ovld __cnfn convert_int_rtn(double);
+int __ovld __cnfn convert_int_rtp(double);
+int __ovld __cnfn convert_int_rtz(double);
+int __ovld __cnfn convert_int_sat(double);
+int __ovld __cnfn convert_int_sat_rte(double);
+int __ovld __cnfn convert_int_sat_rtn(double);
+int __ovld __cnfn convert_int_sat_rtp(double);
+int __ovld __cnfn convert_int_sat_rtz(double);
+int2 __ovld __cnfn convert_int2(double2);
+int2 __ovld __cnfn convert_int2_rte(double2);
+int2 __ovld __cnfn convert_int2_rtn(double2);
+int2 __ovld __cnfn convert_int2_rtp(double2);
+int2 __ovld __cnfn convert_int2_rtz(double2);
+int2 __ovld __cnfn convert_int2_sat(double2);
+int2 __ovld __cnfn convert_int2_sat_rte(double2);
+int2 __ovld __cnfn convert_int2_sat_rtn(double2);
+int2 __ovld __cnfn convert_int2_sat_rtp(double2);
+int2 __ovld __cnfn convert_int2_sat_rtz(double2);
+int3 __ovld __cnfn convert_int3(double3);
+int3 __ovld __cnfn convert_int3_rte(double3);
+int3 __ovld __cnfn convert_int3_rtn(double3);
+int3 __ovld __cnfn convert_int3_rtp(double3);
+int3 __ovld __cnfn convert_int3_rtz(double3);
+int3 __ovld __cnfn convert_int3_sat(double3);
+int3 __ovld __cnfn convert_int3_sat_rte(double3);
+int3 __ovld __cnfn convert_int3_sat_rtn(double3);
+int3 __ovld __cnfn convert_int3_sat_rtp(double3);
+int3 __ovld __cnfn convert_int3_sat_rtz(double3);
+int4 __ovld __cnfn convert_int4(double4);
+int4 __ovld __cnfn convert_int4_rte(double4);
+int4 __ovld __cnfn convert_int4_rtn(double4);
+int4 __ovld __cnfn convert_int4_rtp(double4);
+int4 __ovld __cnfn convert_int4_rtz(double4);
+int4 __ovld __cnfn convert_int4_sat(double4);
+int4 __ovld __cnfn convert_int4_sat_rte(double4);
+int4 __ovld __cnfn convert_int4_sat_rtn(double4);
+int4 __ovld __cnfn convert_int4_sat_rtp(double4);
+int4 __ovld __cnfn convert_int4_sat_rtz(double4);
+int8 __ovld __cnfn convert_int8(double8);
+int8 __ovld __cnfn convert_int8_rte(double8);
+int8 __ovld __cnfn convert_int8_rtn(double8);
+int8 __ovld __cnfn convert_int8_rtp(double8);
+int8 __ovld __cnfn convert_int8_rtz(double8);
+int8 __ovld __cnfn convert_int8_sat(double8);
+int8 __ovld __cnfn convert_int8_sat_rte(double8);
+int8 __ovld __cnfn convert_int8_sat_rtn(double8);
+int8 __ovld __cnfn convert_int8_sat_rtp(double8);
+int8 __ovld __cnfn convert_int8_sat_rtz(double8);
+int16 __ovld __cnfn convert_int16(double16);
+int16 __ovld __cnfn convert_int16_rte(double16);
+int16 __ovld __cnfn convert_int16_rtn(double16);
+int16 __ovld __cnfn convert_int16_rtp(double16);
+int16 __ovld __cnfn convert_int16_rtz(double16);
+int16 __ovld __cnfn convert_int16_sat(double16);
+int16 __ovld __cnfn convert_int16_sat_rte(double16);
+int16 __ovld __cnfn convert_int16_sat_rtn(double16);
+int16 __ovld __cnfn convert_int16_sat_rtp(double16);
+int16 __ovld __cnfn convert_int16_sat_rtz(double16);
+
+uint __ovld __cnfn convert_uint(double);
+uint __ovld __cnfn convert_uint_rte(double);
+uint __ovld __cnfn convert_uint_rtn(double);
+uint __ovld __cnfn convert_uint_rtp(double);
+uint __ovld __cnfn convert_uint_rtz(double);
+uint __ovld __cnfn convert_uint_sat(double);
+uint __ovld __cnfn convert_uint_sat_rte(double);
+uint __ovld __cnfn convert_uint_sat_rtn(double);
+uint __ovld __cnfn convert_uint_sat_rtp(double);
+uint __ovld __cnfn convert_uint_sat_rtz(double);
+uint2 __ovld __cnfn convert_uint2(double2);
+uint2 __ovld __cnfn convert_uint2_rte(double2);
+uint2 __ovld __cnfn convert_uint2_rtn(double2);
+uint2 __ovld __cnfn convert_uint2_rtp(double2);
+uint2 __ovld __cnfn convert_uint2_rtz(double2);
+uint2 __ovld __cnfn convert_uint2_sat(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(double2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(double2);
+uint3 __ovld __cnfn convert_uint3(double3);
+uint3 __ovld __cnfn convert_uint3_rte(double3);
+uint3 __ovld __cnfn convert_uint3_rtn(double3);
+uint3 __ovld __cnfn convert_uint3_rtp(double3);
+uint3 __ovld __cnfn convert_uint3_rtz(double3);
+uint3 __ovld __cnfn convert_uint3_sat(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(double3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(double3);
+uint4 __ovld __cnfn convert_uint4(double4);
+uint4 __ovld __cnfn convert_uint4_rte(double4);
+uint4 __ovld __cnfn convert_uint4_rtn(double4);
+uint4 __ovld __cnfn convert_uint4_rtp(double4);
+uint4 __ovld __cnfn convert_uint4_rtz(double4);
+uint4 __ovld __cnfn convert_uint4_sat(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(double4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(double4);
+uint8 __ovld __cnfn convert_uint8(double8);
+uint8 __ovld __cnfn convert_uint8_rte(double8);
+uint8 __ovld __cnfn convert_uint8_rtn(double8);
+uint8 __ovld __cnfn convert_uint8_rtp(double8);
+uint8 __ovld __cnfn convert_uint8_rtz(double8);
+uint8 __ovld __cnfn convert_uint8_sat(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(double8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(double8);
+uint16 __ovld __cnfn convert_uint16(double16);
+uint16 __ovld __cnfn convert_uint16_rte(double16);
+uint16 __ovld __cnfn convert_uint16_rtn(double16);
+uint16 __ovld __cnfn convert_uint16_rtp(double16);
+uint16 __ovld __cnfn convert_uint16_rtz(double16);
+uint16 __ovld __cnfn convert_uint16_sat(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(double16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(double16);
+
+long __ovld __cnfn convert_long(double);
+long __ovld __cnfn convert_long_rte(double);
+long __ovld __cnfn convert_long_rtn(double);
+long __ovld __cnfn convert_long_rtp(double);
+long __ovld __cnfn convert_long_rtz(double);
+long __ovld __cnfn convert_long_sat(double);
+long __ovld __cnfn convert_long_sat_rte(double);
+long __ovld __cnfn convert_long_sat_rtn(double);
+long __ovld __cnfn convert_long_sat_rtp(double);
+long __ovld __cnfn convert_long_sat_rtz(double);
+long2 __ovld __cnfn convert_long2(double2);
+long2 __ovld __cnfn convert_long2_rte(double2);
+long2 __ovld __cnfn convert_long2_rtn(double2);
+long2 __ovld __cnfn convert_long2_rtp(double2);
+long2 __ovld __cnfn convert_long2_rtz(double2);
+long2 __ovld __cnfn convert_long2_sat(double2);
+long2 __ovld __cnfn convert_long2_sat_rte(double2);
+long2 __ovld __cnfn convert_long2_sat_rtn(double2);
+long2 __ovld __cnfn convert_long2_sat_rtp(double2);
+long2 __ovld __cnfn convert_long2_sat_rtz(double2);
+long3 __ovld __cnfn convert_long3(double3);
+long3 __ovld __cnfn convert_long3_rte(double3);
+long3 __ovld __cnfn convert_long3_rtn(double3);
+long3 __ovld __cnfn convert_long3_rtp(double3);
+long3 __ovld __cnfn convert_long3_rtz(double3);
+long3 __ovld __cnfn convert_long3_sat(double3);
+long3 __ovld __cnfn convert_long3_sat_rte(double3);
+long3 __ovld __cnfn convert_long3_sat_rtn(double3);
+long3 __ovld __cnfn convert_long3_sat_rtp(double3);
+long3 __ovld __cnfn convert_long3_sat_rtz(double3);
+long4 __ovld __cnfn convert_long4(double4);
+long4 __ovld __cnfn convert_long4_rte(double4);
+long4 __ovld __cnfn convert_long4_rtn(double4);
+long4 __ovld __cnfn convert_long4_rtp(double4);
+long4 __ovld __cnfn convert_long4_rtz(double4);
+long4 __ovld __cnfn convert_long4_sat(double4);
+long4 __ovld __cnfn convert_long4_sat_rte(double4);
+long4 __ovld __cnfn convert_long4_sat_rtn(double4);
+long4 __ovld __cnfn convert_long4_sat_rtp(double4);
+long4 __ovld __cnfn convert_long4_sat_rtz(double4);
+long8 __ovld __cnfn convert_long8(double8);
+long8 __ovld __cnfn convert_long8_rte(double8);
+long8 __ovld __cnfn convert_long8_rtn(double8);
+long8 __ovld __cnfn convert_long8_rtp(double8);
+long8 __ovld __cnfn convert_long8_rtz(double8);
+long8 __ovld __cnfn convert_long8_sat(double8);
+long8 __ovld __cnfn convert_long8_sat_rte(double8);
+long8 __ovld __cnfn convert_long8_sat_rtn(double8);
+long8 __ovld __cnfn convert_long8_sat_rtp(double8);
+long8 __ovld __cnfn convert_long8_sat_rtz(double8);
+long16 __ovld __cnfn convert_long16(double16);
+long16 __ovld __cnfn convert_long16_rte(double16);
+long16 __ovld __cnfn convert_long16_rtn(double16);
+long16 __ovld __cnfn convert_long16_rtp(double16);
+long16 __ovld __cnfn convert_long16_rtz(double16);
+long16 __ovld __cnfn convert_long16_sat(double16);
+long16 __ovld __cnfn convert_long16_sat_rte(double16);
+long16 __ovld __cnfn convert_long16_sat_rtn(double16);
+long16 __ovld __cnfn convert_long16_sat_rtp(double16);
+long16 __ovld __cnfn convert_long16_sat_rtz(double16);
+
+ulong __ovld __cnfn convert_ulong(double);
+ulong __ovld __cnfn convert_ulong_rte(double);
+ulong __ovld __cnfn convert_ulong_rtn(double);
+ulong __ovld __cnfn convert_ulong_rtp(double);
+ulong __ovld __cnfn convert_ulong_rtz(double);
+ulong __ovld __cnfn convert_ulong_sat(double);
+ulong __ovld __cnfn convert_ulong_sat_rte(double);
+ulong __ovld __cnfn convert_ulong_sat_rtn(double);
+ulong __ovld __cnfn convert_ulong_sat_rtp(double);
+ulong __ovld __cnfn convert_ulong_sat_rtz(double);
+ulong2 __ovld __cnfn convert_ulong2(double2);
+ulong2 __ovld __cnfn convert_ulong2_rte(double2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(double2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(double2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(double2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(double2);
+ulong3 __ovld __cnfn convert_ulong3(double3);
+ulong3 __ovld __cnfn convert_ulong3_rte(double3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(double3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(double3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(double3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(double3);
+ulong4 __ovld __cnfn convert_ulong4(double4);
+ulong4 __ovld __cnfn convert_ulong4_rte(double4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(double4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(double4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(double4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(double4);
+ulong8 __ovld __cnfn convert_ulong8(double8);
+ulong8 __ovld __cnfn convert_ulong8_rte(double8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(double8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(double8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(double8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(double8);
+ulong16 __ovld __cnfn convert_ulong16(double16);
+ulong16 __ovld __cnfn convert_ulong16_rte(double16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(double16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(double16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(double16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(double16);
+
+float __ovld __cnfn convert_float(double);
+float __ovld __cnfn convert_float_rte(double);
+float __ovld __cnfn convert_float_rtn(double);
+float __ovld __cnfn convert_float_rtp(double);
+float __ovld __cnfn convert_float_rtz(double);
+float2 __ovld __cnfn convert_float2(double2);
+float2 __ovld __cnfn convert_float2_rte(double2);
+float2 __ovld __cnfn convert_float2_rtn(double2);
+float2 __ovld __cnfn convert_float2_rtp(double2);
+float2 __ovld __cnfn convert_float2_rtz(double2);
+float3 __ovld __cnfn convert_float3(double3);
+float3 __ovld __cnfn convert_float3_rte(double3);
+float3 __ovld __cnfn convert_float3_rtn(double3);
+float3 __ovld __cnfn convert_float3_rtp(double3);
+float3 __ovld __cnfn convert_float3_rtz(double3);
+float4 __ovld __cnfn convert_float4(double4);
+float4 __ovld __cnfn convert_float4_rte(double4);
+float4 __ovld __cnfn convert_float4_rtn(double4);
+float4 __ovld __cnfn convert_float4_rtp(double4);
+float4 __ovld __cnfn convert_float4_rtz(double4);
+float8 __ovld __cnfn convert_float8(double8);
+float8 __ovld __cnfn convert_float8_rte(double8);
+float8 __ovld __cnfn convert_float8_rtn(double8);
+float8 __ovld __cnfn convert_float8_rtp(double8);
+float8 __ovld __cnfn convert_float8_rtz(double8);
+float16 __ovld __cnfn convert_float16(double16);
+float16 __ovld __cnfn convert_float16_rte(double16);
+float16 __ovld __cnfn convert_float16_rtn(double16);
+float16 __ovld __cnfn convert_float16_rtp(double16);
+float16 __ovld __cnfn convert_float16_rtz(double16);
+
+double __ovld __cnfn convert_double(char);
+double __ovld __cnfn convert_double(double);
+double __ovld __cnfn convert_double(float);
+double __ovld __cnfn convert_double(int);
+double __ovld __cnfn convert_double(long);
+double __ovld __cnfn convert_double(short);
+double __ovld __cnfn convert_double(uchar);
+double __ovld __cnfn convert_double(uint);
+double __ovld __cnfn convert_double(ulong);
+double __ovld __cnfn convert_double(ushort);
+double __ovld __cnfn convert_double_rte(char);
+double __ovld __cnfn convert_double_rte(double);
+double __ovld __cnfn convert_double_rte(float);
+double __ovld __cnfn convert_double_rte(int);
+double __ovld __cnfn convert_double_rte(long);
+double __ovld __cnfn convert_double_rte(short);
+double __ovld __cnfn convert_double_rte(uchar);
+double __ovld __cnfn convert_double_rte(uint);
+double __ovld __cnfn convert_double_rte(ulong);
+double __ovld __cnfn convert_double_rte(ushort);
+double __ovld __cnfn convert_double_rtn(char);
+double __ovld __cnfn convert_double_rtn(double);
+double __ovld __cnfn convert_double_rtn(float);
+double __ovld __cnfn convert_double_rtn(int);
+double __ovld __cnfn convert_double_rtn(long);
+double __ovld __cnfn convert_double_rtn(short);
+double __ovld __cnfn convert_double_rtn(uchar);
+double __ovld __cnfn convert_double_rtn(uint);
+double __ovld __cnfn convert_double_rtn(ulong);
+double __ovld __cnfn convert_double_rtn(ushort);
+double __ovld __cnfn convert_double_rtp(char);
+double __ovld __cnfn convert_double_rtp(double);
+double __ovld __cnfn convert_double_rtp(float);
+double __ovld __cnfn convert_double_rtp(int);
+double __ovld __cnfn convert_double_rtp(long);
+double __ovld __cnfn convert_double_rtp(short);
+double __ovld __cnfn convert_double_rtp(uchar);
+double __ovld __cnfn convert_double_rtp(uint);
+double __ovld __cnfn convert_double_rtp(ulong);
+double __ovld __cnfn convert_double_rtp(ushort);
+double __ovld __cnfn convert_double_rtz(char);
+double __ovld __cnfn convert_double_rtz(double);
+double __ovld __cnfn convert_double_rtz(float);
+double __ovld __cnfn convert_double_rtz(int);
+double __ovld __cnfn convert_double_rtz(long);
+double __ovld __cnfn convert_double_rtz(short);
+double __ovld __cnfn convert_double_rtz(uchar);
+double __ovld __cnfn convert_double_rtz(uint);
+double __ovld __cnfn convert_double_rtz(ulong);
+double __ovld __cnfn convert_double_rtz(ushort);
+double2 __ovld __cnfn convert_double2(char2);
+double2 __ovld __cnfn convert_double2(double2);
+double2 __ovld __cnfn convert_double2(float2);
+double2 __ovld __cnfn convert_double2(int2);
+double2 __ovld __cnfn convert_double2(long2);
+double2 __ovld __cnfn convert_double2(short2);
+double2 __ovld __cnfn convert_double2(uchar2);
+double2 __ovld __cnfn convert_double2(uint2);
+double2 __ovld __cnfn convert_double2(ulong2);
+double2 __ovld __cnfn convert_double2(ushort2);
+double2 __ovld __cnfn convert_double2_rte(char2);
+double2 __ovld __cnfn convert_double2_rte(double2);
+double2 __ovld __cnfn convert_double2_rte(float2);
+double2 __ovld __cnfn convert_double2_rte(int2);
+double2 __ovld __cnfn convert_double2_rte(long2);
+double2 __ovld __cnfn convert_double2_rte(short2);
+double2 __ovld __cnfn convert_double2_rte(uchar2);
+double2 __ovld __cnfn convert_double2_rte(uint2);
+double2 __ovld __cnfn convert_double2_rte(ulong2);
+double2 __ovld __cnfn convert_double2_rte(ushort2);
+double2 __ovld __cnfn convert_double2_rtn(char2);
+double2 __ovld __cnfn convert_double2_rtn(double2);
+double2 __ovld __cnfn convert_double2_rtn(float2);
+double2 __ovld __cnfn convert_double2_rtn(int2);
+double2 __ovld __cnfn convert_double2_rtn(long2);
+double2 __ovld __cnfn convert_double2_rtn(short2);
+double2 __ovld __cnfn convert_double2_rtn(uchar2);
+double2 __ovld __cnfn convert_double2_rtn(uint2);
+double2 __ovld __cnfn convert_double2_rtn(ulong2);
+double2 __ovld __cnfn convert_double2_rtn(ushort2);
+double2 __ovld __cnfn convert_double2_rtp(char2);
+double2 __ovld __cnfn convert_double2_rtp(double2);
+double2 __ovld __cnfn convert_double2_rtp(float2);
+double2 __ovld __cnfn convert_double2_rtp(int2);
+double2 __ovld __cnfn convert_double2_rtp(long2);
+double2 __ovld __cnfn convert_double2_rtp(short2);
+double2 __ovld __cnfn convert_double2_rtp(uchar2);
+double2 __ovld __cnfn convert_double2_rtp(uint2);
+double2 __ovld __cnfn convert_double2_rtp(ulong2);
+double2 __ovld __cnfn convert_double2_rtp(ushort2);
+double2 __ovld __cnfn convert_double2_rtz(char2);
+double2 __ovld __cnfn convert_double2_rtz(double2);
+double2 __ovld __cnfn convert_double2_rtz(float2);
+double2 __ovld __cnfn convert_double2_rtz(int2);
+double2 __ovld __cnfn convert_double2_rtz(long2);
+double2 __ovld __cnfn convert_double2_rtz(short2);
+double2 __ovld __cnfn convert_double2_rtz(uchar2);
+double2 __ovld __cnfn convert_double2_rtz(uint2);
+double2 __ovld __cnfn convert_double2_rtz(ulong2);
+double2 __ovld __cnfn convert_double2_rtz(ushort2);
+double3 __ovld __cnfn convert_double3(char3);
+double3 __ovld __cnfn convert_double3(double3);
+double3 __ovld __cnfn convert_double3(float3);
+double3 __ovld __cnfn convert_double3(int3);
+double3 __ovld __cnfn convert_double3(long3);
+double3 __ovld __cnfn convert_double3(short3);
+double3 __ovld __cnfn convert_double3(uchar3);
+double3 __ovld __cnfn convert_double3(uint3);
+double3 __ovld __cnfn convert_double3(ulong3);
+double3 __ovld __cnfn convert_double3(ushort3);
+double3 __ovld __cnfn convert_double3_rte(char3);
+double3 __ovld __cnfn convert_double3_rte(double3);
+double3 __ovld __cnfn convert_double3_rte(float3);
+double3 __ovld __cnfn convert_double3_rte(int3);
+double3 __ovld __cnfn convert_double3_rte(long3);
+double3 __ovld __cnfn convert_double3_rte(short3);
+double3 __ovld __cnfn convert_double3_rte(uchar3);
+double3 __ovld __cnfn convert_double3_rte(uint3);
+double3 __ovld __cnfn convert_double3_rte(ulong3);
+double3 __ovld __cnfn convert_double3_rte(ushort3);
+double3 __ovld __cnfn convert_double3_rtn(char3);
+double3 __ovld __cnfn convert_double3_rtn(double3);
+double3 __ovld __cnfn convert_double3_rtn(float3);
+double3 __ovld __cnfn convert_double3_rtn(int3);
+double3 __ovld __cnfn convert_double3_rtn(long3);
+double3 __ovld __cnfn convert_double3_rtn(short3);
+double3 __ovld __cnfn convert_double3_rtn(uchar3);
+double3 __ovld __cnfn convert_double3_rtn(uint3);
+double3 __ovld __cnfn convert_double3_rtn(ulong3);
+double3 __ovld __cnfn convert_double3_rtn(ushort3);
+double3 __ovld __cnfn convert_double3_rtp(char3);
+double3 __ovld __cnfn convert_double3_rtp(double3);
+double3 __ovld __cnfn convert_double3_rtp(float3);
+double3 __ovld __cnfn convert_double3_rtp(int3);
+double3 __ovld __cnfn convert_double3_rtp(long3);
+double3 __ovld __cnfn convert_double3_rtp(short3);
+double3 __ovld __cnfn convert_double3_rtp(uchar3);
+double3 __ovld __cnfn convert_double3_rtp(uint3);
+double3 __ovld __cnfn convert_double3_rtp(ulong3);
+double3 __ovld __cnfn convert_double3_rtp(ushort3);
+double3 __ovld __cnfn convert_double3_rtz(char3);
+double3 __ovld __cnfn convert_double3_rtz(double3);
+double3 __ovld __cnfn convert_double3_rtz(float3);
+double3 __ovld __cnfn convert_double3_rtz(int3);
+double3 __ovld __cnfn convert_double3_rtz(long3);
+double3 __ovld __cnfn convert_double3_rtz(short3);
+double3 __ovld __cnfn convert_double3_rtz(uchar3);
+double3 __ovld __cnfn convert_double3_rtz(uint3);
+double3 __ovld __cnfn convert_double3_rtz(ulong3);
+double3 __ovld __cnfn convert_double3_rtz(ushort3);
+double4 __ovld __cnfn convert_double4(char4);
+double4 __ovld __cnfn convert_double4(double4);
+double4 __ovld __cnfn convert_double4(float4);
+double4 __ovld __cnfn convert_double4(int4);
+double4 __ovld __cnfn convert_double4(long4);
+double4 __ovld __cnfn convert_double4(short4);
+double4 __ovld __cnfn convert_double4(uchar4);
+double4 __ovld __cnfn convert_double4(uint4);
+double4 __ovld __cnfn convert_double4(ulong4);
+double4 __ovld __cnfn convert_double4(ushort4);
+double4 __ovld __cnfn convert_double4_rte(char4);
+double4 __ovld __cnfn convert_double4_rte(double4);
+double4 __ovld __cnfn convert_double4_rte(float4);
+double4 __ovld __cnfn convert_double4_rte(int4);
+double4 __ovld __cnfn convert_double4_rte(long4);
+double4 __ovld __cnfn convert_double4_rte(short4);
+double4 __ovld __cnfn convert_double4_rte(uchar4);
+double4 __ovld __cnfn convert_double4_rte(uint4);
+double4 __ovld __cnfn convert_double4_rte(ulong4);
+double4 __ovld __cnfn convert_double4_rte(ushort4);
+double4 __ovld __cnfn convert_double4_rtn(char4);
+double4 __ovld __cnfn convert_double4_rtn(double4);
+double4 __ovld __cnfn convert_double4_rtn(float4);
+double4 __ovld __cnfn convert_double4_rtn(int4);
+double4 __ovld __cnfn convert_double4_rtn(long4);
+double4 __ovld __cnfn convert_double4_rtn(short4);
+double4 __ovld __cnfn convert_double4_rtn(uchar4);
+double4 __ovld __cnfn convert_double4_rtn(uint4);
+double4 __ovld __cnfn convert_double4_rtn(ulong4);
+double4 __ovld __cnfn convert_double4_rtn(ushort4);
+double4 __ovld __cnfn convert_double4_rtp(char4);
+double4 __ovld __cnfn convert_double4_rtp(double4);
+double4 __ovld __cnfn convert_double4_rtp(float4);
+double4 __ovld __cnfn convert_double4_rtp(int4);
+double4 __ovld __cnfn convert_double4_rtp(long4);
+double4 __ovld __cnfn convert_double4_rtp(short4);
+double4 __ovld __cnfn convert_double4_rtp(uchar4);
+double4 __ovld __cnfn convert_double4_rtp(uint4);
+double4 __ovld __cnfn convert_double4_rtp(ulong4);
+double4 __ovld __cnfn convert_double4_rtp(ushort4);
+double4 __ovld __cnfn convert_double4_rtz(char4);
+double4 __ovld __cnfn convert_double4_rtz(double4);
+double4 __ovld __cnfn convert_double4_rtz(float4);
+double4 __ovld __cnfn convert_double4_rtz(int4);
+double4 __ovld __cnfn convert_double4_rtz(long4);
+double4 __ovld __cnfn convert_double4_rtz(short4);
+double4 __ovld __cnfn convert_double4_rtz(uchar4);
+double4 __ovld __cnfn convert_double4_rtz(uint4);
+double4 __ovld __cnfn convert_double4_rtz(ulong4);
+double4 __ovld __cnfn convert_double4_rtz(ushort4);
+double8 __ovld __cnfn convert_double8(char8);
+double8 __ovld __cnfn convert_double8(double8);
+double8 __ovld __cnfn convert_double8(float8);
+double8 __ovld __cnfn convert_double8(int8);
+double8 __ovld __cnfn convert_double8(long8);
+double8 __ovld __cnfn convert_double8(short8);
+double8 __ovld __cnfn convert_double8(uchar8);
+double8 __ovld __cnfn convert_double8(uint8);
+double8 __ovld __cnfn convert_double8(ulong8);
+double8 __ovld __cnfn convert_double8(ushort8);
+double8 __ovld __cnfn convert_double8_rte(char8);
+double8 __ovld __cnfn convert_double8_rte(double8);
+double8 __ovld __cnfn convert_double8_rte(float8);
+double8 __ovld __cnfn convert_double8_rte(int8);
+double8 __ovld __cnfn convert_double8_rte(long8);
+double8 __ovld __cnfn convert_double8_rte(short8);
+double8 __ovld __cnfn convert_double8_rte(uchar8);
+double8 __ovld __cnfn convert_double8_rte(uint8);
+double8 __ovld __cnfn convert_double8_rte(ulong8);
+double8 __ovld __cnfn convert_double8_rte(ushort8);
+double8 __ovld __cnfn convert_double8_rtn(char8);
+double8 __ovld __cnfn convert_double8_rtn(double8);
+double8 __ovld __cnfn convert_double8_rtn(float8);
+double8 __ovld __cnfn convert_double8_rtn(int8);
+double8 __ovld __cnfn convert_double8_rtn(long8);
+double8 __ovld __cnfn convert_double8_rtn(short8);
+double8 __ovld __cnfn convert_double8_rtn(uchar8);
+double8 __ovld __cnfn convert_double8_rtn(uint8);
+double8 __ovld __cnfn convert_double8_rtn(ulong8);
+double8 __ovld __cnfn convert_double8_rtn(ushort8);
+double8 __ovld __cnfn convert_double8_rtp(char8);
+double8 __ovld __cnfn convert_double8_rtp(double8);
+double8 __ovld __cnfn convert_double8_rtp(float8);
+double8 __ovld __cnfn convert_double8_rtp(int8);
+double8 __ovld __cnfn convert_double8_rtp(long8);
+double8 __ovld __cnfn convert_double8_rtp(short8);
+double8 __ovld __cnfn convert_double8_rtp(uchar8);
+double8 __ovld __cnfn convert_double8_rtp(uint8);
+double8 __ovld __cnfn convert_double8_rtp(ulong8);
+double8 __ovld __cnfn convert_double8_rtp(ushort8);
+double8 __ovld __cnfn convert_double8_rtz(char8);
+double8 __ovld __cnfn convert_double8_rtz(double8);
+double8 __ovld __cnfn convert_double8_rtz(float8);
+double8 __ovld __cnfn convert_double8_rtz(int8);
+double8 __ovld __cnfn convert_double8_rtz(long8);
+double8 __ovld __cnfn convert_double8_rtz(short8);
+double8 __ovld __cnfn convert_double8_rtz(uchar8);
+double8 __ovld __cnfn convert_double8_rtz(uint8);
+double8 __ovld __cnfn convert_double8_rtz(ulong8);
+double8 __ovld __cnfn convert_double8_rtz(ushort8);
+double16 __ovld __cnfn convert_double16(char16);
+double16 __ovld __cnfn convert_double16(double16);
+double16 __ovld __cnfn convert_double16(float16);
+double16 __ovld __cnfn convert_double16(int16);
+double16 __ovld __cnfn convert_double16(long16);
+double16 __ovld __cnfn convert_double16(short16);
+double16 __ovld __cnfn convert_double16(uchar16);
+double16 __ovld __cnfn convert_double16(uint16);
+double16 __ovld __cnfn convert_double16(ulong16);
+double16 __ovld __cnfn convert_double16(ushort16);
+double16 __ovld __cnfn convert_double16_rte(char16);
+double16 __ovld __cnfn convert_double16_rte(double16);
+double16 __ovld __cnfn convert_double16_rte(float16);
+double16 __ovld __cnfn convert_double16_rte(int16);
+double16 __ovld __cnfn convert_double16_rte(long16);
+double16 __ovld __cnfn convert_double16_rte(short16);
+double16 __ovld __cnfn convert_double16_rte(uchar16);
+double16 __ovld __cnfn convert_double16_rte(uint16);
+double16 __ovld __cnfn convert_double16_rte(ulong16);
+double16 __ovld __cnfn convert_double16_rte(ushort16);
+double16 __ovld __cnfn convert_double16_rtn(char16);
+double16 __ovld __cnfn convert_double16_rtn(double16);
+double16 __ovld __cnfn convert_double16_rtn(float16);
+double16 __ovld __cnfn convert_double16_rtn(int16);
+double16 __ovld __cnfn convert_double16_rtn(long16);
+double16 __ovld __cnfn convert_double16_rtn(short16);
+double16 __ovld __cnfn convert_double16_rtn(uchar16);
+double16 __ovld __cnfn convert_double16_rtn(uint16);
+double16 __ovld __cnfn convert_double16_rtn(ulong16);
+double16 __ovld __cnfn convert_double16_rtn(ushort16);
+double16 __ovld __cnfn convert_double16_rtp(char16);
+double16 __ovld __cnfn convert_double16_rtp(double16);
+double16 __ovld __cnfn convert_double16_rtp(float16);
+double16 __ovld __cnfn convert_double16_rtp(int16);
+double16 __ovld __cnfn convert_double16_rtp(long16);
+double16 __ovld __cnfn convert_double16_rtp(short16);
+double16 __ovld __cnfn convert_double16_rtp(uchar16);
+double16 __ovld __cnfn convert_double16_rtp(uint16);
+double16 __ovld __cnfn convert_double16_rtp(ulong16);
+double16 __ovld __cnfn convert_double16_rtp(ushort16);
+double16 __ovld __cnfn convert_double16_rtz(char16);
+double16 __ovld __cnfn convert_double16_rtz(double16);
+double16 __ovld __cnfn convert_double16_rtz(float16);
+double16 __ovld __cnfn convert_double16_rtz(int16);
+double16 __ovld __cnfn convert_double16_rtz(long16);
+double16 __ovld __cnfn convert_double16_rtz(short16);
+double16 __ovld __cnfn convert_double16_rtz(uchar16);
+double16 __ovld __cnfn convert_double16_rtz(uint16);
+double16 __ovld __cnfn convert_double16_rtz(ulong16);
+double16 __ovld __cnfn convert_double16_rtz(ushort16);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+// Convert half types to non-double types.
+uchar __ovld __cnfn convert_uchar(half);
+uchar __ovld __cnfn convert_uchar_rte(half);
+uchar __ovld __cnfn convert_uchar_rtp(half);
+uchar __ovld __cnfn convert_uchar_rtn(half);
+uchar __ovld __cnfn convert_uchar_rtz(half);
+uchar __ovld __cnfn convert_uchar_sat(half);
+uchar __ovld __cnfn convert_uchar_sat_rte(half);
+uchar __ovld __cnfn convert_uchar_sat_rtp(half);
+uchar __ovld __cnfn convert_uchar_sat_rtn(half);
+uchar __ovld __cnfn convert_uchar_sat_rtz(half);
+uchar2 __ovld __cnfn convert_uchar2(half2);
+uchar2 __ovld __cnfn convert_uchar2_rte(half2);
+uchar2 __ovld __cnfn convert_uchar2_rtp(half2);
+uchar2 __ovld __cnfn convert_uchar2_rtn(half2);
+uchar2 __ovld __cnfn convert_uchar2_rtz(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rte(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtp(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtn(half2);
+uchar2 __ovld __cnfn convert_uchar2_sat_rtz(half2);
+uchar3 __ovld __cnfn convert_uchar3(half3);
+uchar3 __ovld __cnfn convert_uchar3_rte(half3);
+uchar3 __ovld __cnfn convert_uchar3_rtp(half3);
+uchar3 __ovld __cnfn convert_uchar3_rtn(half3);
+uchar3 __ovld __cnfn convert_uchar3_rtz(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rte(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtp(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtn(half3);
+uchar3 __ovld __cnfn convert_uchar3_sat_rtz(half3);
+uchar4 __ovld __cnfn convert_uchar4(half4);
+uchar4 __ovld __cnfn convert_uchar4_rte(half4);
+uchar4 __ovld __cnfn convert_uchar4_rtp(half4);
+uchar4 __ovld __cnfn convert_uchar4_rtn(half4);
+uchar4 __ovld __cnfn convert_uchar4_rtz(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rte(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtp(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtn(half4);
+uchar4 __ovld __cnfn convert_uchar4_sat_rtz(half4);
+uchar8 __ovld __cnfn convert_uchar8(half8);
+uchar8 __ovld __cnfn convert_uchar8_rte(half8);
+uchar8 __ovld __cnfn convert_uchar8_rtp(half8);
+uchar8 __ovld __cnfn convert_uchar8_rtn(half8);
+uchar8 __ovld __cnfn convert_uchar8_rtz(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rte(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtp(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtn(half8);
+uchar8 __ovld __cnfn convert_uchar8_sat_rtz(half8);
+uchar16 __ovld __cnfn convert_uchar16(half16);
+uchar16 __ovld __cnfn convert_uchar16_rte(half16);
+uchar16 __ovld __cnfn convert_uchar16_rtp(half16);
+uchar16 __ovld __cnfn convert_uchar16_rtn(half16);
+uchar16 __ovld __cnfn convert_uchar16_rtz(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rte(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtp(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtn(half16);
+uchar16 __ovld __cnfn convert_uchar16_sat_rtz(half16);
+ushort __ovld __cnfn convert_ushort(half);
+ushort __ovld __cnfn convert_ushort_rte(half);
+ushort __ovld __cnfn convert_ushort_rtp(half);
+ushort __ovld __cnfn convert_ushort_rtn(half);
+ushort __ovld __cnfn convert_ushort_rtz(half);
+ushort __ovld __cnfn convert_ushort_sat(half);
+ushort __ovld __cnfn convert_ushort_sat_rte(half);
+ushort __ovld __cnfn convert_ushort_sat_rtp(half);
+ushort __ovld __cnfn convert_ushort_sat_rtn(half);
+ushort __ovld __cnfn convert_ushort_sat_rtz(half);
+ushort2 __ovld __cnfn convert_ushort2(half2);
+ushort2 __ovld __cnfn convert_ushort2_rte(half2);
+ushort2 __ovld __cnfn convert_ushort2_rtp(half2);
+ushort2 __ovld __cnfn convert_ushort2_rtn(half2);
+ushort2 __ovld __cnfn convert_ushort2_rtz(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rte(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtp(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtn(half2);
+ushort2 __ovld __cnfn convert_ushort2_sat_rtz(half2);
+ushort3 __ovld __cnfn convert_ushort3(half3);
+ushort3 __ovld __cnfn convert_ushort3_rte(half3);
+ushort3 __ovld __cnfn convert_ushort3_rtp(half3);
+ushort3 __ovld __cnfn convert_ushort3_rtn(half3);
+ushort3 __ovld __cnfn convert_ushort3_rtz(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rte(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtp(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtn(half3);
+ushort3 __ovld __cnfn convert_ushort3_sat_rtz(half3);
+ushort4 __ovld __cnfn convert_ushort4(half4);
+ushort4 __ovld __cnfn convert_ushort4_rte(half4);
+ushort4 __ovld __cnfn convert_ushort4_rtp(half4);
+ushort4 __ovld __cnfn convert_ushort4_rtn(half4);
+ushort4 __ovld __cnfn convert_ushort4_rtz(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rte(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtp(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtn(half4);
+ushort4 __ovld __cnfn convert_ushort4_sat_rtz(half4);
+ushort8 __ovld __cnfn convert_ushort8(half8);
+ushort8 __ovld __cnfn convert_ushort8_rte(half8);
+ushort8 __ovld __cnfn convert_ushort8_rtp(half8);
+ushort8 __ovld __cnfn convert_ushort8_rtn(half8);
+ushort8 __ovld __cnfn convert_ushort8_rtz(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rte(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtp(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtn(half8);
+ushort8 __ovld __cnfn convert_ushort8_sat_rtz(half8);
+ushort16 __ovld __cnfn convert_ushort16(half16);
+ushort16 __ovld __cnfn convert_ushort16_rte(half16);
+ushort16 __ovld __cnfn convert_ushort16_rtp(half16);
+ushort16 __ovld __cnfn convert_ushort16_rtn(half16);
+ushort16 __ovld __cnfn convert_ushort16_rtz(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rte(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtp(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtn(half16);
+ushort16 __ovld __cnfn convert_ushort16_sat_rtz(half16);
+uint __ovld __cnfn convert_uint(half);
+uint __ovld __cnfn convert_uint_rte(half);
+uint __ovld __cnfn convert_uint_rtp(half);
+uint __ovld __cnfn convert_uint_rtn(half);
+uint __ovld __cnfn convert_uint_rtz(half);
+uint __ovld __cnfn convert_uint_sat(half);
+uint __ovld __cnfn convert_uint_sat_rte(half);
+uint __ovld __cnfn convert_uint_sat_rtp(half);
+uint __ovld __cnfn convert_uint_sat_rtn(half);
+uint __ovld __cnfn convert_uint_sat_rtz(half);
+uint2 __ovld __cnfn convert_uint2(half2);
+uint2 __ovld __cnfn convert_uint2_rte(half2);
+uint2 __ovld __cnfn convert_uint2_rtp(half2);
+uint2 __ovld __cnfn convert_uint2_rtn(half2);
+uint2 __ovld __cnfn convert_uint2_rtz(half2);
+uint2 __ovld __cnfn convert_uint2_sat(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rte(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rtp(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rtn(half2);
+uint2 __ovld __cnfn convert_uint2_sat_rtz(half2);
+uint3 __ovld __cnfn convert_uint3(half3);
+uint3 __ovld __cnfn convert_uint3_rte(half3);
+uint3 __ovld __cnfn convert_uint3_rtp(half3);
+uint3 __ovld __cnfn convert_uint3_rtn(half3);
+uint3 __ovld __cnfn convert_uint3_rtz(half3);
+uint3 __ovld __cnfn convert_uint3_sat(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rte(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rtp(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rtn(half3);
+uint3 __ovld __cnfn convert_uint3_sat_rtz(half3);
+uint4 __ovld __cnfn convert_uint4(half4);
+uint4 __ovld __cnfn convert_uint4_rte(half4);
+uint4 __ovld __cnfn convert_uint4_rtp(half4);
+uint4 __ovld __cnfn convert_uint4_rtn(half4);
+uint4 __ovld __cnfn convert_uint4_rtz(half4);
+uint4 __ovld __cnfn convert_uint4_sat(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rte(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rtp(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rtn(half4);
+uint4 __ovld __cnfn convert_uint4_sat_rtz(half4);
+uint8 __ovld __cnfn convert_uint8(half8);
+uint8 __ovld __cnfn convert_uint8_rte(half8);
+uint8 __ovld __cnfn convert_uint8_rtp(half8);
+uint8 __ovld __cnfn convert_uint8_rtn(half8);
+uint8 __ovld __cnfn convert_uint8_rtz(half8);
+uint8 __ovld __cnfn convert_uint8_sat(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rte(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rtp(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rtn(half8);
+uint8 __ovld __cnfn convert_uint8_sat_rtz(half8);
+uint16 __ovld __cnfn convert_uint16(half16);
+uint16 __ovld __cnfn convert_uint16_rte(half16);
+uint16 __ovld __cnfn convert_uint16_rtp(half16);
+uint16 __ovld __cnfn convert_uint16_rtn(half16);
+uint16 __ovld __cnfn convert_uint16_rtz(half16);
+uint16 __ovld __cnfn convert_uint16_sat(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rte(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rtp(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rtn(half16);
+uint16 __ovld __cnfn convert_uint16_sat_rtz(half16);
+ulong __ovld __cnfn convert_ulong(half);
+ulong __ovld __cnfn convert_ulong_rte(half);
+ulong __ovld __cnfn convert_ulong_rtp(half);
+ulong __ovld __cnfn convert_ulong_rtn(half);
+ulong __ovld __cnfn convert_ulong_rtz(half);
+ulong __ovld __cnfn convert_ulong_sat(half);
+ulong __ovld __cnfn convert_ulong_sat_rte(half);
+ulong __ovld __cnfn convert_ulong_sat_rtp(half);
+ulong __ovld __cnfn convert_ulong_sat_rtn(half);
+ulong __ovld __cnfn convert_ulong_sat_rtz(half);
+ulong2 __ovld __cnfn convert_ulong2(half2);
+ulong2 __ovld __cnfn convert_ulong2_rte(half2);
+ulong2 __ovld __cnfn convert_ulong2_rtp(half2);
+ulong2 __ovld __cnfn convert_ulong2_rtn(half2);
+ulong2 __ovld __cnfn convert_ulong2_rtz(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rte(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtp(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtn(half2);
+ulong2 __ovld __cnfn convert_ulong2_sat_rtz(half2);
+ulong3 __ovld __cnfn convert_ulong3(half3);
+ulong3 __ovld __cnfn convert_ulong3_rte(half3);
+ulong3 __ovld __cnfn convert_ulong3_rtp(half3);
+ulong3 __ovld __cnfn convert_ulong3_rtn(half3);
+ulong3 __ovld __cnfn convert_ulong3_rtz(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rte(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtp(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtn(half3);
+ulong3 __ovld __cnfn convert_ulong3_sat_rtz(half3);
+ulong4 __ovld __cnfn convert_ulong4(half4);
+ulong4 __ovld __cnfn convert_ulong4_rte(half4);
+ulong4 __ovld __cnfn convert_ulong4_rtp(half4);
+ulong4 __ovld __cnfn convert_ulong4_rtn(half4);
+ulong4 __ovld __cnfn convert_ulong4_rtz(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rte(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtp(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtn(half4);
+ulong4 __ovld __cnfn convert_ulong4_sat_rtz(half4);
+ulong8 __ovld __cnfn convert_ulong8(half8);
+ulong8 __ovld __cnfn convert_ulong8_rte(half8);
+ulong8 __ovld __cnfn convert_ulong8_rtp(half8);
+ulong8 __ovld __cnfn convert_ulong8_rtn(half8);
+ulong8 __ovld __cnfn convert_ulong8_rtz(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rte(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtp(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtn(half8);
+ulong8 __ovld __cnfn convert_ulong8_sat_rtz(half8);
+ulong16 __ovld __cnfn convert_ulong16(half16);
+ulong16 __ovld __cnfn convert_ulong16_rte(half16);
+ulong16 __ovld __cnfn convert_ulong16_rtp(half16);
+ulong16 __ovld __cnfn convert_ulong16_rtn(half16);
+ulong16 __ovld __cnfn convert_ulong16_rtz(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rte(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtp(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtn(half16);
+ulong16 __ovld __cnfn convert_ulong16_sat_rtz(half16);
+char __ovld __cnfn convert_char(half);
+char __ovld __cnfn convert_char_rte(half);
+char __ovld __cnfn convert_char_rtp(half);
+char __ovld __cnfn convert_char_rtn(half);
+char __ovld __cnfn convert_char_rtz(half);
+char __ovld __cnfn convert_char_sat(half);
+char __ovld __cnfn convert_char_sat_rte(half);
+char __ovld __cnfn convert_char_sat_rtp(half);
+char __ovld __cnfn convert_char_sat_rtn(half);
+char __ovld __cnfn convert_char_sat_rtz(half);
+char2 __ovld __cnfn convert_char2(half2);
+char2 __ovld __cnfn convert_char2_rte(half2);
+char2 __ovld __cnfn convert_char2_rtp(half2);
+char2 __ovld __cnfn convert_char2_rtn(half2);
+char2 __ovld __cnfn convert_char2_rtz(half2);
+char2 __ovld __cnfn convert_char2_sat(half2);
+char2 __ovld __cnfn convert_char2_sat_rte(half2);
+char2 __ovld __cnfn convert_char2_sat_rtp(half2);
+char2 __ovld __cnfn convert_char2_sat_rtn(half2);
+char2 __ovld __cnfn convert_char2_sat_rtz(half2);
+char3 __ovld __cnfn convert_char3(half3);
+char3 __ovld __cnfn convert_char3_rte(half3);
+char3 __ovld __cnfn convert_char3_rtp(half3);
+char3 __ovld __cnfn convert_char3_rtn(half3);
+char3 __ovld __cnfn convert_char3_rtz(half3);
+char3 __ovld __cnfn convert_char3_sat(half3);
+char3 __ovld __cnfn convert_char3_sat_rte(half3);
+char3 __ovld __cnfn convert_char3_sat_rtp(half3);
+char3 __ovld __cnfn convert_char3_sat_rtn(half3);
+char3 __ovld __cnfn convert_char3_sat_rtz(half3);
+char4 __ovld __cnfn convert_char4(half4);
+char4 __ovld __cnfn convert_char4_rte(half4);
+char4 __ovld __cnfn convert_char4_rtp(half4);
+char4 __ovld __cnfn convert_char4_rtn(half4);
+char4 __ovld __cnfn convert_char4_rtz(half4);
+char4 __ovld __cnfn convert_char4_sat(half4);
+char4 __ovld __cnfn convert_char4_sat_rte(half4);
+char4 __ovld __cnfn convert_char4_sat_rtp(half4);
+char4 __ovld __cnfn convert_char4_sat_rtn(half4);
+char4 __ovld __cnfn convert_char4_sat_rtz(half4);
+char8 __ovld __cnfn convert_char8(half8);
+char8 __ovld __cnfn convert_char8_rte(half8);
+char8 __ovld __cnfn convert_char8_rtp(half8);
+char8 __ovld __cnfn convert_char8_rtn(half8);
+char8 __ovld __cnfn convert_char8_rtz(half8);
+char8 __ovld __cnfn convert_char8_sat(half8);
+char8 __ovld __cnfn convert_char8_sat_rte(half8);
+char8 __ovld __cnfn convert_char8_sat_rtp(half8);
+char8 __ovld __cnfn convert_char8_sat_rtn(half8);
+char8 __ovld __cnfn convert_char8_sat_rtz(half8);
+char16 __ovld __cnfn convert_char16(half16);
+char16 __ovld __cnfn convert_char16_rte(half16);
+char16 __ovld __cnfn convert_char16_rtp(half16);
+char16 __ovld __cnfn convert_char16_rtn(half16);
+char16 __ovld __cnfn convert_char16_rtz(half16);
+char16 __ovld __cnfn convert_char16_sat(half16);
+char16 __ovld __cnfn convert_char16_sat_rte(half16);
+char16 __ovld __cnfn convert_char16_sat_rtp(half16);
+char16 __ovld __cnfn convert_char16_sat_rtn(half16);
+char16 __ovld __cnfn convert_char16_sat_rtz(half16);
+short __ovld __cnfn convert_short(half);
+short __ovld __cnfn convert_short_rte(half);
+short __ovld __cnfn convert_short_rtp(half);
+short __ovld __cnfn convert_short_rtn(half);
+short __ovld __cnfn convert_short_rtz(half);
+short __ovld __cnfn convert_short_sat(half);
+short __ovld __cnfn convert_short_sat_rte(half);
+short __ovld __cnfn convert_short_sat_rtp(half);
+short __ovld __cnfn convert_short_sat_rtn(half);
+short __ovld __cnfn convert_short_sat_rtz(half);
+short2 __ovld __cnfn convert_short2(half2);
+short2 __ovld __cnfn convert_short2_rte(half2);
+short2 __ovld __cnfn convert_short2_rtp(half2);
+short2 __ovld __cnfn convert_short2_rtn(half2);
+short2 __ovld __cnfn convert_short2_rtz(half2);
+short2 __ovld __cnfn convert_short2_sat(half2);
+short2 __ovld __cnfn convert_short2_sat_rte(half2);
+short2 __ovld __cnfn convert_short2_sat_rtp(half2);
+short2 __ovld __cnfn convert_short2_sat_rtn(half2);
+short2 __ovld __cnfn convert_short2_sat_rtz(half2);
+short3 __ovld __cnfn convert_short3(half3);
+short3 __ovld __cnfn convert_short3_rte(half3);
+short3 __ovld __cnfn convert_short3_rtp(half3);
+short3 __ovld __cnfn convert_short3_rtn(half3);
+short3 __ovld __cnfn convert_short3_rtz(half3);
+short3 __ovld __cnfn convert_short3_sat(half3);
+short3 __ovld __cnfn convert_short3_sat_rte(half3);
+short3 __ovld __cnfn convert_short3_sat_rtp(half3);
+short3 __ovld __cnfn convert_short3_sat_rtn(half3);
+short3 __ovld __cnfn convert_short3_sat_rtz(half3);
+short4 __ovld __cnfn convert_short4(half4);
+short4 __ovld __cnfn convert_short4_rte(half4);
+short4 __ovld __cnfn convert_short4_rtp(half4);
+short4 __ovld __cnfn convert_short4_rtn(half4);
+short4 __ovld __cnfn convert_short4_rtz(half4);
+short4 __ovld __cnfn convert_short4_sat(half4);
+short4 __ovld __cnfn convert_short4_sat_rte(half4);
+short4 __ovld __cnfn convert_short4_sat_rtp(half4);
+short4 __ovld __cnfn convert_short4_sat_rtn(half4);
+short4 __ovld __cnfn convert_short4_sat_rtz(half4);
+short8 __ovld __cnfn convert_short8(half8);
+short8 __ovld __cnfn convert_short8_rte(half8);
+short8 __ovld __cnfn convert_short8_rtp(half8);
+short8 __ovld __cnfn convert_short8_rtn(half8);
+short8 __ovld __cnfn convert_short8_rtz(half8);
+short8 __ovld __cnfn convert_short8_sat(half8);
+short8 __ovld __cnfn convert_short8_sat_rte(half8);
+short8 __ovld __cnfn convert_short8_sat_rtp(half8);
+short8 __ovld __cnfn convert_short8_sat_rtn(half8);
+short8 __ovld __cnfn convert_short8_sat_rtz(half8);
+short16 __ovld __cnfn convert_short16(half16);
+short16 __ovld __cnfn convert_short16_rte(half16);
+short16 __ovld __cnfn convert_short16_rtp(half16);
+short16 __ovld __cnfn convert_short16_rtn(half16);
+short16 __ovld __cnfn convert_short16_rtz(half16);
+short16 __ovld __cnfn convert_short16_sat(half16);
+short16 __ovld __cnfn convert_short16_sat_rte(half16);
+short16 __ovld __cnfn convert_short16_sat_rtp(half16);
+short16 __ovld __cnfn convert_short16_sat_rtn(half16);
+short16 __ovld __cnfn convert_short16_sat_rtz(half16);
+int __ovld __cnfn convert_int(half);
+int __ovld __cnfn convert_int_rte(half);
+int __ovld __cnfn convert_int_rtp(half);
+int __ovld __cnfn convert_int_rtn(half);
+int __ovld __cnfn convert_int_rtz(half);
+int __ovld __cnfn convert_int_sat(half);
+int __ovld __cnfn convert_int_sat_rte(half);
+int __ovld __cnfn convert_int_sat_rtp(half);
+int __ovld __cnfn convert_int_sat_rtn(half);
+int __ovld __cnfn convert_int_sat_rtz(half);
+int2 __ovld __cnfn convert_int2(half2);
+int2 __ovld __cnfn convert_int2_rte(half2);
+int2 __ovld __cnfn convert_int2_rtp(half2);
+int2 __ovld __cnfn convert_int2_rtn(half2);
+int2 __ovld __cnfn convert_int2_rtz(half2);
+int2 __ovld __cnfn convert_int2_sat(half2);
+int2 __ovld __cnfn convert_int2_sat_rte(half2);
+int2 __ovld __cnfn convert_int2_sat_rtp(half2);
+int2 __ovld __cnfn convert_int2_sat_rtn(half2);
+int2 __ovld __cnfn convert_int2_sat_rtz(half2);
+int3 __ovld __cnfn convert_int3(half3);
+int3 __ovld __cnfn convert_int3_rte(half3);
+int3 __ovld __cnfn convert_int3_rtp(half3);
+int3 __ovld __cnfn convert_int3_rtn(half3);
+int3 __ovld __cnfn convert_int3_rtz(half3);
+int3 __ovld __cnfn convert_int3_sat(half3);
+int3 __ovld __cnfn convert_int3_sat_rte(half3);
+int3 __ovld __cnfn convert_int3_sat_rtp(half3);
+int3 __ovld __cnfn convert_int3_sat_rtn(half3);
+int3 __ovld __cnfn convert_int3_sat_rtz(half3);
+int4 __ovld __cnfn convert_int4(half4);
+int4 __ovld __cnfn convert_int4_rte(half4);
+int4 __ovld __cnfn convert_int4_rtp(half4);
+int4 __ovld __cnfn convert_int4_rtn(half4);
+int4 __ovld __cnfn convert_int4_rtz(half4);
+int4 __ovld __cnfn convert_int4_sat(half4);
+int4 __ovld __cnfn convert_int4_sat_rte(half4);
+int4 __ovld __cnfn convert_int4_sat_rtp(half4);
+int4 __ovld __cnfn convert_int4_sat_rtn(half4);
+int4 __ovld __cnfn convert_int4_sat_rtz(half4);
+int8 __ovld __cnfn convert_int8(half8);
+int8 __ovld __cnfn convert_int8_rte(half8);
+int8 __ovld __cnfn convert_int8_rtp(half8);
+int8 __ovld __cnfn convert_int8_rtn(half8);
+int8 __ovld __cnfn convert_int8_rtz(half8);
+int8 __ovld __cnfn convert_int8_sat(half8);
+int8 __ovld __cnfn convert_int8_sat_rte(half8);
+int8 __ovld __cnfn convert_int8_sat_rtp(half8);
+int8 __ovld __cnfn convert_int8_sat_rtn(half8);
+int8 __ovld __cnfn convert_int8_sat_rtz(half8);
+int16 __ovld __cnfn convert_int16(half16);
+int16 __ovld __cnfn convert_int16_rte(half16);
+int16 __ovld __cnfn convert_int16_rtp(half16);
+int16 __ovld __cnfn convert_int16_rtn(half16);
+int16 __ovld __cnfn convert_int16_rtz(half16);
+int16 __ovld __cnfn convert_int16_sat(half16);
+int16 __ovld __cnfn convert_int16_sat_rte(half16);
+int16 __ovld __cnfn convert_int16_sat_rtp(half16);
+int16 __ovld __cnfn convert_int16_sat_rtn(half16);
+int16 __ovld __cnfn convert_int16_sat_rtz(half16);
+long __ovld __cnfn convert_long(half);
+long __ovld __cnfn convert_long_rte(half);
+long __ovld __cnfn convert_long_rtp(half);
+long __ovld __cnfn convert_long_rtn(half);
+long __ovld __cnfn convert_long_rtz(half);
+long __ovld __cnfn convert_long_sat(half);
+long __ovld __cnfn convert_long_sat_rte(half);
+long __ovld __cnfn convert_long_sat_rtp(half);
+long __ovld __cnfn convert_long_sat_rtn(half);
+long __ovld __cnfn convert_long_sat_rtz(half);
+long2 __ovld __cnfn convert_long2(half2);
+long2 __ovld __cnfn convert_long2_rte(half2);
+long2 __ovld __cnfn convert_long2_rtp(half2);
+long2 __ovld __cnfn convert_long2_rtn(half2);
+long2 __ovld __cnfn convert_long2_rtz(half2);
+long2 __ovld __cnfn convert_long2_sat(half2);
+long2 __ovld __cnfn convert_long2_sat_rte(half2);
+long2 __ovld __cnfn convert_long2_sat_rtp(half2);
+long2 __ovld __cnfn convert_long2_sat_rtn(half2);
+long2 __ovld __cnfn convert_long2_sat_rtz(half2);
+long3 __ovld __cnfn convert_long3(half3);
+long3 __ovld __cnfn convert_long3_rte(half3);
+long3 __ovld __cnfn convert_long3_rtp(half3);
+long3 __ovld __cnfn convert_long3_rtn(half3);
+long3 __ovld __cnfn convert_long3_rtz(half3);
+long3 __ovld __cnfn convert_long3_sat(half3);
+long3 __ovld __cnfn convert_long3_sat_rte(half3);
+long3 __ovld __cnfn convert_long3_sat_rtp(half3);
+long3 __ovld __cnfn convert_long3_sat_rtn(half3);
+long3 __ovld __cnfn convert_long3_sat_rtz(half3);
+long4 __ovld __cnfn convert_long4(half4);
+long4 __ovld __cnfn convert_long4_rte(half4);
+long4 __ovld __cnfn convert_long4_rtp(half4);
+long4 __ovld __cnfn convert_long4_rtn(half4);
+long4 __ovld __cnfn convert_long4_rtz(half4);
+long4 __ovld __cnfn convert_long4_sat(half4);
+long4 __ovld __cnfn convert_long4_sat_rte(half4);
+long4 __ovld __cnfn convert_long4_sat_rtp(half4);
+long4 __ovld __cnfn convert_long4_sat_rtn(half4);
+long4 __ovld __cnfn convert_long4_sat_rtz(half4);
+long8 __ovld __cnfn convert_long8(half8);
+long8 __ovld __cnfn convert_long8_rte(half8);
+long8 __ovld __cnfn convert_long8_rtp(half8);
+long8 __ovld __cnfn convert_long8_rtn(half8);
+long8 __ovld __cnfn convert_long8_rtz(half8);
+long8 __ovld __cnfn convert_long8_sat(half8);
+long8 __ovld __cnfn convert_long8_sat_rte(half8);
+long8 __ovld __cnfn convert_long8_sat_rtp(half8);
+long8 __ovld __cnfn convert_long8_sat_rtn(half8);
+long8 __ovld __cnfn convert_long8_sat_rtz(half8);
+long16 __ovld __cnfn convert_long16(half16);
+long16 __ovld __cnfn convert_long16_rte(half16);
+long16 __ovld __cnfn convert_long16_rtp(half16);
+long16 __ovld __cnfn convert_long16_rtn(half16);
+long16 __ovld __cnfn convert_long16_rtz(half16);
+long16 __ovld __cnfn convert_long16_sat(half16);
+long16 __ovld __cnfn convert_long16_sat_rte(half16);
+long16 __ovld __cnfn convert_long16_sat_rtp(half16);
+long16 __ovld __cnfn convert_long16_sat_rtn(half16);
+long16 __ovld __cnfn convert_long16_sat_rtz(half16);
+float __ovld __cnfn convert_float(half);
+float __ovld __cnfn convert_float_rte(half);
+float __ovld __cnfn convert_float_rtp(half);
+float __ovld __cnfn convert_float_rtn(half);
+float __ovld __cnfn convert_float_rtz(half);
+float2 __ovld __cnfn convert_float2(half2);
+float2 __ovld __cnfn convert_float2_rte(half2);
+float2 __ovld __cnfn convert_float2_rtp(half2);
+float2 __ovld __cnfn convert_float2_rtn(half2);
+float2 __ovld __cnfn convert_float2_rtz(half2);
+float3 __ovld __cnfn convert_float3(half3);
+float3 __ovld __cnfn convert_float3_rte(half3);
+float3 __ovld __cnfn convert_float3_rtp(half3);
+float3 __ovld __cnfn convert_float3_rtn(half3);
+float3 __ovld __cnfn convert_float3_rtz(half3);
+float4 __ovld __cnfn convert_float4(half4);
+float4 __ovld __cnfn convert_float4_rte(half4);
+float4 __ovld __cnfn convert_float4_rtp(half4);
+float4 __ovld __cnfn convert_float4_rtn(half4);
+float4 __ovld __cnfn convert_float4_rtz(half4);
+float8 __ovld __cnfn convert_float8(half8);
+float8 __ovld __cnfn convert_float8_rte(half8);
+float8 __ovld __cnfn convert_float8_rtp(half8);
+float8 __ovld __cnfn convert_float8_rtn(half8);
+float8 __ovld __cnfn convert_float8_rtz(half8);
+float16 __ovld __cnfn convert_float16(half16);
+float16 __ovld __cnfn convert_float16_rte(half16);
+float16 __ovld __cnfn convert_float16_rtp(half16);
+float16 __ovld __cnfn convert_float16_rtn(half16);
+float16 __ovld __cnfn convert_float16_rtz(half16);
+
+// Convert non-double types to half types.
+half __ovld __cnfn convert_half(uchar);
+half __ovld __cnfn convert_half(ushort);
+half __ovld __cnfn convert_half(uint);
+half __ovld __cnfn convert_half(ulong);
+half __ovld __cnfn convert_half(char);
+half __ovld __cnfn convert_half(short);
+half __ovld __cnfn convert_half(int);
+half __ovld __cnfn convert_half(long);
+half __ovld __cnfn convert_half(float);
+half __ovld __cnfn convert_half(half);
+half __ovld __cnfn convert_half_rte(uchar);
+half __ovld __cnfn convert_half_rte(ushort);
+half __ovld __cnfn convert_half_rte(uint);
+half __ovld __cnfn convert_half_rte(ulong);
+half __ovld __cnfn convert_half_rte(char);
+half __ovld __cnfn convert_half_rte(short);
+half __ovld __cnfn convert_half_rte(int);
+half __ovld __cnfn convert_half_rte(long);
+half __ovld __cnfn convert_half_rte(float);
+half __ovld __cnfn convert_half_rte(half);
+half __ovld __cnfn convert_half_rtp(uchar);
+half __ovld __cnfn convert_half_rtp(ushort);
+half __ovld __cnfn convert_half_rtp(uint);
+half __ovld __cnfn convert_half_rtp(ulong);
+half __ovld __cnfn convert_half_rtp(char);
+half __ovld __cnfn convert_half_rtp(short);
+half __ovld __cnfn convert_half_rtp(int);
+half __ovld __cnfn convert_half_rtp(long);
+half __ovld __cnfn convert_half_rtp(float);
+half __ovld __cnfn convert_half_rtp(half);
+half __ovld __cnfn convert_half_rtn(uchar);
+half __ovld __cnfn convert_half_rtn(ushort);
+half __ovld __cnfn convert_half_rtn(uint);
+half __ovld __cnfn convert_half_rtn(ulong);
+half __ovld __cnfn convert_half_rtn(char);
+half __ovld __cnfn convert_half_rtn(short);
+half __ovld __cnfn convert_half_rtn(int);
+half __ovld __cnfn convert_half_rtn(long);
+half __ovld __cnfn convert_half_rtn(float);
+half __ovld __cnfn convert_half_rtn(half);
+half __ovld __cnfn convert_half_rtz(uchar);
+half __ovld __cnfn convert_half_rtz(ushort);
+half __ovld __cnfn convert_half_rtz(uint);
+half __ovld __cnfn convert_half_rtz(ulong);
+half __ovld __cnfn convert_half_rtz(char);
+half __ovld __cnfn convert_half_rtz(short);
+half __ovld __cnfn convert_half_rtz(int);
+half __ovld __cnfn convert_half_rtz(long);
+half __ovld __cnfn convert_half_rtz(float);
+half __ovld __cnfn convert_half_rtz(half);
+half2 __ovld __cnfn convert_half2(char2);
+half2 __ovld __cnfn convert_half2(uchar2);
+half2 __ovld __cnfn convert_half2(short2);
+half2 __ovld __cnfn convert_half2(ushort2);
+half2 __ovld __cnfn convert_half2(int2);
+half2 __ovld __cnfn convert_half2(uint2);
+half2 __ovld __cnfn convert_half2(long2);
+half2 __ovld __cnfn convert_half2(ulong2);
+half2 __ovld __cnfn convert_half2(float2);
+half2 __ovld __cnfn convert_half2(half2);
+half2 __ovld __cnfn convert_half2_rte(char2);
+half2 __ovld __cnfn convert_half2_rte(uchar2);
+half2 __ovld __cnfn convert_half2_rte(short2);
+half2 __ovld __cnfn convert_half2_rte(ushort2);
+half2 __ovld __cnfn convert_half2_rte(int2);
+half2 __ovld __cnfn convert_half2_rte(uint2);
+half2 __ovld __cnfn convert_half2_rte(long2);
+half2 __ovld __cnfn convert_half2_rte(ulong2);
+half2 __ovld __cnfn convert_half2_rte(float2);
+half2 __ovld __cnfn convert_half2_rte(half2);
+half2 __ovld __cnfn convert_half2_rtp(char2);
+half2 __ovld __cnfn convert_half2_rtp(uchar2);
+half2 __ovld __cnfn convert_half2_rtp(short2);
+half2 __ovld __cnfn convert_half2_rtp(ushort2);
+half2 __ovld __cnfn convert_half2_rtp(int2);
+half2 __ovld __cnfn convert_half2_rtp(uint2);
+half2 __ovld __cnfn convert_half2_rtp(long2);
+half2 __ovld __cnfn convert_half2_rtp(ulong2);
+half2 __ovld __cnfn convert_half2_rtp(float2);
+half2 __ovld __cnfn convert_half2_rtp(half2);
+half2 __ovld __cnfn convert_half2_rtn(char2);
+half2 __ovld __cnfn convert_half2_rtn(uchar2);
+half2 __ovld __cnfn convert_half2_rtn(short2);
+half2 __ovld __cnfn convert_half2_rtn(ushort2);
+half2 __ovld __cnfn convert_half2_rtn(int2);
+half2 __ovld __cnfn convert_half2_rtn(uint2);
+half2 __ovld __cnfn convert_half2_rtn(long2);
+half2 __ovld __cnfn convert_half2_rtn(ulong2);
+half2 __ovld __cnfn convert_half2_rtn(float2);
+half2 __ovld __cnfn convert_half2_rtn(half2);
+half2 __ovld __cnfn convert_half2_rtz(char2);
+half2 __ovld __cnfn convert_half2_rtz(uchar2);
+half2 __ovld __cnfn convert_half2_rtz(short2);
+half2 __ovld __cnfn convert_half2_rtz(ushort2);
+half2 __ovld __cnfn convert_half2_rtz(int2);
+half2 __ovld __cnfn convert_half2_rtz(uint2);
+half2 __ovld __cnfn convert_half2_rtz(long2);
+half2 __ovld __cnfn convert_half2_rtz(ulong2);
+half2 __ovld __cnfn convert_half2_rtz(float2);
+half2 __ovld __cnfn convert_half2_rtz(half2);
+half3 __ovld __cnfn convert_half3(char3);
+half3 __ovld __cnfn convert_half3(uchar3);
+half3 __ovld __cnfn convert_half3(short3);
+half3 __ovld __cnfn convert_half3(ushort3);
+half3 __ovld __cnfn convert_half3(int3);
+half3 __ovld __cnfn convert_half3(uint3);
+half3 __ovld __cnfn convert_half3(long3);
+half3 __ovld __cnfn convert_half3(ulong3);
+half3 __ovld __cnfn convert_half3(float3);
+half3 __ovld __cnfn convert_half3(half3);
+half3 __ovld __cnfn convert_half3_rte(char3);
+half3 __ovld __cnfn convert_half3_rte(uchar3);
+half3 __ovld __cnfn convert_half3_rte(short3);
+half3 __ovld __cnfn convert_half3_rte(ushort3);
+half3 __ovld __cnfn convert_half3_rte(int3);
+half3 __ovld __cnfn convert_half3_rte(uint3);
+half3 __ovld __cnfn convert_half3_rte(long3);
+half3 __ovld __cnfn convert_half3_rte(ulong3);
+half3 __ovld __cnfn convert_half3_rte(float3);
+half3 __ovld __cnfn convert_half3_rte(half3);
+half3 __ovld __cnfn convert_half3_rtp(char3);
+half3 __ovld __cnfn convert_half3_rtp(uchar3);
+half3 __ovld __cnfn convert_half3_rtp(short3);
+half3 __ovld __cnfn convert_half3_rtp(ushort3);
+half3 __ovld __cnfn convert_half3_rtp(int3);
+half3 __ovld __cnfn convert_half3_rtp(uint3);
+half3 __ovld __cnfn convert_half3_rtp(long3);
+half3 __ovld __cnfn convert_half3_rtp(ulong3);
+half3 __ovld __cnfn convert_half3_rtp(float3);
+half3 __ovld __cnfn convert_half3_rtp(half3);
+half3 __ovld __cnfn convert_half3_rtn(char3);
+half3 __ovld __cnfn convert_half3_rtn(uchar3);
+half3 __ovld __cnfn convert_half3_rtn(short3);
+half3 __ovld __cnfn convert_half3_rtn(ushort3);
+half3 __ovld __cnfn convert_half3_rtn(int3);
+half3 __ovld __cnfn convert_half3_rtn(uint3);
+half3 __ovld __cnfn convert_half3_rtn(long3);
+half3 __ovld __cnfn convert_half3_rtn(ulong3);
+half3 __ovld __cnfn convert_half3_rtn(float3);
+half3 __ovld __cnfn convert_half3_rtn(half3);
+half3 __ovld __cnfn convert_half3_rtz(char3);
+half3 __ovld __cnfn convert_half3_rtz(uchar3);
+half3 __ovld __cnfn convert_half3_rtz(short3);
+half3 __ovld __cnfn convert_half3_rtz(ushort3);
+half3 __ovld __cnfn convert_half3_rtz(int3);
+half3 __ovld __cnfn convert_half3_rtz(uint3);
+half3 __ovld __cnfn convert_half3_rtz(long3);
+half3 __ovld __cnfn convert_half3_rtz(ulong3);
+half3 __ovld __cnfn convert_half3_rtz(float3);
+half3 __ovld __cnfn convert_half3_rtz(half3);
+half4 __ovld __cnfn convert_half4(char4);
+half4 __ovld __cnfn convert_half4(uchar4);
+half4 __ovld __cnfn convert_half4(short4);
+half4 __ovld __cnfn convert_half4(ushort4);
+half4 __ovld __cnfn convert_half4(int4);
+half4 __ovld __cnfn convert_half4(uint4);
+half4 __ovld __cnfn convert_half4(long4);
+half4 __ovld __cnfn convert_half4(ulong4);
+half4 __ovld __cnfn convert_half4(float4);
+half4 __ovld __cnfn convert_half4(half4);
+half4 __ovld __cnfn convert_half4_rte(char4);
+half4 __ovld __cnfn convert_half4_rte(uchar4);
+half4 __ovld __cnfn convert_half4_rte(short4);
+half4 __ovld __cnfn convert_half4_rte(ushort4);
+half4 __ovld __cnfn convert_half4_rte(int4);
+half4 __ovld __cnfn convert_half4_rte(uint4);
+half4 __ovld __cnfn convert_half4_rte(long4);
+half4 __ovld __cnfn convert_half4_rte(ulong4);
+half4 __ovld __cnfn convert_half4_rte(float4);
+half4 __ovld __cnfn convert_half4_rte(half4);
+half4 __ovld __cnfn convert_half4_rtp(char4);
+half4 __ovld __cnfn convert_half4_rtp(uchar4);
+half4 __ovld __cnfn convert_half4_rtp(short4);
+half4 __ovld __cnfn convert_half4_rtp(ushort4);
+half4 __ovld __cnfn convert_half4_rtp(int4);
+half4 __ovld __cnfn convert_half4_rtp(uint4);
+half4 __ovld __cnfn convert_half4_rtp(long4);
+half4 __ovld __cnfn convert_half4_rtp(ulong4);
+half4 __ovld __cnfn convert_half4_rtp(float4);
+half4 __ovld __cnfn convert_half4_rtp(half4);
+half4 __ovld __cnfn convert_half4_rtn(char4);
+half4 __ovld __cnfn convert_half4_rtn(uchar4);
+half4 __ovld __cnfn convert_half4_rtn(short4);
+half4 __ovld __cnfn convert_half4_rtn(ushort4);
+half4 __ovld __cnfn convert_half4_rtn(int4);
+half4 __ovld __cnfn convert_half4_rtn(uint4);
+half4 __ovld __cnfn convert_half4_rtn(long4);
+half4 __ovld __cnfn convert_half4_rtn(ulong4);
+half4 __ovld __cnfn convert_half4_rtn(float4);
+half4 __ovld __cnfn convert_half4_rtn(half4);
+half4 __ovld __cnfn convert_half4_rtz(char4);
+half4 __ovld __cnfn convert_half4_rtz(uchar4);
+half4 __ovld __cnfn convert_half4_rtz(short4);
+half4 __ovld __cnfn convert_half4_rtz(ushort4);
+half4 __ovld __cnfn convert_half4_rtz(int4);
+half4 __ovld __cnfn convert_half4_rtz(uint4);
+half4 __ovld __cnfn convert_half4_rtz(long4);
+half4 __ovld __cnfn convert_half4_rtz(ulong4);
+half4 __ovld __cnfn convert_half4_rtz(float4);
+half4 __ovld __cnfn convert_half4_rtz(half4);
+half8 __ovld __cnfn convert_half8(char8);
+half8 __ovld __cnfn convert_half8(uchar8);
+half8 __ovld __cnfn convert_half8(short8);
+half8 __ovld __cnfn convert_half8(ushort8);
+half8 __ovld __cnfn convert_half8(int8);
+half8 __ovld __cnfn convert_half8(uint8);
+half8 __ovld __cnfn convert_half8(long8);
+half8 __ovld __cnfn convert_half8(ulong8);
+half8 __ovld __cnfn convert_half8(float8);
+half8 __ovld __cnfn convert_half8(half8);
+half8 __ovld __cnfn convert_half8_rte(char8);
+half8 __ovld __cnfn convert_half8_rte(uchar8);
+half8 __ovld __cnfn convert_half8_rte(short8);
+half8 __ovld __cnfn convert_half8_rte(ushort8);
+half8 __ovld __cnfn convert_half8_rte(int8);
+half8 __ovld __cnfn convert_half8_rte(uint8);
+half8 __ovld __cnfn convert_half8_rte(long8);
+half8 __ovld __cnfn convert_half8_rte(ulong8);
+half8 __ovld __cnfn convert_half8_rte(float8);
+half8 __ovld __cnfn convert_half8_rte(half8);
+half8 __ovld __cnfn convert_half8_rtp(char8);
+half8 __ovld __cnfn convert_half8_rtp(uchar8);
+half8 __ovld __cnfn convert_half8_rtp(short8);
+half8 __ovld __cnfn convert_half8_rtp(ushort8);
+half8 __ovld __cnfn convert_half8_rtp(int8);
+half8 __ovld __cnfn convert_half8_rtp(uint8);
+half8 __ovld __cnfn convert_half8_rtp(long8);
+half8 __ovld __cnfn convert_half8_rtp(ulong8);
+half8 __ovld __cnfn convert_half8_rtp(float8);
+half8 __ovld __cnfn convert_half8_rtp(half8);
+half8 __ovld __cnfn convert_half8_rtn(char8);
+half8 __ovld __cnfn convert_half8_rtn(uchar8);
+half8 __ovld __cnfn convert_half8_rtn(short8);
+half8 __ovld __cnfn convert_half8_rtn(ushort8);
+half8 __ovld __cnfn convert_half8_rtn(int8);
+half8 __ovld __cnfn convert_half8_rtn(uint8);
+half8 __ovld __cnfn convert_half8_rtn(long8);
+half8 __ovld __cnfn convert_half8_rtn(ulong8);
+half8 __ovld __cnfn convert_half8_rtn(float8);
+half8 __ovld __cnfn convert_half8_rtn(half8);
+half8 __ovld __cnfn convert_half8_rtz(char8);
+half8 __ovld __cnfn convert_half8_rtz(uchar8);
+half8 __ovld __cnfn convert_half8_rtz(short8);
+half8 __ovld __cnfn convert_half8_rtz(ushort8);
+half8 __ovld __cnfn convert_half8_rtz(int8);
+half8 __ovld __cnfn convert_half8_rtz(uint8);
+half8 __ovld __cnfn convert_half8_rtz(long8);
+half8 __ovld __cnfn convert_half8_rtz(ulong8);
+half8 __ovld __cnfn convert_half8_rtz(float8);
+half8 __ovld __cnfn convert_half8_rtz(half8);
+half16 __ovld __cnfn convert_half16(char16);
+half16 __ovld __cnfn convert_half16(uchar16);
+half16 __ovld __cnfn convert_half16(short16);
+half16 __ovld __cnfn convert_half16(ushort16);
+half16 __ovld __cnfn convert_half16(int16);
+half16 __ovld __cnfn convert_half16(uint16);
+half16 __ovld __cnfn convert_half16(long16);
+half16 __ovld __cnfn convert_half16(ulong16);
+half16 __ovld __cnfn convert_half16(float16);
+half16 __ovld __cnfn convert_half16(half16);
+half16 __ovld __cnfn convert_half16_rte(char16);
+half16 __ovld __cnfn convert_half16_rte(uchar16);
+half16 __ovld __cnfn convert_half16_rte(short16);
+half16 __ovld __cnfn convert_half16_rte(ushort16);
+half16 __ovld __cnfn convert_half16_rte(int16);
+half16 __ovld __cnfn convert_half16_rte(uint16);
+half16 __ovld __cnfn convert_half16_rte(long16);
+half16 __ovld __cnfn convert_half16_rte(ulong16);
+half16 __ovld __cnfn convert_half16_rte(float16);
+half16 __ovld __cnfn convert_half16_rte(half16);
+half16 __ovld __cnfn convert_half16_rtp(char16);
+half16 __ovld __cnfn convert_half16_rtp(uchar16);
+half16 __ovld __cnfn convert_half16_rtp(short16);
+half16 __ovld __cnfn convert_half16_rtp(ushort16);
+half16 __ovld __cnfn convert_half16_rtp(int16);
+half16 __ovld __cnfn convert_half16_rtp(uint16);
+half16 __ovld __cnfn convert_half16_rtp(long16);
+half16 __ovld __cnfn convert_half16_rtp(ulong16);
+half16 __ovld __cnfn convert_half16_rtp(float16);
+half16 __ovld __cnfn convert_half16_rtp(half16);
+half16 __ovld __cnfn convert_half16_rtn(char16);
+half16 __ovld __cnfn convert_half16_rtn(uchar16);
+half16 __ovld __cnfn convert_half16_rtn(short16);
+half16 __ovld __cnfn convert_half16_rtn(ushort16);
+half16 __ovld __cnfn convert_half16_rtn(int16);
+half16 __ovld __cnfn convert_half16_rtn(uint16);
+half16 __ovld __cnfn convert_half16_rtn(long16);
+half16 __ovld __cnfn convert_half16_rtn(ulong16);
+half16 __ovld __cnfn convert_half16_rtn(float16);
+half16 __ovld __cnfn convert_half16_rtn(half16);
+half16 __ovld __cnfn convert_half16_rtz(char16);
+half16 __ovld __cnfn convert_half16_rtz(uchar16);
+half16 __ovld __cnfn convert_half16_rtz(short16);
+half16 __ovld __cnfn convert_half16_rtz(ushort16);
+half16 __ovld __cnfn convert_half16_rtz(int16);
+half16 __ovld __cnfn convert_half16_rtz(uint16);
+half16 __ovld __cnfn convert_half16_rtz(long16);
+half16 __ovld __cnfn convert_half16_rtz(ulong16);
+half16 __ovld __cnfn convert_half16_rtz(float16);
+half16 __ovld __cnfn convert_half16_rtz(half16);
+
+// Convert half types to double types.
+#ifdef cl_khr_fp64
+double __ovld __cnfn convert_double(half);
+double __ovld __cnfn convert_double_rte(half);
+double __ovld __cnfn convert_double_rtp(half);
+double __ovld __cnfn convert_double_rtn(half);
+double __ovld __cnfn convert_double_rtz(half);
+double2 __ovld __cnfn convert_double2(half2);
+double2 __ovld __cnfn convert_double2_rte(half2);
+double2 __ovld __cnfn convert_double2_rtp(half2);
+double2 __ovld __cnfn convert_double2_rtn(half2);
+double2 __ovld __cnfn convert_double2_rtz(half2);
+double3 __ovld __cnfn convert_double3(half3);
+double3 __ovld __cnfn convert_double3_rte(half3);
+double3 __ovld __cnfn convert_double3_rtp(half3);
+double3 __ovld __cnfn convert_double3_rtn(half3);
+double3 __ovld __cnfn convert_double3_rtz(half3);
+double4 __ovld __cnfn convert_double4(half4);
+double4 __ovld __cnfn convert_double4_rte(half4);
+double4 __ovld __cnfn convert_double4_rtp(half4);
+double4 __ovld __cnfn convert_double4_rtn(half4);
+double4 __ovld __cnfn convert_double4_rtz(half4);
+double8 __ovld __cnfn convert_double8(half8);
+double8 __ovld __cnfn convert_double8_rte(half8);
+double8 __ovld __cnfn convert_double8_rtp(half8);
+double8 __ovld __cnfn convert_double8_rtn(half8);
+double8 __ovld __cnfn convert_double8_rtz(half8);
+double16 __ovld __cnfn convert_double16(half16);
+double16 __ovld __cnfn convert_double16_rte(half16);
+double16 __ovld __cnfn convert_double16_rtp(half16);
+double16 __ovld __cnfn convert_double16_rtn(half16);
+double16 __ovld __cnfn convert_double16_rtz(half16);
+
+// Convert double types to half types.
+half __ovld __cnfn convert_half(double);
+half __ovld __cnfn convert_half_rte(double);
+half __ovld __cnfn convert_half_rtp(double);
+half __ovld __cnfn convert_half_rtn(double);
+half __ovld __cnfn convert_half_rtz(double);
+half2 __ovld __cnfn convert_half2(double2);
+half2 __ovld __cnfn convert_half2_rte(double2);
+half2 __ovld __cnfn convert_half2_rtp(double2);
+half2 __ovld __cnfn convert_half2_rtn(double2);
+half2 __ovld __cnfn convert_half2_rtz(double2);
+half3 __ovld __cnfn convert_half3(double3);
+half3 __ovld __cnfn convert_half3_rte(double3);
+half3 __ovld __cnfn convert_half3_rtp(double3);
+half3 __ovld __cnfn convert_half3_rtn(double3);
+half3 __ovld __cnfn convert_half3_rtz(double3);
+half4 __ovld __cnfn convert_half4(double4);
+half4 __ovld __cnfn convert_half4_rte(double4);
+half4 __ovld __cnfn convert_half4_rtp(double4);
+half4 __ovld __cnfn convert_half4_rtn(double4);
+half4 __ovld __cnfn convert_half4_rtz(double4);
+half8 __ovld __cnfn convert_half8(double8);
+half8 __ovld __cnfn convert_half8_rte(double8);
+half8 __ovld __cnfn convert_half8_rtp(double8);
+half8 __ovld __cnfn convert_half8_rtn(double8);
+half8 __ovld __cnfn convert_half8_rtz(double8);
+half16 __ovld __cnfn convert_half16(double16);
+half16 __ovld __cnfn convert_half16_rte(double16);
+half16 __ovld __cnfn convert_half16_rtp(double16);
+half16 __ovld __cnfn convert_half16_rtn(double16);
+half16 __ovld __cnfn convert_half16_rtz(double16);
+#endif //cl_khr_fp64
+
+#endif // cl_khr_fp16
+
+/**
+ * OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
+ * Reinterprets a data type as another data type of the same size
+ */
+char __ovld __cnfn as_char(char);
+char __ovld __cnfn as_char(uchar);
+
+char2 __ovld __cnfn as_char2(char2);
+char2 __ovld __cnfn as_char2(uchar2);
+char2 __ovld __cnfn as_char2(short);
+char2 __ovld __cnfn as_char2(ushort);
+
+char3 __ovld __cnfn as_char3(char3);
+char3 __ovld __cnfn as_char3(char4);
+char3 __ovld __cnfn as_char3(uchar3);
+char3 __ovld __cnfn as_char3(uchar4);
+char3 __ovld __cnfn as_char3(short2);
+char3 __ovld __cnfn as_char3(ushort2);
+char3 __ovld __cnfn as_char3(int);
+char3 __ovld __cnfn as_char3(uint);
+char3 __ovld __cnfn as_char3(float);
+
+char4 __ovld __cnfn as_char4(char3);
+char4 __ovld __cnfn as_char4(char4);
+char4 __ovld __cnfn as_char4(uchar3);
+char4 __ovld __cnfn as_char4(uchar4);
+char4 __ovld __cnfn as_char4(short2);
+char4 __ovld __cnfn as_char4(ushort2);
+char4 __ovld __cnfn as_char4(int);
+char4 __ovld __cnfn as_char4(uint);
+char4 __ovld __cnfn as_char4(float);
+
+char8 __ovld __cnfn as_char8(char8);
+char8 __ovld __cnfn as_char8(uchar8);
+char8 __ovld __cnfn as_char8(short3);
+char8 __ovld __cnfn as_char8(short4);
+char8 __ovld __cnfn as_char8(ushort3);
+char8 __ovld __cnfn as_char8(ushort4);
+char8 __ovld __cnfn as_char8(int2);
+char8 __ovld __cnfn as_char8(uint2);
+char8 __ovld __cnfn as_char8(long);
+char8 __ovld __cnfn as_char8(ulong);
+char8 __ovld __cnfn as_char8(float2);
+
+char16 __ovld __cnfn as_char16(char16);
+char16 __ovld __cnfn as_char16(uchar16);
+char16 __ovld __cnfn as_char16(short8);
+char16 __ovld __cnfn as_char16(ushort8);
+char16 __ovld __cnfn as_char16(int3);
+char16 __ovld __cnfn as_char16(int4);
+char16 __ovld __cnfn as_char16(uint3);
+char16 __ovld __cnfn as_char16(uint4);
+char16 __ovld __cnfn as_char16(long2);
+char16 __ovld __cnfn as_char16(ulong2);
+char16 __ovld __cnfn as_char16(float3);
+char16 __ovld __cnfn as_char16(float4);
+
+uchar __ovld __cnfn as_uchar(char);
+uchar __ovld __cnfn as_uchar(uchar);
+
+uchar2 __ovld __cnfn as_uchar2(char2);
+uchar2 __ovld __cnfn as_uchar2(uchar2);
+uchar2 __ovld __cnfn as_uchar2(short);
+uchar2 __ovld __cnfn as_uchar2(ushort);
+
+uchar3 __ovld __cnfn as_uchar3(char3);
+uchar3 __ovld __cnfn as_uchar3(char4);
+uchar3 __ovld __cnfn as_uchar3(uchar3);
+uchar3 __ovld __cnfn as_uchar3(uchar4);
+uchar3 __ovld __cnfn as_uchar3(short2);
+uchar3 __ovld __cnfn as_uchar3(ushort2);
+uchar3 __ovld __cnfn as_uchar3(int);
+uchar3 __ovld __cnfn as_uchar3(uint);
+uchar3 __ovld __cnfn as_uchar3(float);
+
+uchar4 __ovld __cnfn as_uchar4(char3);
+uchar4 __ovld __cnfn as_uchar4(char4);
+uchar4 __ovld __cnfn as_uchar4(uchar3);
+uchar4 __ovld __cnfn as_uchar4(uchar4);
+uchar4 __ovld __cnfn as_uchar4(short2);
+uchar4 __ovld __cnfn as_uchar4(ushort2);
+uchar4 __ovld __cnfn as_uchar4(int);
+uchar4 __ovld __cnfn as_uchar4(uint);
+uchar4 __ovld __cnfn as_uchar4(float);
+
+uchar8 __ovld __cnfn as_uchar8(char8);
+uchar8 __ovld __cnfn as_uchar8(uchar8);
+uchar8 __ovld __cnfn as_uchar8(short3);
+uchar8 __ovld __cnfn as_uchar8(short4);
+uchar8 __ovld __cnfn as_uchar8(ushort3);
+uchar8 __ovld __cnfn as_uchar8(ushort4);
+uchar8 __ovld __cnfn as_uchar8(int2);
+uchar8 __ovld __cnfn as_uchar8(uint2);
+uchar8 __ovld __cnfn as_uchar8(long);
+uchar8 __ovld __cnfn as_uchar8(ulong);
+uchar8 __ovld __cnfn as_uchar8(float2);
+
+uchar16 __ovld __cnfn as_uchar16(char16);
+uchar16 __ovld __cnfn as_uchar16(uchar16);
+uchar16 __ovld __cnfn as_uchar16(short8);
+uchar16 __ovld __cnfn as_uchar16(ushort8);
+uchar16 __ovld __cnfn as_uchar16(int3);
+uchar16 __ovld __cnfn as_uchar16(int4);
+uchar16 __ovld __cnfn as_uchar16(uint3);
+uchar16 __ovld __cnfn as_uchar16(uint4);
+uchar16 __ovld __cnfn as_uchar16(long2);
+uchar16 __ovld __cnfn as_uchar16(ulong2);
+uchar16 __ovld __cnfn as_uchar16(float3);
+uchar16 __ovld __cnfn as_uchar16(float4);
+
+short __ovld __cnfn as_short(char2);
+short __ovld __cnfn as_short(uchar2);
+short __ovld __cnfn as_short(short);
+short __ovld __cnfn as_short(ushort);
+
+short2 __ovld __cnfn as_short2(char3);
+short2 __ovld __cnfn as_short2(char4);
+short2 __ovld __cnfn as_short2(uchar3);
+short2 __ovld __cnfn as_short2(uchar4);
+short2 __ovld __cnfn as_short2(short2);
+short2 __ovld __cnfn as_short2(ushort2);
+short2 __ovld __cnfn as_short2(int);
+short2 __ovld __cnfn as_short2(uint);
+short2 __ovld __cnfn as_short2(float);
+
+short3 __ovld __cnfn as_short3(char8);
+short3 __ovld __cnfn as_short3(uchar8);
+short3 __ovld __cnfn as_short3(short3);
+short3 __ovld __cnfn as_short3(short4);
+short3 __ovld __cnfn as_short3(ushort3);
+short3 __ovld __cnfn as_short3(ushort4);
+short3 __ovld __cnfn as_short3(int2);
+short3 __ovld __cnfn as_short3(uint2);
+short3 __ovld __cnfn as_short3(long);
+short3 __ovld __cnfn as_short3(ulong);
+short3 __ovld __cnfn as_short3(float2);
+
+short4 __ovld __cnfn as_short4(char8);
+short4 __ovld __cnfn as_short4(uchar8);
+short4 __ovld __cnfn as_short4(short3);
+short4 __ovld __cnfn as_short4(short4);
+short4 __ovld __cnfn as_short4(ushort3);
+short4 __ovld __cnfn as_short4(ushort4);
+short4 __ovld __cnfn as_short4(int2);
+short4 __ovld __cnfn as_short4(uint2);
+short4 __ovld __cnfn as_short4(long);
+short4 __ovld __cnfn as_short4(ulong);
+short4 __ovld __cnfn as_short4(float2);
+
+short8 __ovld __cnfn as_short8(char16);
+short8 __ovld __cnfn as_short8(uchar16);
+short8 __ovld __cnfn as_short8(short8);
+short8 __ovld __cnfn as_short8(ushort8);
+short8 __ovld __cnfn as_short8(int3);
+short8 __ovld __cnfn as_short8(int4);
+short8 __ovld __cnfn as_short8(uint3);
+short8 __ovld __cnfn as_short8(uint4);
+short8 __ovld __cnfn as_short8(long2);
+short8 __ovld __cnfn as_short8(ulong2);
+short8 __ovld __cnfn as_short8(float3);
+short8 __ovld __cnfn as_short8(float4);
+
+short16 __ovld __cnfn as_short16(short16);
+short16 __ovld __cnfn as_short16(ushort16);
+short16 __ovld __cnfn as_short16(int8);
+short16 __ovld __cnfn as_short16(uint8);
+short16 __ovld __cnfn as_short16(long3);
+short16 __ovld __cnfn as_short16(long4);
+short16 __ovld __cnfn as_short16(ulong3);
+short16 __ovld __cnfn as_short16(ulong4);
+short16 __ovld __cnfn as_short16(float8);
+
+ushort __ovld __cnfn as_ushort(char2);
+ushort __ovld __cnfn as_ushort(uchar2);
+ushort __ovld __cnfn as_ushort(short);
+ushort __ovld __cnfn as_ushort(ushort);
+
+ushort2 __ovld __cnfn as_ushort2(char3);
+ushort2 __ovld __cnfn as_ushort2(char4);
+ushort2 __ovld __cnfn as_ushort2(uchar3);
+ushort2 __ovld __cnfn as_ushort2(uchar4);
+ushort2 __ovld __cnfn as_ushort2(short2);
+ushort2 __ovld __cnfn as_ushort2(ushort2);
+ushort2 __ovld __cnfn as_ushort2(int);
+ushort2 __ovld __cnfn as_ushort2(uint);
+ushort2 __ovld __cnfn as_ushort2(float);
+
+ushort3 __ovld __cnfn as_ushort3(char8);
+ushort3 __ovld __cnfn as_ushort3(uchar8);
+ushort3 __ovld __cnfn as_ushort3(short3);
+ushort3 __ovld __cnfn as_ushort3(short4);
+ushort3 __ovld __cnfn as_ushort3(ushort3);
+ushort3 __ovld __cnfn as_ushort3(ushort4);
+ushort3 __ovld __cnfn as_ushort3(int2);
+ushort3 __ovld __cnfn as_ushort3(uint2);
+ushort3 __ovld __cnfn as_ushort3(long);
+ushort3 __ovld __cnfn as_ushort3(ulong);
+ushort3 __ovld __cnfn as_ushort3(float2);
+
+ushort4 __ovld __cnfn as_ushort4(char8);
+ushort4 __ovld __cnfn as_ushort4(uchar8);
+ushort4 __ovld __cnfn as_ushort4(short3);
+ushort4 __ovld __cnfn as_ushort4(short4);
+ushort4 __ovld __cnfn as_ushort4(ushort3);
+ushort4 __ovld __cnfn as_ushort4(ushort4);
+ushort4 __ovld __cnfn as_ushort4(int2);
+ushort4 __ovld __cnfn as_ushort4(uint2);
+ushort4 __ovld __cnfn as_ushort4(long);
+ushort4 __ovld __cnfn as_ushort4(ulong);
+ushort4 __ovld __cnfn as_ushort4(float2);
+
+ushort8 __ovld __cnfn as_ushort8(char16);
+ushort8 __ovld __cnfn as_ushort8(uchar16);
+ushort8 __ovld __cnfn as_ushort8(short8);
+ushort8 __ovld __cnfn as_ushort8(ushort8);
+ushort8 __ovld __cnfn as_ushort8(int3);
+ushort8 __ovld __cnfn as_ushort8(int4);
+ushort8 __ovld __cnfn as_ushort8(uint3);
+ushort8 __ovld __cnfn as_ushort8(uint4);
+ushort8 __ovld __cnfn as_ushort8(long2);
+ushort8 __ovld __cnfn as_ushort8(ulong2);
+ushort8 __ovld __cnfn as_ushort8(float3);
+ushort8 __ovld __cnfn as_ushort8(float4);
+
+ushort16 __ovld __cnfn as_ushort16(short16);
+ushort16 __ovld __cnfn as_ushort16(ushort16);
+ushort16 __ovld __cnfn as_ushort16(int8);
+ushort16 __ovld __cnfn as_ushort16(uint8);
+ushort16 __ovld __cnfn as_ushort16(long3);
+ushort16 __ovld __cnfn as_ushort16(long4);
+ushort16 __ovld __cnfn as_ushort16(ulong3);
+ushort16 __ovld __cnfn as_ushort16(ulong4);
+ushort16 __ovld __cnfn as_ushort16(float8);
+
+int __ovld __cnfn as_int(char3);
+int __ovld __cnfn as_int(char4);
+int __ovld __cnfn as_int(uchar3);
+int __ovld __cnfn as_int(uchar4);
+int __ovld __cnfn as_int(short2);
+int __ovld __cnfn as_int(ushort2);
+int __ovld __cnfn as_int(int);
+int __ovld __cnfn as_int(uint);
+int __ovld __cnfn as_int(float);
+
+int2 __ovld __cnfn as_int2(char8);
+int2 __ovld __cnfn as_int2(uchar8);
+int2 __ovld __cnfn as_int2(short3);
+int2 __ovld __cnfn as_int2(short4);
+int2 __ovld __cnfn as_int2(ushort3);
+int2 __ovld __cnfn as_int2(ushort4);
+int2 __ovld __cnfn as_int2(int2);
+int2 __ovld __cnfn as_int2(uint2);
+int2 __ovld __cnfn as_int2(long);
+int2 __ovld __cnfn as_int2(ulong);
+int2 __ovld __cnfn as_int2(float2);
+
+int3 __ovld __cnfn as_int3(char16);
+int3 __ovld __cnfn as_int3(uchar16);
+int3 __ovld __cnfn as_int3(short8);
+int3 __ovld __cnfn as_int3(ushort8);
+int3 __ovld __cnfn as_int3(int3);
+int3 __ovld __cnfn as_int3(int4);
+int3 __ovld __cnfn as_int3(uint3);
+int3 __ovld __cnfn as_int3(uint4);
+int3 __ovld __cnfn as_int3(long2);
+int3 __ovld __cnfn as_int3(ulong2);
+int3 __ovld __cnfn as_int3(float3);
+int3 __ovld __cnfn as_int3(float4);
+
+int4 __ovld __cnfn as_int4(char16);
+int4 __ovld __cnfn as_int4(uchar16);
+int4 __ovld __cnfn as_int4(short8);
+int4 __ovld __cnfn as_int4(ushort8);
+int4 __ovld __cnfn as_int4(int3);
+int4 __ovld __cnfn as_int4(int4);
+int4 __ovld __cnfn as_int4(uint3);
+int4 __ovld __cnfn as_int4(uint4);
+int4 __ovld __cnfn as_int4(long2);
+int4 __ovld __cnfn as_int4(ulong2);
+int4 __ovld __cnfn as_int4(float3);
+int4 __ovld __cnfn as_int4(float4);
+
+int8 __ovld __cnfn as_int8(short16);
+int8 __ovld __cnfn as_int8(ushort16);
+int8 __ovld __cnfn as_int8(int8);
+int8 __ovld __cnfn as_int8(uint8);
+int8 __ovld __cnfn as_int8(long3);
+int8 __ovld __cnfn as_int8(long4);
+int8 __ovld __cnfn as_int8(ulong3);
+int8 __ovld __cnfn as_int8(ulong4);
+int8 __ovld __cnfn as_int8(float8);
+
+int16 __ovld __cnfn as_int16(int16);
+int16 __ovld __cnfn as_int16(uint16);
+int16 __ovld __cnfn as_int16(long8);
+int16 __ovld __cnfn as_int16(ulong8);
+int16 __ovld __cnfn as_int16(float16);
+
+uint __ovld __cnfn as_uint(char3);
+uint __ovld __cnfn as_uint(char4);
+uint __ovld __cnfn as_uint(uchar3);
+uint __ovld __cnfn as_uint(uchar4);
+uint __ovld __cnfn as_uint(short2);
+uint __ovld __cnfn as_uint(ushort2);
+uint __ovld __cnfn as_uint(int);
+uint __ovld __cnfn as_uint(uint);
+uint __ovld __cnfn as_uint(float);
+
+uint2 __ovld __cnfn as_uint2(char8);
+uint2 __ovld __cnfn as_uint2(uchar8);
+uint2 __ovld __cnfn as_uint2(short3);
+uint2 __ovld __cnfn as_uint2(short4);
+uint2 __ovld __cnfn as_uint2(ushort3);
+uint2 __ovld __cnfn as_uint2(ushort4);
+uint2 __ovld __cnfn as_uint2(int2);
+uint2 __ovld __cnfn as_uint2(uint2);
+uint2 __ovld __cnfn as_uint2(long);
+uint2 __ovld __cnfn as_uint2(ulong);
+uint2 __ovld __cnfn as_uint2(float2);
+
+uint3 __ovld __cnfn as_uint3(char16);
+uint3 __ovld __cnfn as_uint3(uchar16);
+uint3 __ovld __cnfn as_uint3(short8);
+uint3 __ovld __cnfn as_uint3(ushort8);
+uint3 __ovld __cnfn as_uint3(int3);
+uint3 __ovld __cnfn as_uint3(int4);
+uint3 __ovld __cnfn as_uint3(uint3);
+uint3 __ovld __cnfn as_uint3(uint4);
+uint3 __ovld __cnfn as_uint3(long2);
+uint3 __ovld __cnfn as_uint3(ulong2);
+uint3 __ovld __cnfn as_uint3(float3);
+uint3 __ovld __cnfn as_uint3(float4);
+
+uint4 __ovld __cnfn as_uint4(char16);
+uint4 __ovld __cnfn as_uint4(uchar16);
+uint4 __ovld __cnfn as_uint4(short8);
+uint4 __ovld __cnfn as_uint4(ushort8);
+uint4 __ovld __cnfn as_uint4(int3);
+uint4 __ovld __cnfn as_uint4(int4);
+uint4 __ovld __cnfn as_uint4(uint3);
+uint4 __ovld __cnfn as_uint4(uint4);
+uint4 __ovld __cnfn as_uint4(long2);
+uint4 __ovld __cnfn as_uint4(ulong2);
+uint4 __ovld __cnfn as_uint4(float3);
+uint4 __ovld __cnfn as_uint4(float4);
+
+uint8 __ovld __cnfn as_uint8(short16);
+uint8 __ovld __cnfn as_uint8(ushort16);
+uint8 __ovld __cnfn as_uint8(int8);
+uint8 __ovld __cnfn as_uint8(uint8);
+uint8 __ovld __cnfn as_uint8(long3);
+uint8 __ovld __cnfn as_uint8(long4);
+uint8 __ovld __cnfn as_uint8(ulong3);
+uint8 __ovld __cnfn as_uint8(ulong4);
+uint8 __ovld __cnfn as_uint8(float8);
+
+uint16 __ovld __cnfn as_uint16(int16);
+uint16 __ovld __cnfn as_uint16(uint16);
+uint16 __ovld __cnfn as_uint16(long8);
+uint16 __ovld __cnfn as_uint16(ulong8);
+uint16 __ovld __cnfn as_uint16(float16);
+
+long __ovld __cnfn as_long(char8);
+long __ovld __cnfn as_long(uchar8);
+long __ovld __cnfn as_long(short3);
+long __ovld __cnfn as_long(short4);
+long __ovld __cnfn as_long(ushort3);
+long __ovld __cnfn as_long(ushort4);
+long __ovld __cnfn as_long(int2);
+long __ovld __cnfn as_long(uint2);
+long __ovld __cnfn as_long(long);
+long __ovld __cnfn as_long(ulong);
+long __ovld __cnfn as_long(float2);
+
+long2 __ovld __cnfn as_long2(char16);
+long2 __ovld __cnfn as_long2(uchar16);
+long2 __ovld __cnfn as_long2(short8);
+long2 __ovld __cnfn as_long2(ushort8);
+long2 __ovld __cnfn as_long2(int3);
+long2 __ovld __cnfn as_long2(int4);
+long2 __ovld __cnfn as_long2(uint3);
+long2 __ovld __cnfn as_long2(uint4);
+long2 __ovld __cnfn as_long2(long2);
+long2 __ovld __cnfn as_long2(ulong2);
+long2 __ovld __cnfn as_long2(float3);
+long2 __ovld __cnfn as_long2(float4);
+
+long3 __ovld __cnfn as_long3(short16);
+long3 __ovld __cnfn as_long3(ushort16);
+long3 __ovld __cnfn as_long3(int8);
+long3 __ovld __cnfn as_long3(uint8);
+long3 __ovld __cnfn as_long3(long3);
+long3 __ovld __cnfn as_long3(long4);
+long3 __ovld __cnfn as_long3(ulong3);
+long3 __ovld __cnfn as_long3(ulong4);
+long3 __ovld __cnfn as_long3(float8);
+
+long4 __ovld __cnfn as_long4(short16);
+long4 __ovld __cnfn as_long4(ushort16);
+long4 __ovld __cnfn as_long4(int8);
+long4 __ovld __cnfn as_long4(uint8);
+long4 __ovld __cnfn as_long4(long3);
+long4 __ovld __cnfn as_long4(long4);
+long4 __ovld __cnfn as_long4(ulong3);
+long4 __ovld __cnfn as_long4(ulong4);
+long4 __ovld __cnfn as_long4(float8);
+
+long8 __ovld __cnfn as_long8(int16);
+long8 __ovld __cnfn as_long8(uint16);
+long8 __ovld __cnfn as_long8(long8);
+long8 __ovld __cnfn as_long8(ulong8);
+long8 __ovld __cnfn as_long8(float16);
+
+long16 __ovld __cnfn as_long16(long16);
+long16 __ovld __cnfn as_long16(ulong16);
+
+ulong __ovld __cnfn as_ulong(char8);
+ulong __ovld __cnfn as_ulong(uchar8);
+ulong __ovld __cnfn as_ulong(short3);
+ulong __ovld __cnfn as_ulong(short4);
+ulong __ovld __cnfn as_ulong(ushort3);
+ulong __ovld __cnfn as_ulong(ushort4);
+ulong __ovld __cnfn as_ulong(int2);
+ulong __ovld __cnfn as_ulong(uint2);
+ulong __ovld __cnfn as_ulong(long);
+ulong __ovld __cnfn as_ulong(ulong);
+ulong __ovld __cnfn as_ulong(float2);
+
+ulong2 __ovld __cnfn as_ulong2(char16);
+ulong2 __ovld __cnfn as_ulong2(uchar16);
+ulong2 __ovld __cnfn as_ulong2(short8);
+ulong2 __ovld __cnfn as_ulong2(ushort8);
+ulong2 __ovld __cnfn as_ulong2(int3);
+ulong2 __ovld __cnfn as_ulong2(int4);
+ulong2 __ovld __cnfn as_ulong2(uint3);
+ulong2 __ovld __cnfn as_ulong2(uint4);
+ulong2 __ovld __cnfn as_ulong2(long2);
+ulong2 __ovld __cnfn as_ulong2(ulong2);
+ulong2 __ovld __cnfn as_ulong2(float3);
+ulong2 __ovld __cnfn as_ulong2(float4);
+
+ulong3 __ovld __cnfn as_ulong3(short16);
+ulong3 __ovld __cnfn as_ulong3(ushort16);
+ulong3 __ovld __cnfn as_ulong3(int8);
+ulong3 __ovld __cnfn as_ulong3(uint8);
+ulong3 __ovld __cnfn as_ulong3(long3);
+ulong3 __ovld __cnfn as_ulong3(long4);
+ulong3 __ovld __cnfn as_ulong3(ulong3);
+ulong3 __ovld __cnfn as_ulong3(ulong4);
+ulong3 __ovld __cnfn as_ulong3(float8);
+
+ulong4 __ovld __cnfn as_ulong4(short16);
+ulong4 __ovld __cnfn as_ulong4(ushort16);
+ulong4 __ovld __cnfn as_ulong4(int8);
+ulong4 __ovld __cnfn as_ulong4(uint8);
+ulong4 __ovld __cnfn as_ulong4(long3);
+ulong4 __ovld __cnfn as_ulong4(long4);
+ulong4 __ovld __cnfn as_ulong4(ulong3);
+ulong4 __ovld __cnfn as_ulong4(ulong4);
+ulong4 __ovld __cnfn as_ulong4(float8);
+
+ulong8 __ovld __cnfn as_ulong8(int16);
+ulong8 __ovld __cnfn as_ulong8(uint16);
+ulong8 __ovld __cnfn as_ulong8(long8);
+ulong8 __ovld __cnfn as_ulong8(ulong8);
+ulong8 __ovld __cnfn as_ulong8(float16);
+
+ulong16 __ovld __cnfn as_ulong16(long16);
+ulong16 __ovld __cnfn as_ulong16(ulong16);
+
+float __ovld __cnfn as_float(char3);
+float __ovld __cnfn as_float(char4);
+float __ovld __cnfn as_float(uchar3);
+float __ovld __cnfn as_float(uchar4);
+float __ovld __cnfn as_float(short2);
+float __ovld __cnfn as_float(ushort2);
+float __ovld __cnfn as_float(int);
+float __ovld __cnfn as_float(uint);
+float __ovld __cnfn as_float(float);
+
+float2 __ovld __cnfn as_float2(char8);
+float2 __ovld __cnfn as_float2(uchar8);
+float2 __ovld __cnfn as_float2(short3);
+float2 __ovld __cnfn as_float2(short4);
+float2 __ovld __cnfn as_float2(ushort3);
+float2 __ovld __cnfn as_float2(ushort4);
+float2 __ovld __cnfn as_float2(int2);
+float2 __ovld __cnfn as_float2(uint2);
+float2 __ovld __cnfn as_float2(long);
+float2 __ovld __cnfn as_float2(ulong);
+float2 __ovld __cnfn as_float2(float2);
+
+float3 __ovld __cnfn as_float3(char16);
+float3 __ovld __cnfn as_float3(uchar16);
+float3 __ovld __cnfn as_float3(short8);
+float3 __ovld __cnfn as_float3(ushort8);
+float3 __ovld __cnfn as_float3(int3);
+float3 __ovld __cnfn as_float3(int4);
+float3 __ovld __cnfn as_float3(uint3);
+float3 __ovld __cnfn as_float3(uint4);
+float3 __ovld __cnfn as_float3(long2);
+float3 __ovld __cnfn as_float3(ulong2);
+float3 __ovld __cnfn as_float3(float3);
+float3 __ovld __cnfn as_float3(float4);
+
+float4 __ovld __cnfn as_float4(char16);
+float4 __ovld __cnfn as_float4(uchar16);
+float4 __ovld __cnfn as_float4(short8);
+float4 __ovld __cnfn as_float4(ushort8);
+float4 __ovld __cnfn as_float4(int3);
+float4 __ovld __cnfn as_float4(int4);
+float4 __ovld __cnfn as_float4(uint3);
+float4 __ovld __cnfn as_float4(uint4);
+float4 __ovld __cnfn as_float4(long2);
+float4 __ovld __cnfn as_float4(ulong2);
+float4 __ovld __cnfn as_float4(float3);
+float4 __ovld __cnfn as_float4(float4);
+
+float8 __ovld __cnfn as_float8(short16);
+float8 __ovld __cnfn as_float8(ushort16);
+float8 __ovld __cnfn as_float8(int8);
+float8 __ovld __cnfn as_float8(uint8);
+float8 __ovld __cnfn as_float8(long3);
+float8 __ovld __cnfn as_float8(long4);
+float8 __ovld __cnfn as_float8(ulong3);
+float8 __ovld __cnfn as_float8(ulong4);
+float8 __ovld __cnfn as_float8(float8);
+
+float16 __ovld __cnfn as_float16(int16);
+float16 __ovld __cnfn as_float16(uint16);
+float16 __ovld __cnfn as_float16(long8);
+float16 __ovld __cnfn as_float16(ulong8);
+float16 __ovld __cnfn as_float16(float16);
+
+#ifdef cl_khr_fp64
+char8 __ovld __cnfn as_char8(double);
+char16 __ovld __cnfn as_char16(double2);
+uchar8 __ovld __cnfn as_uchar8(double);
+uchar16 __ovld __cnfn as_uchar16(double2);
+short3 __ovld __cnfn as_short3(double);
+short4 __ovld __cnfn as_short4(double);
+short8 __ovld __cnfn as_short8(double2);
+short16 __ovld __cnfn as_short16(double3);
+short16 __ovld __cnfn as_short16(double4);
+ushort3 __ovld __cnfn as_ushort3(double);
+ushort4 __ovld __cnfn as_ushort4(double);
+ushort8 __ovld __cnfn as_ushort8(double2);
+ushort16 __ovld __cnfn as_ushort16(double3);
+ushort16 __ovld __cnfn as_ushort16(double4);
+int2 __ovld __cnfn as_int2(double);
+int3 __ovld __cnfn as_int3(double2);
+int4 __ovld __cnfn as_int4(double2);
+int8 __ovld __cnfn as_int8(double3);
+int8 __ovld __cnfn as_int8(double4);
+int16 __ovld __cnfn as_int16(double8);
+uint2 __ovld __cnfn as_uint2(double);
+uint3 __ovld __cnfn as_uint3(double2);
+uint4 __ovld __cnfn as_uint4(double2);
+uint8 __ovld __cnfn as_uint8(double3);
+uint8 __ovld __cnfn as_uint8(double4);
+uint16 __ovld __cnfn as_uint16(double8);
+long __ovld __cnfn as_long(double);
+long2 __ovld __cnfn as_long2(double2);
+long3 __ovld __cnfn as_long3(double3);
+long3 __ovld __cnfn as_long3(double4);
+long4 __ovld __cnfn as_long4(double3);
+long4 __ovld __cnfn as_long4(double4);
+long8 __ovld __cnfn as_long8(double8);
+long16 __ovld __cnfn as_long16(double16);
+ulong __ovld __cnfn as_ulong(double);
+ulong2 __ovld __cnfn as_ulong2(double2);
+ulong3 __ovld __cnfn as_ulong3(double3);
+ulong3 __ovld __cnfn as_ulong3(double4);
+ulong4 __ovld __cnfn as_ulong4(double3);
+ulong4 __ovld __cnfn as_ulong4(double4);
+ulong8 __ovld __cnfn as_ulong8(double8);
+ulong16 __ovld __cnfn as_ulong16(double16);
+float2 __ovld __cnfn as_float2(double);
+float3 __ovld __cnfn as_float3(double2);
+float4 __ovld __cnfn as_float4(double2);
+float8 __ovld __cnfn as_float8(double3);
+float8 __ovld __cnfn as_float8(double4);
+float16 __ovld __cnfn as_float16(double8);
+double __ovld __cnfn as_double(char8);
+double __ovld __cnfn as_double(uchar8);
+double __ovld __cnfn as_double(short3);
+double __ovld __cnfn as_double(short4);
+double __ovld __cnfn as_double(ushort3);
+double __ovld __cnfn as_double(ushort4);
+double __ovld __cnfn as_double(int2);
+double __ovld __cnfn as_double(uint2);
+double __ovld __cnfn as_double(long);
+double __ovld __cnfn as_double(ulong);
+double __ovld __cnfn as_double(float2);
+double __ovld __cnfn as_double(double);
+double2 __ovld __cnfn as_double2(char16);
+double2 __ovld __cnfn as_double2(uchar16);
+double2 __ovld __cnfn as_double2(short8);
+double2 __ovld __cnfn as_double2(ushort8);
+double2 __ovld __cnfn as_double2(int3);
+double2 __ovld __cnfn as_double2(int4);
+double2 __ovld __cnfn as_double2(uint3);
+double2 __ovld __cnfn as_double2(uint4);
+double2 __ovld __cnfn as_double2(long2);
+double2 __ovld __cnfn as_double2(ulong2);
+double2 __ovld __cnfn as_double2(float3);
+double2 __ovld __cnfn as_double2(float4);
+double2 __ovld __cnfn as_double2(double2);
+double3 __ovld __cnfn as_double3(short16);
+double3 __ovld __cnfn as_double3(ushort16);
+double3 __ovld __cnfn as_double3(int8);
+double3 __ovld __cnfn as_double3(uint8);
+double3 __ovld __cnfn as_double3(long3);
+double3 __ovld __cnfn as_double3(long4);
+double3 __ovld __cnfn as_double3(ulong3);
+double3 __ovld __cnfn as_double3(ulong4);
+double3 __ovld __cnfn as_double3(float8);
+double3 __ovld __cnfn as_double3(double3);
+double3 __ovld __cnfn as_double3(double4);
+double4 __ovld __cnfn as_double4(short16);
+double4 __ovld __cnfn as_double4(ushort16);
+double4 __ovld __cnfn as_double4(int8);
+double4 __ovld __cnfn as_double4(uint8);
+double4 __ovld __cnfn as_double4(long3);
+double4 __ovld __cnfn as_double4(long4);
+double4 __ovld __cnfn as_double4(ulong3);
+double4 __ovld __cnfn as_double4(ulong4);
+double4 __ovld __cnfn as_double4(float8);
+double4 __ovld __cnfn as_double4(double3);
+double4 __ovld __cnfn as_double4(double4);
+double8 __ovld __cnfn as_double8(int16);
+double8 __ovld __cnfn as_double8(uint16);
+double8 __ovld __cnfn as_double8(long8);
+double8 __ovld __cnfn as_double8(ulong8);
+double8 __ovld __cnfn as_double8(float16);
+double8 __ovld __cnfn as_double8(double8);
+double16 __ovld __cnfn as_double16(long16);
+double16 __ovld __cnfn as_double16(ulong16);
+double16 __ovld __cnfn as_double16(double16);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+char2 __ovld __cnfn as_char2(half);
+char3 __ovld __cnfn as_char3(half2);
+char4 __ovld __cnfn as_char4(half2);
+char8 __ovld __cnfn as_char8(half3);
+char8 __ovld __cnfn as_char8(half4);
+char16 __ovld __cnfn as_char16(half8);
+uchar2 __ovld __cnfn as_uchar2(half);
+uchar3 __ovld __cnfn as_uchar3(half2);
+uchar4 __ovld __cnfn as_uchar4(half2);
+uchar8 __ovld __cnfn as_uchar8(half3);
+uchar8 __ovld __cnfn as_uchar8(half4);
+uchar16 __ovld __cnfn as_uchar16(half8);
+short __ovld __cnfn as_short(half);
+short2 __ovld __cnfn as_short2(half2);
+short3 __ovld __cnfn as_short3(half3);
+short3 __ovld __cnfn as_short3(half4);
+short4 __ovld __cnfn as_short4(half3);
+short4 __ovld __cnfn as_short4(half4);
+short8 __ovld __cnfn as_short8(half8);
+short16 __ovld __cnfn as_short16(half16);
+ushort __ovld __cnfn as_ushort(half);
+ushort2 __ovld __cnfn as_ushort2(half2);
+ushort3 __ovld __cnfn as_ushort3(half3);
+ushort3 __ovld __cnfn as_ushort3(half4);
+ushort4 __ovld __cnfn as_ushort4(half3);
+ushort4 __ovld __cnfn as_ushort4(half4);
+ushort8 __ovld __cnfn as_ushort8(half8);
+ushort16 __ovld __cnfn as_ushort16(half16);
+int __ovld __cnfn as_int(half2);
+int2 __ovld __cnfn as_int2(half3);
+int2 __ovld __cnfn as_int2(half4);
+int3 __ovld __cnfn as_int3(half8);
+int4 __ovld __cnfn as_int4(half8);
+int8 __ovld __cnfn as_int8(half16);
+uint __ovld __cnfn as_uint(half2);
+uint2 __ovld __cnfn as_uint2(half3);
+uint2 __ovld __cnfn as_uint2(half4);
+uint3 __ovld __cnfn as_uint3(half8);
+uint4 __ovld __cnfn as_uint4(half8);
+uint8 __ovld __cnfn as_uint8(half16);
+long __ovld __cnfn as_long(half3);
+long __ovld __cnfn as_long(half4);
+long2 __ovld __cnfn as_long2(half8);
+long3 __ovld __cnfn as_long3(half16);
+long4 __ovld __cnfn as_long4(half16);
+ulong __ovld __cnfn as_ulong(half3);
+ulong __ovld __cnfn as_ulong(half4);
+ulong2 __ovld __cnfn as_ulong2(half8);
+ulong3 __ovld __cnfn as_ulong3(half16);
+ulong4 __ovld __cnfn as_ulong4(half16);
+half __ovld __cnfn as_half(char2);
+half __ovld __cnfn as_half(uchar2);
+half __ovld __cnfn as_half(short);
+half __ovld __cnfn as_half(ushort);
+half __ovld __cnfn as_half(half);
+half2 __ovld __cnfn as_half2(char3);
+half2 __ovld __cnfn as_half2(char4);
+half2 __ovld __cnfn as_half2(uchar3);
+half2 __ovld __cnfn as_half2(uchar4);
+half2 __ovld __cnfn as_half2(short2);
+half2 __ovld __cnfn as_half2(ushort2);
+half2 __ovld __cnfn as_half2(int);
+half2 __ovld __cnfn as_half2(uint);
+half2 __ovld __cnfn as_half2(half2);
+half2 __ovld __cnfn as_half2(float);
+half3 __ovld __cnfn as_half3(char8);
+half3 __ovld __cnfn as_half3(uchar8);
+half3 __ovld __cnfn as_half3(short3);
+half3 __ovld __cnfn as_half3(short4);
+half3 __ovld __cnfn as_half3(ushort3);
+half3 __ovld __cnfn as_half3(ushort4);
+half3 __ovld __cnfn as_half3(int2);
+half3 __ovld __cnfn as_half3(uint2);
+half3 __ovld __cnfn as_half3(long);
+half3 __ovld __cnfn as_half3(ulong);
+half3 __ovld __cnfn as_half3(half3);
+half3 __ovld __cnfn as_half3(half4);
+half3 __ovld __cnfn as_half3(float2);
+half4 __ovld __cnfn as_half4(char8);
+half4 __ovld __cnfn as_half4(uchar8);
+half4 __ovld __cnfn as_half4(short3);
+half4 __ovld __cnfn as_half4(short4);
+half4 __ovld __cnfn as_half4(ushort3);
+half4 __ovld __cnfn as_half4(ushort4);
+half4 __ovld __cnfn as_half4(int2);
+half4 __ovld __cnfn as_half4(uint2);
+half4 __ovld __cnfn as_half4(long);
+half4 __ovld __cnfn as_half4(ulong);
+half4 __ovld __cnfn as_half4(half3);
+half4 __ovld __cnfn as_half4(half4);
+half4 __ovld __cnfn as_half4(float2);
+half8 __ovld __cnfn as_half8(char16);
+half8 __ovld __cnfn as_half8(uchar16);
+half8 __ovld __cnfn as_half8(short8);
+half8 __ovld __cnfn as_half8(ushort8);
+half8 __ovld __cnfn as_half8(int3);
+half8 __ovld __cnfn as_half8(int4);
+half8 __ovld __cnfn as_half8(uint3);
+half8 __ovld __cnfn as_half8(uint4);
+half8 __ovld __cnfn as_half8(long2);
+half8 __ovld __cnfn as_half8(ulong2);
+half8 __ovld __cnfn as_half8(half8);
+half8 __ovld __cnfn as_half8(float3);
+half8 __ovld __cnfn as_half8(float4);
+half16 __ovld __cnfn as_half16(short16);
+half16 __ovld __cnfn as_half16(ushort16);
+half16 __ovld __cnfn as_half16(int8);
+half16 __ovld __cnfn as_half16(uint8);
+half16 __ovld __cnfn as_half16(long3);
+half16 __ovld __cnfn as_half16(long4);
+half16 __ovld __cnfn as_half16(ulong3);
+half16 __ovld __cnfn as_half16(ulong4);
+half16 __ovld __cnfn as_half16(half16);
+half16 __ovld __cnfn as_half16(float8);
+float __ovld __cnfn as_float(half2);
+float2 __ovld __cnfn as_float2(half3);
+float2 __ovld __cnfn as_float2(half4);
+float3 __ovld __cnfn as_float3(half8);
+float4 __ovld __cnfn as_float4(half8);
+float8 __ovld __cnfn as_float8(half16);
+
+#ifdef cl_khr_fp64
+half3 __ovld __cnfn as_half3(double);
+half4 __ovld __cnfn as_half4(double);
+half8 __ovld __cnfn as_half8(double2);
+half16 __ovld __cnfn as_half16(double3);
+half16 __ovld __cnfn as_half16(double4);
+double __ovld __cnfn as_double(half3);
+double __ovld __cnfn as_double(half4);
+double2 __ovld __cnfn as_double2(half8);
+double3 __ovld __cnfn as_double3(half16);
+double4 __ovld __cnfn as_double4(half16);
+#endif //cl_khr_fp64
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
+
+#define __kernel_exec(X, typen) __kernel \
+	__attribute__((work_group_size_hint(X, 1, 1))) \
+	__attribute__((vec_type_hint(typen)))
+
+#define kernel_exec(X, typen) __kernel \
+	__attribute__((work_group_size_hint(X, 1, 1))) \
+	__attribute__((vec_type_hint(typen)))
+
+// OpenCL v1.1 s6.11.1, v1.2 s6.12.1, v2.0 s6.13.1 - Work-item Functions
+
+/**
+ * Returns the number of dimensions in use. This is the
+ * value given to the work_dim argument specified in
+ * clEnqueueNDRangeKernel.
+ * For clEnqueueTask, this returns 1.
+ */
+uint __ovld __cnfn get_work_dim(void);
+
+/**
+ * Returns the number of global work-items specified for
+ * dimension identified by dimindx. This value is given by
+ * the global_work_size argument to
+ * clEnqueueNDRangeKernel. Valid values of dimindx
+ * are 0 to get_work_dim() - 1. For other values of
+ * dimindx, get_global_size() returns 1.
+ * For clEnqueueTask, this always returns 1.
+ */
+size_t __ovld __cnfn get_global_size(uint dimindx);
+
+/**
+ * Returns the unique global work-item ID value for
+ * dimension identified by dimindx. The global work-item
+ * ID specifies the work-item ID based on the number of
+ * global work-items specified to execute the kernel. Valid
+ * values of dimindx are 0 to get_work_dim() - 1. For
+ * other values of dimindx, get_global_id() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_global_id(uint dimindx);
+
+/**
+ * Returns the number of local work-items specified in
+ * dimension identified by dimindx. This value is given by
+ * the local_work_size argument to
+ * clEnqueueNDRangeKernel if local_work_size is not
+ * NULL; otherwise the OpenCL implementation chooses
+ * an appropriate local_work_size value which is returned
+ * by this function. Valid values of dimindx are 0 to
+ * get_work_dim() - 1. For other values of dimindx,
+ * get_local_size() returns 1.
+ * For clEnqueueTask, this always returns 1.
+ */
+size_t __ovld __cnfn get_local_size(uint dimindx);
+
+/**
+ * Returns the unique local work-item ID i.e. a work-item
+ * within a specific work-group for dimension identified by
+ * dimindx. Valid values of dimindx are 0 to
+ * get_work_dim() - 1. For other values of dimindx,
+ * get_local_id() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_local_id(uint dimindx);
+
+/**
+ * Returns the number of work-groups that will execute a
+ * kernel for dimension identified by dimindx.
+ * Valid values of dimindx are 0 to get_work_dim() - 1.
+ * For other values of dimindx, get_num_groups () returns
+ * 1.
+ * For clEnqueueTask, this always returns 1.
+ */
+size_t __ovld __cnfn get_num_groups(uint dimindx);
+
+/**
+ * get_group_id returns the work-group ID which is a
+ * number from 0 .. get_num_groups(dimindx) - 1.
+ * Valid values of dimindx are 0 to get_work_dim() - 1.
+ * For other values, get_group_id() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_group_id(uint dimindx);
+
+/**
+ * get_global_offset returns the offset values specified in
+ * global_work_offset argument to
+ * clEnqueueNDRangeKernel.
+ * Valid values of dimindx are 0 to get_work_dim() - 1.
+ * For other values, get_global_offset() returns 0.
+ * For clEnqueueTask, this returns 0.
+ */
+size_t __ovld __cnfn get_global_offset(uint dimindx);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+size_t __ovld get_enqueued_local_size(uint dimindx);
+size_t __ovld get_global_linear_id(void);
+size_t __ovld get_local_linear_id(void);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.2, v1.2 s6.12.2, v2.0 s6.13.2 - Math functions
+
+/**
+ * Arc cosine function.
+ */
+float __ovld __cnfn acos(float);
+float2 __ovld __cnfn acos(float2);
+float3 __ovld __cnfn acos(float3);
+float4 __ovld __cnfn acos(float4);
+float8 __ovld __cnfn acos(float8);
+float16 __ovld __cnfn acos(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn acos(double);
+double2 __ovld __cnfn acos(double2);
+double3 __ovld __cnfn acos(double3);
+double4 __ovld __cnfn acos(double4);
+double8 __ovld __cnfn acos(double8);
+double16 __ovld __cnfn acos(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn acos(half);
+half2 __ovld __cnfn acos(half2);
+half3 __ovld __cnfn acos(half3);
+half4 __ovld __cnfn acos(half4);
+half8 __ovld __cnfn acos(half8);
+half16 __ovld __cnfn acos(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Inverse hyperbolic cosine.
+ */
+float __ovld __cnfn acosh(float);
+float2 __ovld __cnfn acosh(float2);
+float3 __ovld __cnfn acosh(float3);
+float4 __ovld __cnfn acosh(float4);
+float8 __ovld __cnfn acosh(float8);
+float16 __ovld __cnfn acosh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn acosh(double);
+double2 __ovld __cnfn acosh(double2);
+double3 __ovld __cnfn acosh(double3);
+double4 __ovld __cnfn acosh(double4);
+double8 __ovld __cnfn acosh(double8);
+double16 __ovld __cnfn acosh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn acosh(half);
+half2 __ovld __cnfn acosh(half2);
+half3 __ovld __cnfn acosh(half3);
+half4 __ovld __cnfn acosh(half4);
+half8 __ovld __cnfn acosh(half8);
+half16 __ovld __cnfn acosh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute acos (x) / PI.
+ */
+float __ovld __cnfn acospi(float x);
+float2 __ovld __cnfn acospi(float2 x);
+float3 __ovld __cnfn acospi(float3 x);
+float4 __ovld __cnfn acospi(float4 x);
+float8 __ovld __cnfn acospi(float8 x);
+float16 __ovld __cnfn acospi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn acospi(double x);
+double2 __ovld __cnfn acospi(double2 x);
+double3 __ovld __cnfn acospi(double3 x);
+double4 __ovld __cnfn acospi(double4 x);
+double8 __ovld __cnfn acospi(double8 x);
+double16 __ovld __cnfn acospi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn acospi(half x);
+half2 __ovld __cnfn acospi(half2 x);
+half3 __ovld __cnfn acospi(half3 x);
+half4 __ovld __cnfn acospi(half4 x);
+half8 __ovld __cnfn acospi(half8 x);
+half16 __ovld __cnfn acospi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Arc sine function.
+ */
+float __ovld __cnfn asin(float);
+float2 __ovld __cnfn asin(float2);
+float3 __ovld __cnfn asin(float3);
+float4 __ovld __cnfn asin(float4);
+float8 __ovld __cnfn asin(float8);
+float16 __ovld __cnfn asin(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn asin(double);
+double2 __ovld __cnfn asin(double2);
+double3 __ovld __cnfn asin(double3);
+double4 __ovld __cnfn asin(double4);
+double8 __ovld __cnfn asin(double8);
+double16 __ovld __cnfn asin(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn asin(half);
+half2 __ovld __cnfn asin(half2);
+half3 __ovld __cnfn asin(half3);
+half4 __ovld __cnfn asin(half4);
+half8 __ovld __cnfn asin(half8);
+half16 __ovld __cnfn asin(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Inverse hyperbolic sine.
+ */
+float __ovld __cnfn asinh(float);
+float2 __ovld __cnfn asinh(float2);
+float3 __ovld __cnfn asinh(float3);
+float4 __ovld __cnfn asinh(float4);
+float8 __ovld __cnfn asinh(float8);
+float16 __ovld __cnfn asinh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn asinh(double);
+double2 __ovld __cnfn asinh(double2);
+double3 __ovld __cnfn asinh(double3);
+double4 __ovld __cnfn asinh(double4);
+double8 __ovld __cnfn asinh(double8);
+double16 __ovld __cnfn asinh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn asinh(half);
+half2 __ovld __cnfn asinh(half2);
+half3 __ovld __cnfn asinh(half3);
+half4 __ovld __cnfn asinh(half4);
+half8 __ovld __cnfn asinh(half8);
+half16 __ovld __cnfn asinh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute asin (x) / PI.
+ */
+float __ovld __cnfn asinpi(float x);
+float2 __ovld __cnfn asinpi(float2 x);
+float3 __ovld __cnfn asinpi(float3 x);
+float4 __ovld __cnfn asinpi(float4 x);
+float8 __ovld __cnfn asinpi(float8 x);
+float16 __ovld __cnfn asinpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn asinpi(double x);
+double2 __ovld __cnfn asinpi(double2 x);
+double3 __ovld __cnfn asinpi(double3 x);
+double4 __ovld __cnfn asinpi(double4 x);
+double8 __ovld __cnfn asinpi(double8 x);
+double16 __ovld __cnfn asinpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn asinpi(half x);
+half2 __ovld __cnfn asinpi(half2 x);
+half3 __ovld __cnfn asinpi(half3 x);
+half4 __ovld __cnfn asinpi(half4 x);
+half8 __ovld __cnfn asinpi(half8 x);
+half16 __ovld __cnfn asinpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Arc tangent function.
+ */
+float __ovld __cnfn atan(float y_over_x);
+float2 __ovld __cnfn atan(float2 y_over_x);
+float3 __ovld __cnfn atan(float3 y_over_x);
+float4 __ovld __cnfn atan(float4 y_over_x);
+float8 __ovld __cnfn atan(float8 y_over_x);
+float16 __ovld __cnfn atan(float16 y_over_x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atan(double y_over_x);
+double2 __ovld __cnfn atan(double2 y_over_x);
+double3 __ovld __cnfn atan(double3 y_over_x);
+double4 __ovld __cnfn atan(double4 y_over_x);
+double8 __ovld __cnfn atan(double8 y_over_x);
+double16 __ovld __cnfn atan(double16 y_over_x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atan(half y_over_x);
+half2 __ovld __cnfn atan(half2 y_over_x);
+half3 __ovld __cnfn atan(half3 y_over_x);
+half4 __ovld __cnfn atan(half4 y_over_x);
+half8 __ovld __cnfn atan(half8 y_over_x);
+half16 __ovld __cnfn atan(half16 y_over_x);
+#endif //cl_khr_fp16
+
+/**
+ * Arc tangent of y / x.
+ */
+float __ovld __cnfn atan2(float y, float x);
+float2 __ovld __cnfn atan2(float2 y, float2 x);
+float3 __ovld __cnfn atan2(float3 y, float3 x);
+float4 __ovld __cnfn atan2(float4 y, float4 x);
+float8 __ovld __cnfn atan2(float8 y, float8 x);
+float16 __ovld __cnfn atan2(float16 y, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atan2(double y, double x);
+double2 __ovld __cnfn atan2(double2 y, double2 x);
+double3 __ovld __cnfn atan2(double3 y, double3 x);
+double4 __ovld __cnfn atan2(double4 y, double4 x);
+double8 __ovld __cnfn atan2(double8 y, double8 x);
+double16 __ovld __cnfn atan2(double16 y, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atan2(half y, half x);
+half2 __ovld __cnfn atan2(half2 y, half2 x);
+half3 __ovld __cnfn atan2(half3 y, half3 x);
+half4 __ovld __cnfn atan2(half4 y, half4 x);
+half8 __ovld __cnfn atan2(half8 y, half8 x);
+half16 __ovld __cnfn atan2(half16 y, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Hyperbolic arc tangent.
+ */
+float __ovld __cnfn atanh(float);
+float2 __ovld __cnfn atanh(float2);
+float3 __ovld __cnfn atanh(float3);
+float4 __ovld __cnfn atanh(float4);
+float8 __ovld __cnfn atanh(float8);
+float16 __ovld __cnfn atanh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atanh(double);
+double2 __ovld __cnfn atanh(double2);
+double3 __ovld __cnfn atanh(double3);
+double4 __ovld __cnfn atanh(double4);
+double8 __ovld __cnfn atanh(double8);
+double16 __ovld __cnfn atanh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atanh(half);
+half2 __ovld __cnfn atanh(half2);
+half3 __ovld __cnfn atanh(half3);
+half4 __ovld __cnfn atanh(half4);
+half8 __ovld __cnfn atanh(half8);
+half16 __ovld __cnfn atanh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute atan (x) / PI.
+ */
+float __ovld __cnfn atanpi(float x);
+float2 __ovld __cnfn atanpi(float2 x);
+float3 __ovld __cnfn atanpi(float3 x);
+float4 __ovld __cnfn atanpi(float4 x);
+float8 __ovld __cnfn atanpi(float8 x);
+float16 __ovld __cnfn atanpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atanpi(double x);
+double2 __ovld __cnfn atanpi(double2 x);
+double3 __ovld __cnfn atanpi(double3 x);
+double4 __ovld __cnfn atanpi(double4 x);
+double8 __ovld __cnfn atanpi(double8 x);
+double16 __ovld __cnfn atanpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atanpi(half x);
+half2 __ovld __cnfn atanpi(half2 x);
+half3 __ovld __cnfn atanpi(half3 x);
+half4 __ovld __cnfn atanpi(half4 x);
+half8 __ovld __cnfn atanpi(half8 x);
+half16 __ovld __cnfn atanpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute atan2 (y, x) / PI.
+ */
+float __ovld __cnfn atan2pi(float y, float x);
+float2 __ovld __cnfn atan2pi(float2 y, float2 x);
+float3 __ovld __cnfn atan2pi(float3 y, float3 x);
+float4 __ovld __cnfn atan2pi(float4 y, float4 x);
+float8 __ovld __cnfn atan2pi(float8 y, float8 x);
+float16 __ovld __cnfn atan2pi(float16 y, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn atan2pi(double y, double x);
+double2 __ovld __cnfn atan2pi(double2 y, double2 x);
+double3 __ovld __cnfn atan2pi(double3 y, double3 x);
+double4 __ovld __cnfn atan2pi(double4 y, double4 x);
+double8 __ovld __cnfn atan2pi(double8 y, double8 x);
+double16 __ovld __cnfn atan2pi(double16 y, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn atan2pi(half y, half x);
+half2 __ovld __cnfn atan2pi(half2 y, half2 x);
+half3 __ovld __cnfn atan2pi(half3 y, half3 x);
+half4 __ovld __cnfn atan2pi(half4 y, half4 x);
+half8 __ovld __cnfn atan2pi(half8 y, half8 x);
+half16 __ovld __cnfn atan2pi(half16 y, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cube-root.
+ */
+float __ovld __cnfn cbrt(float);
+float2 __ovld __cnfn cbrt(float2);
+float3 __ovld __cnfn cbrt(float3);
+float4 __ovld __cnfn cbrt(float4);
+float8 __ovld __cnfn cbrt(float8);
+float16 __ovld __cnfn cbrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cbrt(double);
+double2 __ovld __cnfn cbrt(double2);
+double3 __ovld __cnfn cbrt(double3);
+double4 __ovld __cnfn cbrt(double4);
+double8 __ovld __cnfn cbrt(double8);
+double16 __ovld __cnfn cbrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cbrt(half);
+half2 __ovld __cnfn cbrt(half2);
+half3 __ovld __cnfn cbrt(half3);
+half4 __ovld __cnfn cbrt(half4);
+half8 __ovld __cnfn cbrt(half8);
+half16 __ovld __cnfn cbrt(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Round to integral value using the round to positive
+ * infinity rounding mode.
+ */
+float __ovld __cnfn ceil(float);
+float2 __ovld __cnfn ceil(float2);
+float3 __ovld __cnfn ceil(float3);
+float4 __ovld __cnfn ceil(float4);
+float8 __ovld __cnfn ceil(float8);
+float16 __ovld __cnfn ceil(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn ceil(double);
+double2 __ovld __cnfn ceil(double2);
+double3 __ovld __cnfn ceil(double3);
+double4 __ovld __cnfn ceil(double4);
+double8 __ovld __cnfn ceil(double8);
+double16 __ovld __cnfn ceil(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn ceil(half);
+half2 __ovld __cnfn ceil(half2);
+half3 __ovld __cnfn ceil(half3);
+half4 __ovld __cnfn ceil(half4);
+half8 __ovld __cnfn ceil(half8);
+half16 __ovld __cnfn ceil(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Returns x with its sign changed to match the sign of y.
+ */
+float __ovld __cnfn copysign(float x, float y);
+float2 __ovld __cnfn copysign(float2 x, float2 y);
+float3 __ovld __cnfn copysign(float3 x, float3 y);
+float4 __ovld __cnfn copysign(float4 x, float4 y);
+float8 __ovld __cnfn copysign(float8 x, float8 y);
+float16 __ovld __cnfn copysign(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn copysign(double x, double y);
+double2 __ovld __cnfn copysign(double2 x, double2 y);
+double3 __ovld __cnfn copysign(double3 x, double3 y);
+double4 __ovld __cnfn copysign(double4 x, double4 y);
+double8 __ovld __cnfn copysign(double8 x, double8 y);
+double16 __ovld __cnfn copysign(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn copysign(half x, half y);
+half2 __ovld __cnfn copysign(half2 x, half2 y);
+half3 __ovld __cnfn copysign(half3 x, half3 y);
+half4 __ovld __cnfn copysign(half4 x, half4 y);
+half8 __ovld __cnfn copysign(half8 x, half8 y);
+half16 __ovld __cnfn copysign(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cosine.
+ */
+float __ovld __cnfn cos(float);
+float2 __ovld __cnfn cos(float2);
+float3 __ovld __cnfn cos(float3);
+float4 __ovld __cnfn cos(float4);
+float8 __ovld __cnfn cos(float8);
+float16 __ovld __cnfn cos(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cos(double);
+double2 __ovld __cnfn cos(double2);
+double3 __ovld __cnfn cos(double3);
+double4 __ovld __cnfn cos(double4);
+double8 __ovld __cnfn cos(double8);
+double16 __ovld __cnfn cos(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cos(half);
+half2 __ovld __cnfn cos(half2);
+half3 __ovld __cnfn cos(half3);
+half4 __ovld __cnfn cos(half4);
+half8 __ovld __cnfn cos(half8);
+half16 __ovld __cnfn cos(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute hyperbolic cosine.
+ */
+float __ovld __cnfn cosh(float);
+float2 __ovld __cnfn cosh(float2);
+float3 __ovld __cnfn cosh(float3);
+float4 __ovld __cnfn cosh(float4);
+float8 __ovld __cnfn cosh(float8);
+float16 __ovld __cnfn cosh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cosh(double);
+double2 __ovld __cnfn cosh(double2);
+double3 __ovld __cnfn cosh(double3);
+double4 __ovld __cnfn cosh(double4);
+double8 __ovld __cnfn cosh(double8);
+double16 __ovld __cnfn cosh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cosh(half);
+half2 __ovld __cnfn cosh(half2);
+half3 __ovld __cnfn cosh(half3);
+half4 __ovld __cnfn cosh(half4);
+half8 __ovld __cnfn cosh(half8);
+half16 __ovld __cnfn cosh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cos (PI * x).
+ */
+float __ovld __cnfn cospi(float x);
+float2 __ovld __cnfn cospi(float2 x);
+float3 __ovld __cnfn cospi(float3 x);
+float4 __ovld __cnfn cospi(float4 x);
+float8 __ovld __cnfn cospi(float8 x);
+float16 __ovld __cnfn cospi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn cospi(double x);
+double2 __ovld __cnfn cospi(double2 x);
+double3 __ovld __cnfn cospi(double3 x);
+double4 __ovld __cnfn cospi(double4 x);
+double8 __ovld __cnfn cospi(double8 x);
+double16 __ovld __cnfn cospi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn cospi(half x);
+half2 __ovld __cnfn cospi(half2 x);
+half3 __ovld __cnfn cospi(half3 x);
+half4 __ovld __cnfn cospi(half4 x);
+half8 __ovld __cnfn cospi(half8 x);
+half16 __ovld __cnfn cospi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Complementary error function.
+ */
+float __ovld __cnfn erfc(float);
+float2 __ovld __cnfn erfc(float2);
+float3 __ovld __cnfn erfc(float3);
+float4 __ovld __cnfn erfc(float4);
+float8 __ovld __cnfn erfc(float8);
+float16 __ovld __cnfn erfc(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn erfc(double);
+double2 __ovld __cnfn erfc(double2);
+double3 __ovld __cnfn erfc(double3);
+double4 __ovld __cnfn erfc(double4);
+double8 __ovld __cnfn erfc(double8);
+double16 __ovld __cnfn erfc(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn erfc(half);
+half2 __ovld __cnfn erfc(half2);
+half3 __ovld __cnfn erfc(half3);
+half4 __ovld __cnfn erfc(half4);
+half8 __ovld __cnfn erfc(half8);
+half16 __ovld __cnfn erfc(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Error function encountered in integrating the
+ * normal distribution.
+ */
+float __ovld __cnfn erf(float);
+float2 __ovld __cnfn erf(float2);
+float3 __ovld __cnfn erf(float3);
+float4 __ovld __cnfn erf(float4);
+float8 __ovld __cnfn erf(float8);
+float16 __ovld __cnfn erf(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn erf(double);
+double2 __ovld __cnfn erf(double2);
+double3 __ovld __cnfn erf(double3);
+double4 __ovld __cnfn erf(double4);
+double8 __ovld __cnfn erf(double8);
+double16 __ovld __cnfn erf(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn erf(half);
+half2 __ovld __cnfn erf(half2);
+half3 __ovld __cnfn erf(half3);
+half4 __ovld __cnfn erf(half4);
+half8 __ovld __cnfn erf(half8);
+half16 __ovld __cnfn erf(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the base e exponential function of x.
+ */
+float __ovld __cnfn exp(float x);
+float2 __ovld __cnfn exp(float2 x);
+float3 __ovld __cnfn exp(float3 x);
+float4 __ovld __cnfn exp(float4 x);
+float8 __ovld __cnfn exp(float8 x);
+float16 __ovld __cnfn exp(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn exp(double x);
+double2 __ovld __cnfn exp(double2 x);
+double3 __ovld __cnfn exp(double3 x);
+double4 __ovld __cnfn exp(double4 x);
+double8 __ovld __cnfn exp(double8 x);
+double16 __ovld __cnfn exp(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn exp(half x);
+half2 __ovld __cnfn exp(half2 x);
+half3 __ovld __cnfn exp(half3 x);
+half4 __ovld __cnfn exp(half4 x);
+half8 __ovld __cnfn exp(half8 x);
+half16 __ovld __cnfn exp(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Exponential base 2 function.
+ */
+float __ovld __cnfn exp2(float);
+float2 __ovld __cnfn exp2(float2);
+float3 __ovld __cnfn exp2(float3);
+float4 __ovld __cnfn exp2(float4);
+float8 __ovld __cnfn exp2(float8);
+float16 __ovld __cnfn exp2(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn exp2(double);
+double2 __ovld __cnfn exp2(double2);
+double3 __ovld __cnfn exp2(double3);
+double4 __ovld __cnfn exp2(double4);
+double8 __ovld __cnfn exp2(double8);
+double16 __ovld __cnfn exp2(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn exp2(half);
+half2 __ovld __cnfn exp2(half2);
+half3 __ovld __cnfn exp2(half3);
+half4 __ovld __cnfn exp2(half4);
+half8 __ovld __cnfn exp2(half8);
+half16 __ovld __cnfn exp2(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Exponential base 10 function.
+ */
+float __ovld __cnfn exp10(float);
+float2 __ovld __cnfn exp10(float2);
+float3 __ovld __cnfn exp10(float3);
+float4 __ovld __cnfn exp10(float4);
+float8 __ovld __cnfn exp10(float8);
+float16 __ovld __cnfn exp10(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn exp10(double);
+double2 __ovld __cnfn exp10(double2);
+double3 __ovld __cnfn exp10(double3);
+double4 __ovld __cnfn exp10(double4);
+double8 __ovld __cnfn exp10(double8);
+double16 __ovld __cnfn exp10(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn exp10(half);
+half2 __ovld __cnfn exp10(half2);
+half3 __ovld __cnfn exp10(half3);
+half4 __ovld __cnfn exp10(half4);
+half8 __ovld __cnfn exp10(half8);
+half16 __ovld __cnfn exp10(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute e^x- 1.0.
+ */
+float __ovld __cnfn expm1(float x);
+float2 __ovld __cnfn expm1(float2 x);
+float3 __ovld __cnfn expm1(float3 x);
+float4 __ovld __cnfn expm1(float4 x);
+float8 __ovld __cnfn expm1(float8 x);
+float16 __ovld __cnfn expm1(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn expm1(double x);
+double2 __ovld __cnfn expm1(double2 x);
+double3 __ovld __cnfn expm1(double3 x);
+double4 __ovld __cnfn expm1(double4 x);
+double8 __ovld __cnfn expm1(double8 x);
+double16 __ovld __cnfn expm1(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn expm1(half x);
+half2 __ovld __cnfn expm1(half2 x);
+half3 __ovld __cnfn expm1(half3 x);
+half4 __ovld __cnfn expm1(half4 x);
+half8 __ovld __cnfn expm1(half8 x);
+half16 __ovld __cnfn expm1(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute absolute value of a floating-point number.
+ */
+float __ovld __cnfn fabs(float);
+float2 __ovld __cnfn fabs(float2);
+float3 __ovld __cnfn fabs(float3);
+float4 __ovld __cnfn fabs(float4);
+float8 __ovld __cnfn fabs(float8);
+float16 __ovld __cnfn fabs(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fabs(double);
+double2 __ovld __cnfn fabs(double2);
+double3 __ovld __cnfn fabs(double3);
+double4 __ovld __cnfn fabs(double4);
+double8 __ovld __cnfn fabs(double8);
+double16 __ovld __cnfn fabs(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fabs(half);
+half2 __ovld __cnfn fabs(half2);
+half3 __ovld __cnfn fabs(half3);
+half4 __ovld __cnfn fabs(half4);
+half8 __ovld __cnfn fabs(half8);
+half16 __ovld __cnfn fabs(half16);
+#endif //cl_khr_fp16
+
+/**
+ * x - y if x > y, +0 if x is less than or equal to y.
+ */
+float __ovld __cnfn fdim(float x, float y);
+float2 __ovld __cnfn fdim(float2 x, float2 y);
+float3 __ovld __cnfn fdim(float3 x, float3 y);
+float4 __ovld __cnfn fdim(float4 x, float4 y);
+float8 __ovld __cnfn fdim(float8 x, float8 y);
+float16 __ovld __cnfn fdim(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fdim(double x, double y);
+double2 __ovld __cnfn fdim(double2 x, double2 y);
+double3 __ovld __cnfn fdim(double3 x, double3 y);
+double4 __ovld __cnfn fdim(double4 x, double4 y);
+double8 __ovld __cnfn fdim(double8 x, double8 y);
+double16 __ovld __cnfn fdim(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fdim(half x, half y);
+half2 __ovld __cnfn fdim(half2 x, half2 y);
+half3 __ovld __cnfn fdim(half3 x, half3 y);
+half4 __ovld __cnfn fdim(half4 x, half4 y);
+half8 __ovld __cnfn fdim(half8 x, half8 y);
+half16 __ovld __cnfn fdim(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Round to integral value using the round to -ve
+ * infinity rounding mode.
+ */
+float __ovld __cnfn floor(float);
+float2 __ovld __cnfn floor(float2);
+float3 __ovld __cnfn floor(float3);
+float4 __ovld __cnfn floor(float4);
+float8 __ovld __cnfn floor(float8);
+float16 __ovld __cnfn floor(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn floor(double);
+double2 __ovld __cnfn floor(double2);
+double3 __ovld __cnfn floor(double3);
+double4 __ovld __cnfn floor(double4);
+double8 __ovld __cnfn floor(double8);
+double16 __ovld __cnfn floor(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn floor(half);
+half2 __ovld __cnfn floor(half2);
+half3 __ovld __cnfn floor(half3);
+half4 __ovld __cnfn floor(half4);
+half8 __ovld __cnfn floor(half8);
+half16 __ovld __cnfn floor(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the correctly rounded floating-point
+ * representation of the sum of c with the infinitely
+ * precise product of a and b. Rounding of
+ * intermediate products shall not occur. Edge case
+ * behavior is per the IEEE 754-2008 standard.
+ */
+float __ovld __cnfn fma(float a, float b, float c);
+float2 __ovld __cnfn fma(float2 a, float2 b, float2 c);
+float3 __ovld __cnfn fma(float3 a, float3 b, float3 c);
+float4 __ovld __cnfn fma(float4 a, float4 b, float4 c);
+float8 __ovld __cnfn fma(float8 a, float8 b, float8 c);
+float16 __ovld __cnfn fma(float16 a, float16 b, float16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fma(double a, double b, double c);
+double2 __ovld __cnfn fma(double2 a, double2 b, double2 c);
+double3 __ovld __cnfn fma(double3 a, double3 b, double3 c);
+double4 __ovld __cnfn fma(double4 a, double4 b, double4 c);
+double8 __ovld __cnfn fma(double8 a, double8 b, double8 c);
+double16 __ovld __cnfn fma(double16 a, double16 b, double16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fma(half a, half b, half c);
+half2 __ovld __cnfn fma(half2 a, half2 b, half2 c);
+half3 __ovld __cnfn fma(half3 a, half3 b, half3 c);
+half4 __ovld __cnfn fma(half4 a, half4 b, half4 c);
+half8 __ovld __cnfn fma(half8 a, half8 b, half8 c);
+half16 __ovld __cnfn fma(half16 a, half16 b, half16 c);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if x < y, otherwise it returns x. If one
+ * argument is a NaN, fmax() returns the other
+ * argument. If both arguments are NaNs, fmax()
+ * returns a NaN.
+ */
+float __ovld __cnfn fmax(float x, float y);
+float2 __ovld __cnfn fmax(float2 x, float2 y);
+float3 __ovld __cnfn fmax(float3 x, float3 y);
+float4 __ovld __cnfn fmax(float4 x, float4 y);
+float8 __ovld __cnfn fmax(float8 x, float8 y);
+float16 __ovld __cnfn fmax(float16 x, float16 y);
+float2 __ovld __cnfn fmax(float2 x, float y);
+float3 __ovld __cnfn fmax(float3 x, float y);
+float4 __ovld __cnfn fmax(float4 x, float y);
+float8 __ovld __cnfn fmax(float8 x, float y);
+float16 __ovld __cnfn fmax(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fmax(double x, double y);
+double2 __ovld __cnfn fmax(double2 x, double2 y);
+double3 __ovld __cnfn fmax(double3 x, double3 y);
+double4 __ovld __cnfn fmax(double4 x, double4 y);
+double8 __ovld __cnfn fmax(double8 x, double8 y);
+double16 __ovld __cnfn fmax(double16 x, double16 y);
+double2 __ovld __cnfn fmax(double2 x, double y);
+double3 __ovld __cnfn fmax(double3 x, double y);
+double4 __ovld __cnfn fmax(double4 x, double y);
+double8 __ovld __cnfn fmax(double8 x, double y);
+double16 __ovld __cnfn fmax(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fmax(half x, half y);
+half2 __ovld __cnfn fmax(half2 x, half2 y);
+half3 __ovld __cnfn fmax(half3 x, half3 y);
+half4 __ovld __cnfn fmax(half4 x, half4 y);
+half8 __ovld __cnfn fmax(half8 x, half8 y);
+half16 __ovld __cnfn fmax(half16 x, half16 y);
+half2 __ovld __cnfn fmax(half2 x, half y);
+half3 __ovld __cnfn fmax(half3 x, half y);
+half4 __ovld __cnfn fmax(half4 x, half y);
+half8 __ovld __cnfn fmax(half8 x, half y);
+half16 __ovld __cnfn fmax(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if y < x, otherwise it returns x. If one
+ * argument is a NaN, fmin() returns the other
+ * argument. If both arguments are NaNs, fmin()
+ * returns a NaN.
+ */
+float __ovld __cnfn fmin(float x, float y);
+float2 __ovld __cnfn fmin(float2 x, float2 y);
+float3 __ovld __cnfn fmin(float3 x, float3 y);
+float4 __ovld __cnfn fmin(float4 x, float4 y);
+float8 __ovld __cnfn fmin(float8 x, float8 y);
+float16 __ovld __cnfn fmin(float16 x, float16 y);
+float2 __ovld __cnfn fmin(float2 x, float y);
+float3 __ovld __cnfn fmin(float3 x, float y);
+float4 __ovld __cnfn fmin(float4 x, float y);
+float8 __ovld __cnfn fmin(float8 x, float y);
+float16 __ovld __cnfn fmin(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fmin(double x, double y);
+double2 __ovld __cnfn fmin(double2 x, double2 y);
+double3 __ovld __cnfn fmin(double3 x, double3 y);
+double4 __ovld __cnfn fmin(double4 x, double4 y);
+double8 __ovld __cnfn fmin(double8 x, double8 y);
+double16 __ovld __cnfn fmin(double16 x, double16 y);
+double2 __ovld __cnfn fmin(double2 x, double y);
+double3 __ovld __cnfn fmin(double3 x, double y);
+double4 __ovld __cnfn fmin(double4 x, double y);
+double8 __ovld __cnfn fmin(double8 x, double y);
+double16 __ovld __cnfn fmin(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fmin(half x, half y);
+half2 __ovld __cnfn fmin(half2 x, half2 y);
+half3 __ovld __cnfn fmin(half3 x, half3 y);
+half4 __ovld __cnfn fmin(half4 x, half4 y);
+half8 __ovld __cnfn fmin(half8 x, half8 y);
+half16 __ovld __cnfn fmin(half16 x, half16 y);
+half2 __ovld __cnfn fmin(half2 x, half y);
+half3 __ovld __cnfn fmin(half3 x, half y);
+half4 __ovld __cnfn fmin(half4 x, half y);
+half8 __ovld __cnfn fmin(half8 x, half y);
+half16 __ovld __cnfn fmin(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Modulus. Returns x - y * trunc (x/y).
+ */
+float __ovld __cnfn fmod(float x, float y);
+float2 __ovld __cnfn fmod(float2 x, float2 y);
+float3 __ovld __cnfn fmod(float3 x, float3 y);
+float4 __ovld __cnfn fmod(float4 x, float4 y);
+float8 __ovld __cnfn fmod(float8 x, float8 y);
+float16 __ovld __cnfn fmod(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn fmod(double x, double y);
+double2 __ovld __cnfn fmod(double2 x, double2 y);
+double3 __ovld __cnfn fmod(double3 x, double3 y);
+double4 __ovld __cnfn fmod(double4 x, double4 y);
+double8 __ovld __cnfn fmod(double8 x, double8 y);
+double16 __ovld __cnfn fmod(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn fmod(half x, half y);
+half2 __ovld __cnfn fmod(half2 x, half2 y);
+half3 __ovld __cnfn fmod(half3 x, half3 y);
+half4 __ovld __cnfn fmod(half4 x, half4 y);
+half8 __ovld __cnfn fmod(half8 x, half8 y);
+half16 __ovld __cnfn fmod(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns fmin(x - floor (x), 0x1.fffffep-1f ).
+ * floor(x) is returned in iptr.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld fract(float x, float *iptr);
+float2 __ovld fract(float2 x, float2 *iptr);
+float3 __ovld fract(float3 x, float3 *iptr);
+float4 __ovld fract(float4 x, float4 *iptr);
+float8 __ovld fract(float8 x, float8 *iptr);
+float16 __ovld fract(float16 x, float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld fract(double x, double *iptr);
+double2 __ovld fract(double2 x, double2 *iptr);
+double3 __ovld fract(double3 x, double3 *iptr);
+double4 __ovld fract(double4 x, double4 *iptr);
+double8 __ovld fract(double8 x, double8 *iptr);
+double16 __ovld fract(double16 x, double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld fract(half x, half *iptr);
+half2 __ovld fract(half2 x, half2 *iptr);
+half3 __ovld fract(half3 x, half3 *iptr);
+half4 __ovld fract(half4 x, half4 *iptr);
+half8 __ovld fract(half8 x, half8 *iptr);
+half16 __ovld fract(half16 x, half16 *iptr);
+#endif //cl_khr_fp16
+#else
+float __ovld fract(float x, __global float *iptr);
+float2 __ovld fract(float2 x, __global float2 *iptr);
+float3 __ovld fract(float3 x, __global float3 *iptr);
+float4 __ovld fract(float4 x, __global float4 *iptr);
+float8 __ovld fract(float8 x, __global float8 *iptr);
+float16 __ovld fract(float16 x, __global float16 *iptr);
+float __ovld fract(float x, __local float *iptr);
+float2 __ovld fract(float2 x, __local float2 *iptr);
+float3 __ovld fract(float3 x, __local float3 *iptr);
+float4 __ovld fract(float4 x, __local float4 *iptr);
+float8 __ovld fract(float8 x, __local float8 *iptr);
+float16 __ovld fract(float16 x, __local float16 *iptr);
+float __ovld fract(float x, __private float *iptr);
+float2 __ovld fract(float2 x, __private float2 *iptr);
+float3 __ovld fract(float3 x, __private float3 *iptr);
+float4 __ovld fract(float4 x, __private float4 *iptr);
+float8 __ovld fract(float8 x, __private float8 *iptr);
+float16 __ovld fract(float16 x, __private float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld fract(double x, __global double *iptr);
+double2 __ovld fract(double2 x, __global double2 *iptr);
+double3 __ovld fract(double3 x, __global double3 *iptr);
+double4 __ovld fract(double4 x, __global double4 *iptr);
+double8 __ovld fract(double8 x, __global double8 *iptr);
+double16 __ovld fract(double16 x, __global double16 *iptr);
+double __ovld fract(double x, __local double *iptr);
+double2 __ovld fract(double2 x, __local double2 *iptr);
+double3 __ovld fract(double3 x, __local double3 *iptr);
+double4 __ovld fract(double4 x, __local double4 *iptr);
+double8 __ovld fract(double8 x, __local double8 *iptr);
+double16 __ovld fract(double16 x, __local double16 *iptr);
+double __ovld fract(double x, __private double *iptr);
+double2 __ovld fract(double2 x, __private double2 *iptr);
+double3 __ovld fract(double3 x, __private double3 *iptr);
+double4 __ovld fract(double4 x, __private double4 *iptr);
+double8 __ovld fract(double8 x, __private double8 *iptr);
+double16 __ovld fract(double16 x, __private double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld fract(half x, __global half *iptr);
+half2 __ovld fract(half2 x, __global half2 *iptr);
+half3 __ovld fract(half3 x, __global half3 *iptr);
+half4 __ovld fract(half4 x, __global half4 *iptr);
+half8 __ovld fract(half8 x, __global half8 *iptr);
+half16 __ovld fract(half16 x, __global half16 *iptr);
+half __ovld fract(half x, __local half *iptr);
+half2 __ovld fract(half2 x, __local half2 *iptr);
+half3 __ovld fract(half3 x, __local half3 *iptr);
+half4 __ovld fract(half4 x, __local half4 *iptr);
+half8 __ovld fract(half8 x, __local half8 *iptr);
+half16 __ovld fract(half16 x, __local half16 *iptr);
+half __ovld fract(half x, __private half *iptr);
+half2 __ovld fract(half2 x, __private half2 *iptr);
+half3 __ovld fract(half3 x, __private half3 *iptr);
+half4 __ovld fract(half4 x, __private half4 *iptr);
+half8 __ovld fract(half8 x, __private half8 *iptr);
+half16 __ovld fract(half16 x, __private half16 *iptr);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Extract mantissa and exponent from x. For each
+ * component the mantissa returned is a float with
+ * magnitude in the interval [1/2, 1) or 0. Each
+ * component of x equals mantissa returned * 2^exp.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld frexp(float x, int *exp);
+float2 __ovld frexp(float2 x, int2 *exp);
+float3 __ovld frexp(float3 x, int3 *exp);
+float4 __ovld frexp(float4 x, int4 *exp);
+float8 __ovld frexp(float8 x, int8 *exp);
+float16 __ovld frexp(float16 x, int16 *exp);
+#ifdef cl_khr_fp64
+double __ovld frexp(double x, int *exp);
+double2 __ovld frexp(double2 x, int2 *exp);
+double3 __ovld frexp(double3 x, int3 *exp);
+double4 __ovld frexp(double4 x, int4 *exp);
+double8 __ovld frexp(double8 x, int8 *exp);
+double16 __ovld frexp(double16 x, int16 *exp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld frexp(half x, int *exp);
+half2 __ovld frexp(half2 x, int2 *exp);
+half3 __ovld frexp(half3 x, int3 *exp);
+half4 __ovld frexp(half4 x, int4 *exp);
+half8 __ovld frexp(half8 x, int8 *exp);
+half16 __ovld frexp(half16 x, int16 *exp);
+#endif //cl_khr_fp16
+#else
+float __ovld frexp(float x, __global int *exp);
+float2 __ovld frexp(float2 x, __global int2 *exp);
+float3 __ovld frexp(float3 x, __global int3 *exp);
+float4 __ovld frexp(float4 x, __global int4 *exp);
+float8 __ovld frexp(float8 x, __global int8 *exp);
+float16 __ovld frexp(float16 x, __global int16 *exp);
+float __ovld frexp(float x, __local int *exp);
+float2 __ovld frexp(float2 x, __local int2 *exp);
+float3 __ovld frexp(float3 x, __local int3 *exp);
+float4 __ovld frexp(float4 x, __local int4 *exp);
+float8 __ovld frexp(float8 x, __local int8 *exp);
+float16 __ovld frexp(float16 x, __local int16 *exp);
+float __ovld frexp(float x, __private int *exp);
+float2 __ovld frexp(float2 x, __private int2 *exp);
+float3 __ovld frexp(float3 x, __private int3 *exp);
+float4 __ovld frexp(float4 x, __private int4 *exp);
+float8 __ovld frexp(float8 x, __private int8 *exp);
+float16 __ovld frexp(float16 x, __private int16 *exp);
+#ifdef cl_khr_fp64
+double __ovld frexp(double x, __global int *exp);
+double2 __ovld frexp(double2 x, __global int2 *exp);
+double3 __ovld frexp(double3 x, __global int3 *exp);
+double4 __ovld frexp(double4 x, __global int4 *exp);
+double8 __ovld frexp(double8 x, __global int8 *exp);
+double16 __ovld frexp(double16 x, __global int16 *exp);
+double __ovld frexp(double x, __local int *exp);
+double2 __ovld frexp(double2 x, __local int2 *exp);
+double3 __ovld frexp(double3 x, __local int3 *exp);
+double4 __ovld frexp(double4 x, __local int4 *exp);
+double8 __ovld frexp(double8 x, __local int8 *exp);
+double16 __ovld frexp(double16 x, __local int16 *exp);
+double __ovld frexp(double x, __private int *exp);
+double2 __ovld frexp(double2 x, __private int2 *exp);
+double3 __ovld frexp(double3 x, __private int3 *exp);
+double4 __ovld frexp(double4 x, __private int4 *exp);
+double8 __ovld frexp(double8 x, __private int8 *exp);
+double16 __ovld frexp(double16 x, __private int16 *exp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld frexp(half x, __global int *exp);
+half2 __ovld frexp(half2 x, __global int2 *exp);
+half3 __ovld frexp(half3 x, __global int3 *exp);
+half4 __ovld frexp(half4 x, __global int4 *exp);
+half8 __ovld frexp(half8 x, __global int8 *exp);
+half16 __ovld frexp(half16 x, __global int16 *exp);
+half __ovld frexp(half x, __local int *exp);
+half2 __ovld frexp(half2 x, __local int2 *exp);
+half3 __ovld frexp(half3 x, __local int3 *exp);
+half4 __ovld frexp(half4 x, __local int4 *exp);
+half8 __ovld frexp(half8 x, __local int8 *exp);
+half16 __ovld frexp(half16 x, __local int16 *exp);
+half __ovld frexp(half x, __private int *exp);
+half2 __ovld frexp(half2 x, __private int2 *exp);
+half3 __ovld frexp(half3 x, __private int3 *exp);
+half4 __ovld frexp(half4 x, __private int4 *exp);
+half8 __ovld frexp(half8 x, __private int8 *exp);
+half16 __ovld frexp(half16 x, __private int16 *exp);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Compute the value of the square root of x^2 + y^2
+ * without undue overflow or underflow.
+ */
+float __ovld __cnfn hypot(float x, float y);
+float2 __ovld __cnfn hypot(float2 x, float2 y);
+float3 __ovld __cnfn hypot(float3 x, float3 y);
+float4 __ovld __cnfn hypot(float4 x, float4 y);
+float8 __ovld __cnfn hypot(float8 x, float8 y);
+float16 __ovld __cnfn hypot(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn hypot(double x, double y);
+double2 __ovld __cnfn hypot(double2 x, double2 y);
+double3 __ovld __cnfn hypot(double3 x, double3 y);
+double4 __ovld __cnfn hypot(double4 x, double4 y);
+double8 __ovld __cnfn hypot(double8 x, double8 y);
+double16 __ovld __cnfn hypot(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn hypot(half x, half y);
+half2 __ovld __cnfn hypot(half2 x, half2 y);
+half3 __ovld __cnfn hypot(half3 x, half3 y);
+half4 __ovld __cnfn hypot(half4 x, half4 y);
+half8 __ovld __cnfn hypot(half8 x, half8 y);
+half16 __ovld __cnfn hypot(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Return the exponent as an integer value.
+ */
+int __ovld __cnfn ilogb(float x);
+int2 __ovld __cnfn ilogb(float2 x);
+int3 __ovld __cnfn ilogb(float3 x);
+int4 __ovld __cnfn ilogb(float4 x);
+int8 __ovld __cnfn ilogb(float8 x);
+int16 __ovld __cnfn ilogb(float16 x);
+#ifdef cl_khr_fp64
+int __ovld __cnfn ilogb(double x);
+int2 __ovld __cnfn ilogb(double2 x);
+int3 __ovld __cnfn ilogb(double3 x);
+int4 __ovld __cnfn ilogb(double4 x);
+int8 __ovld __cnfn ilogb(double8 x);
+int16 __ovld __cnfn ilogb(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn ilogb(half x);
+int2 __ovld __cnfn ilogb(half2 x);
+int3 __ovld __cnfn ilogb(half3 x);
+int4 __ovld __cnfn ilogb(half4 x);
+int8 __ovld __cnfn ilogb(half8 x);
+int16 __ovld __cnfn ilogb(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Multiply x by 2 to the power n.
+ */
+float __ovld __cnfn ldexp(float x, int n);
+float2 __ovld __cnfn ldexp(float2 x, int2 n);
+float3 __ovld __cnfn ldexp(float3 x, int3 n);
+float4 __ovld __cnfn ldexp(float4 x, int4 n);
+float8 __ovld __cnfn ldexp(float8 x, int8 n);
+float16 __ovld __cnfn ldexp(float16 x, int16 n);
+float2 __ovld __cnfn ldexp(float2 x, int n);
+float3 __ovld __cnfn ldexp(float3 x, int n);
+float4 __ovld __cnfn ldexp(float4 x, int n);
+float8 __ovld __cnfn ldexp(float8 x, int n);
+float16 __ovld __cnfn ldexp(float16 x, int n);
+#ifdef cl_khr_fp64
+double __ovld __cnfn ldexp(double x, int n);
+double2 __ovld __cnfn ldexp(double2 x, int2 n);
+double3 __ovld __cnfn ldexp(double3 x, int3 n);
+double4 __ovld __cnfn ldexp(double4 x, int4 n);
+double8 __ovld __cnfn ldexp(double8 x, int8 n);
+double16 __ovld __cnfn ldexp(double16 x, int16 n);
+double2 __ovld __cnfn ldexp(double2 x, int n);
+double3 __ovld __cnfn ldexp(double3 x, int n);
+double4 __ovld __cnfn ldexp(double4 x, int n);
+double8 __ovld __cnfn ldexp(double8 x, int n);
+double16 __ovld __cnfn ldexp(double16 x, int n);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn ldexp(half x, int n);
+half2 __ovld __cnfn ldexp(half2 x, int2 n);
+half3 __ovld __cnfn ldexp(half3 x, int3 n);
+half4 __ovld __cnfn ldexp(half4 x, int4 n);
+half8 __ovld __cnfn ldexp(half8 x, int8 n);
+half16 __ovld __cnfn ldexp(half16 x, int16 n);
+half2 __ovld __cnfn ldexp(half2 x, int n);
+half3 __ovld __cnfn ldexp(half3 x, int n);
+half4 __ovld __cnfn ldexp(half4 x, int n);
+half8 __ovld __cnfn ldexp(half8 x, int n);
+half16 __ovld __cnfn ldexp(half16 x, int n);
+#endif //cl_khr_fp16
+
+/**
+ * Log gamma function. Returns the natural
+ * logarithm of the absolute value of the gamma
+ * function. The sign of the gamma function is
+ * returned in the signp argument of lgamma_r.
+ */
+float __ovld __cnfn lgamma(float x);
+float2 __ovld __cnfn lgamma(float2 x);
+float3 __ovld __cnfn lgamma(float3 x);
+float4 __ovld __cnfn lgamma(float4 x);
+float8 __ovld __cnfn lgamma(float8 x);
+float16 __ovld __cnfn lgamma(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn lgamma(double x);
+double2 __ovld __cnfn lgamma(double2 x);
+double3 __ovld __cnfn lgamma(double3 x);
+double4 __ovld __cnfn lgamma(double4 x);
+double8 __ovld __cnfn lgamma(double8 x);
+double16 __ovld __cnfn lgamma(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn lgamma(half x);
+half2 __ovld __cnfn lgamma(half2 x);
+half3 __ovld __cnfn lgamma(half3 x);
+half4 __ovld __cnfn lgamma(half4 x);
+half8 __ovld __cnfn lgamma(half8 x);
+half16 __ovld __cnfn lgamma(half16 x);
+#endif //cl_khr_fp16
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld lgamma_r(float x, int *signp);
+float2 __ovld lgamma_r(float2 x, int2 *signp);
+float3 __ovld lgamma_r(float3 x, int3 *signp);
+float4 __ovld lgamma_r(float4 x, int4 *signp);
+float8 __ovld lgamma_r(float8 x, int8 *signp);
+float16 __ovld lgamma_r(float16 x, int16 *signp);
+#ifdef cl_khr_fp64
+double __ovld lgamma_r(double x, int *signp);
+double2 __ovld lgamma_r(double2 x, int2 *signp);
+double3 __ovld lgamma_r(double3 x, int3 *signp);
+double4 __ovld lgamma_r(double4 x, int4 *signp);
+double8 __ovld lgamma_r(double8 x, int8 *signp);
+double16 __ovld lgamma_r(double16 x, int16 *signp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld lgamma_r(half x, int *signp);
+half2 __ovld lgamma_r(half2 x, int2 *signp);
+half3 __ovld lgamma_r(half3 x, int3 *signp);
+half4 __ovld lgamma_r(half4 x, int4 *signp);
+half8 __ovld lgamma_r(half8 x, int8 *signp);
+half16 __ovld lgamma_r(half16 x, int16 *signp);
+#endif //cl_khr_fp16
+#else
+float __ovld lgamma_r(float x, __global int *signp);
+float2 __ovld lgamma_r(float2 x, __global int2 *signp);
+float3 __ovld lgamma_r(float3 x, __global int3 *signp);
+float4 __ovld lgamma_r(float4 x, __global int4 *signp);
+float8 __ovld lgamma_r(float8 x, __global int8 *signp);
+float16 __ovld lgamma_r(float16 x, __global int16 *signp);
+float __ovld lgamma_r(float x, __local int *signp);
+float2 __ovld lgamma_r(float2 x, __local int2 *signp);
+float3 __ovld lgamma_r(float3 x, __local int3 *signp);
+float4 __ovld lgamma_r(float4 x, __local int4 *signp);
+float8 __ovld lgamma_r(float8 x, __local int8 *signp);
+float16 __ovld lgamma_r(float16 x, __local int16 *signp);
+float __ovld lgamma_r(float x, __private int *signp);
+float2 __ovld lgamma_r(float2 x, __private int2 *signp);
+float3 __ovld lgamma_r(float3 x, __private int3 *signp);
+float4 __ovld lgamma_r(float4 x, __private int4 *signp);
+float8 __ovld lgamma_r(float8 x, __private int8 *signp);
+float16 __ovld lgamma_r(float16 x, __private int16 *signp);
+#ifdef cl_khr_fp64
+double __ovld lgamma_r(double x, __global int *signp);
+double2 __ovld lgamma_r(double2 x, __global int2 *signp);
+double3 __ovld lgamma_r(double3 x, __global int3 *signp);
+double4 __ovld lgamma_r(double4 x, __global int4 *signp);
+double8 __ovld lgamma_r(double8 x, __global int8 *signp);
+double16 __ovld lgamma_r(double16 x, __global int16 *signp);
+double __ovld lgamma_r(double x, __local int *signp);
+double2 __ovld lgamma_r(double2 x, __local int2 *signp);
+double3 __ovld lgamma_r(double3 x, __local int3 *signp);
+double4 __ovld lgamma_r(double4 x, __local int4 *signp);
+double8 __ovld lgamma_r(double8 x, __local int8 *signp);
+double16 __ovld lgamma_r(double16 x, __local int16 *signp);
+double __ovld lgamma_r(double x, __private int *signp);
+double2 __ovld lgamma_r(double2 x, __private int2 *signp);
+double3 __ovld lgamma_r(double3 x, __private int3 *signp);
+double4 __ovld lgamma_r(double4 x, __private int4 *signp);
+double8 __ovld lgamma_r(double8 x, __private int8 *signp);
+double16 __ovld lgamma_r(double16 x, __private int16 *signp);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld lgamma_r(half x, __global int *signp);
+half2 __ovld lgamma_r(half2 x, __global int2 *signp);
+half3 __ovld lgamma_r(half3 x, __global int3 *signp);
+half4 __ovld lgamma_r(half4 x, __global int4 *signp);
+half8 __ovld lgamma_r(half8 x, __global int8 *signp);
+half16 __ovld lgamma_r(half16 x, __global int16 *signp);
+half __ovld lgamma_r(half x, __local int *signp);
+half2 __ovld lgamma_r(half2 x, __local int2 *signp);
+half3 __ovld lgamma_r(half3 x, __local int3 *signp);
+half4 __ovld lgamma_r(half4 x, __local int4 *signp);
+half8 __ovld lgamma_r(half8 x, __local int8 *signp);
+half16 __ovld lgamma_r(half16 x, __local int16 *signp);
+half __ovld lgamma_r(half x, __private int *signp);
+half2 __ovld lgamma_r(half2 x, __private int2 *signp);
+half3 __ovld lgamma_r(half3 x, __private int3 *signp);
+half4 __ovld lgamma_r(half4 x, __private int4 *signp);
+half8 __ovld lgamma_r(half8 x, __private int8 *signp);
+half16 __ovld lgamma_r(half16 x, __private int16 *signp);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Compute natural logarithm.
+ */
+float __ovld __cnfn log(float);
+float2 __ovld __cnfn log(float2);
+float3 __ovld __cnfn log(float3);
+float4 __ovld __cnfn log(float4);
+float8 __ovld __cnfn log(float8);
+float16 __ovld __cnfn log(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log(double);
+double2 __ovld __cnfn log(double2);
+double3 __ovld __cnfn log(double3);
+double4 __ovld __cnfn log(double4);
+double8 __ovld __cnfn log(double8);
+double16 __ovld __cnfn log(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log(half);
+half2 __ovld __cnfn log(half2);
+half3 __ovld __cnfn log(half3);
+half4 __ovld __cnfn log(half4);
+half8 __ovld __cnfn log(half8);
+half16 __ovld __cnfn log(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute a base 2 logarithm.
+ */
+float __ovld __cnfn log2(float);
+float2 __ovld __cnfn log2(float2);
+float3 __ovld __cnfn log2(float3);
+float4 __ovld __cnfn log2(float4);
+float8 __ovld __cnfn log2(float8);
+float16 __ovld __cnfn log2(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log2(double);
+double2 __ovld __cnfn log2(double2);
+double3 __ovld __cnfn log2(double3);
+double4 __ovld __cnfn log2(double4);
+double8 __ovld __cnfn log2(double8);
+double16 __ovld __cnfn log2(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log2(half);
+half2 __ovld __cnfn log2(half2);
+half3 __ovld __cnfn log2(half3);
+half4 __ovld __cnfn log2(half4);
+half8 __ovld __cnfn log2(half8);
+half16 __ovld __cnfn log2(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute a base 10 logarithm.
+ */
+float __ovld __cnfn log10(float);
+float2 __ovld __cnfn log10(float2);
+float3 __ovld __cnfn log10(float3);
+float4 __ovld __cnfn log10(float4);
+float8 __ovld __cnfn log10(float8);
+float16 __ovld __cnfn log10(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log10(double);
+double2 __ovld __cnfn log10(double2);
+double3 __ovld __cnfn log10(double3);
+double4 __ovld __cnfn log10(double4);
+double8 __ovld __cnfn log10(double8);
+double16 __ovld __cnfn log10(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log10(half);
+half2 __ovld __cnfn log10(half2);
+half3 __ovld __cnfn log10(half3);
+half4 __ovld __cnfn log10(half4);
+half8 __ovld __cnfn log10(half8);
+half16 __ovld __cnfn log10(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute a base e logarithm of (1.0 + x).
+ */
+float __ovld __cnfn log1p(float x);
+float2 __ovld __cnfn log1p(float2 x);
+float3 __ovld __cnfn log1p(float3 x);
+float4 __ovld __cnfn log1p(float4 x);
+float8 __ovld __cnfn log1p(float8 x);
+float16 __ovld __cnfn log1p(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn log1p(double x);
+double2 __ovld __cnfn log1p(double2 x);
+double3 __ovld __cnfn log1p(double3 x);
+double4 __ovld __cnfn log1p(double4 x);
+double8 __ovld __cnfn log1p(double8 x);
+double16 __ovld __cnfn log1p(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn log1p(half x);
+half2 __ovld __cnfn log1p(half2 x);
+half3 __ovld __cnfn log1p(half3 x);
+half4 __ovld __cnfn log1p(half4 x);
+half8 __ovld __cnfn log1p(half8 x);
+half16 __ovld __cnfn log1p(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the exponent of x, which is the integral
+ * part of logr | x |.
+ */
+float __ovld __cnfn logb(float x);
+float2 __ovld __cnfn logb(float2 x);
+float3 __ovld __cnfn logb(float3 x);
+float4 __ovld __cnfn logb(float4 x);
+float8 __ovld __cnfn logb(float8 x);
+float16 __ovld __cnfn logb(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn logb(double x);
+double2 __ovld __cnfn logb(double2 x);
+double3 __ovld __cnfn logb(double3 x);
+double4 __ovld __cnfn logb(double4 x);
+double8 __ovld __cnfn logb(double8 x);
+double16 __ovld __cnfn logb(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn logb(half x);
+half2 __ovld __cnfn logb(half2 x);
+half3 __ovld __cnfn logb(half3 x);
+half4 __ovld __cnfn logb(half4 x);
+half8 __ovld __cnfn logb(half8 x);
+half16 __ovld __cnfn logb(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * mad approximates a * b + c. Whether or how the
+ * product of a * b is rounded and how supernormal or
+ * subnormal intermediate products are handled is not
+ * defined. mad is intended to be used where speed is
+ * preferred over accuracy.
+ */
+float __ovld __cnfn mad(float a, float b, float c);
+float2 __ovld __cnfn mad(float2 a, float2 b, float2 c);
+float3 __ovld __cnfn mad(float3 a, float3 b, float3 c);
+float4 __ovld __cnfn mad(float4 a, float4 b, float4 c);
+float8 __ovld __cnfn mad(float8 a, float8 b, float8 c);
+float16 __ovld __cnfn mad(float16 a, float16 b, float16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn mad(double a, double b, double c);
+double2 __ovld __cnfn mad(double2 a, double2 b, double2 c);
+double3 __ovld __cnfn mad(double3 a, double3 b, double3 c);
+double4 __ovld __cnfn mad(double4 a, double4 b, double4 c);
+double8 __ovld __cnfn mad(double8 a, double8 b, double8 c);
+double16 __ovld __cnfn mad(double16 a, double16 b, double16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn mad(half a, half b, half c);
+half2 __ovld __cnfn mad(half2 a, half2 b, half2 c);
+half3 __ovld __cnfn mad(half3 a, half3 b, half3 c);
+half4 __ovld __cnfn mad(half4 a, half4 b, half4 c);
+half8 __ovld __cnfn mad(half8 a, half8 b, half8 c);
+half16 __ovld __cnfn mad(half16 a, half16 b, half16 c);
+#endif //cl_khr_fp16
+
+/**
+ * Returns x if | x | > | y |, y if | y | > | x |, otherwise
+ * fmax(x, y).
+ */
+float __ovld __cnfn maxmag(float x, float y);
+float2 __ovld __cnfn maxmag(float2 x, float2 y);
+float3 __ovld __cnfn maxmag(float3 x, float3 y);
+float4 __ovld __cnfn maxmag(float4 x, float4 y);
+float8 __ovld __cnfn maxmag(float8 x, float8 y);
+float16 __ovld __cnfn maxmag(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn maxmag(double x, double y);
+double2 __ovld __cnfn maxmag(double2 x, double2 y);
+double3 __ovld __cnfn maxmag(double3 x, double3 y);
+double4 __ovld __cnfn maxmag(double4 x, double4 y);
+double8 __ovld __cnfn maxmag(double8 x, double8 y);
+double16 __ovld __cnfn maxmag(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn maxmag(half x, half y);
+half2 __ovld __cnfn maxmag(half2 x, half2 y);
+half3 __ovld __cnfn maxmag(half3 x, half3 y);
+half4 __ovld __cnfn maxmag(half4 x, half4 y);
+half8 __ovld __cnfn maxmag(half8 x, half8 y);
+half16 __ovld __cnfn maxmag(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns x if | x | < | y |, y if | y | < | x |, otherwise
+ * fmin(x, y).
+ */
+float __ovld __cnfn minmag(float x, float y);
+float2 __ovld __cnfn minmag(float2 x, float2 y);
+float3 __ovld __cnfn minmag(float3 x, float3 y);
+float4 __ovld __cnfn minmag(float4 x, float4 y);
+float8 __ovld __cnfn minmag(float8 x, float8 y);
+float16 __ovld __cnfn minmag(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn minmag(double x, double y);
+double2 __ovld __cnfn minmag(double2 x, double2 y);
+double3 __ovld __cnfn minmag(double3 x, double3 y);
+double4 __ovld __cnfn minmag(double4 x, double4 y);
+double8 __ovld __cnfn minmag(double8 x, double8 y);
+double16 __ovld __cnfn minmag(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn minmag(half x, half y);
+half2 __ovld __cnfn minmag(half2 x, half2 y);
+half3 __ovld __cnfn minmag(half3 x, half3 y);
+half4 __ovld __cnfn minmag(half4 x, half4 y);
+half8 __ovld __cnfn minmag(half8 x, half8 y);
+half16 __ovld __cnfn minmag(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Decompose a floating-point number. The modf
+ * function breaks the argument x into integral and
+ * fractional parts, each of which has the same sign as
+ * the argument. It stores the integral part in the object
+ * pointed to by iptr.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld modf(float x, float *iptr);
+float2 __ovld modf(float2 x, float2 *iptr);
+float3 __ovld modf(float3 x, float3 *iptr);
+float4 __ovld modf(float4 x, float4 *iptr);
+float8 __ovld modf(float8 x, float8 *iptr);
+float16 __ovld modf(float16 x, float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld modf(double x, double *iptr);
+double2 __ovld modf(double2 x, double2 *iptr);
+double3 __ovld modf(double3 x, double3 *iptr);
+double4 __ovld modf(double4 x, double4 *iptr);
+double8 __ovld modf(double8 x, double8 *iptr);
+double16 __ovld modf(double16 x, double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld modf(half x, half *iptr);
+half2 __ovld modf(half2 x, half2 *iptr);
+half3 __ovld modf(half3 x, half3 *iptr);
+half4 __ovld modf(half4 x, half4 *iptr);
+half8 __ovld modf(half8 x, half8 *iptr);
+half16 __ovld modf(half16 x, half16 *iptr);
+#endif //cl_khr_fp16
+#else
+float __ovld modf(float x, __global float *iptr);
+float2 __ovld modf(float2 x, __global float2 *iptr);
+float3 __ovld modf(float3 x, __global float3 *iptr);
+float4 __ovld modf(float4 x, __global float4 *iptr);
+float8 __ovld modf(float8 x, __global float8 *iptr);
+float16 __ovld modf(float16 x, __global float16 *iptr);
+float __ovld modf(float x, __local float *iptr);
+float2 __ovld modf(float2 x, __local float2 *iptr);
+float3 __ovld modf(float3 x, __local float3 *iptr);
+float4 __ovld modf(float4 x, __local float4 *iptr);
+float8 __ovld modf(float8 x, __local float8 *iptr);
+float16 __ovld modf(float16 x, __local float16 *iptr);
+float __ovld modf(float x, __private float *iptr);
+float2 __ovld modf(float2 x, __private float2 *iptr);
+float3 __ovld modf(float3 x, __private float3 *iptr);
+float4 __ovld modf(float4 x, __private float4 *iptr);
+float8 __ovld modf(float8 x, __private float8 *iptr);
+float16 __ovld modf(float16 x, __private float16 *iptr);
+#ifdef cl_khr_fp64
+double __ovld modf(double x, __global double *iptr);
+double2 __ovld modf(double2 x, __global double2 *iptr);
+double3 __ovld modf(double3 x, __global double3 *iptr);
+double4 __ovld modf(double4 x, __global double4 *iptr);
+double8 __ovld modf(double8 x, __global double8 *iptr);
+double16 __ovld modf(double16 x, __global double16 *iptr);
+double __ovld modf(double x, __local double *iptr);
+double2 __ovld modf(double2 x, __local double2 *iptr);
+double3 __ovld modf(double3 x, __local double3 *iptr);
+double4 __ovld modf(double4 x, __local double4 *iptr);
+double8 __ovld modf(double8 x, __local double8 *iptr);
+double16 __ovld modf(double16 x, __local double16 *iptr);
+double __ovld modf(double x, __private double *iptr);
+double2 __ovld modf(double2 x, __private double2 *iptr);
+double3 __ovld modf(double3 x, __private double3 *iptr);
+double4 __ovld modf(double4 x, __private double4 *iptr);
+double8 __ovld modf(double8 x, __private double8 *iptr);
+double16 __ovld modf(double16 x, __private double16 *iptr);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld modf(half x, __global half *iptr);
+half2 __ovld modf(half2 x, __global half2 *iptr);
+half3 __ovld modf(half3 x, __global half3 *iptr);
+half4 __ovld modf(half4 x, __global half4 *iptr);
+half8 __ovld modf(half8 x, __global half8 *iptr);
+half16 __ovld modf(half16 x, __global half16 *iptr);
+half __ovld modf(half x, __local half *iptr);
+half2 __ovld modf(half2 x, __local half2 *iptr);
+half3 __ovld modf(half3 x, __local half3 *iptr);
+half4 __ovld modf(half4 x, __local half4 *iptr);
+half8 __ovld modf(half8 x, __local half8 *iptr);
+half16 __ovld modf(half16 x, __local half16 *iptr);
+half __ovld modf(half x, __private half *iptr);
+half2 __ovld modf(half2 x, __private half2 *iptr);
+half3 __ovld modf(half3 x, __private half3 *iptr);
+half4 __ovld modf(half4 x, __private half4 *iptr);
+half8 __ovld modf(half8 x, __private half8 *iptr);
+half16 __ovld modf(half16 x, __private half16 *iptr);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Returns a quiet NaN. The nancode may be placed
+ * in the significand of the resulting NaN.
+ */
+float __ovld __cnfn nan(uint nancode);
+float2 __ovld __cnfn nan(uint2 nancode);
+float3 __ovld __cnfn nan(uint3 nancode);
+float4 __ovld __cnfn nan(uint4 nancode);
+float8 __ovld __cnfn nan(uint8 nancode);
+float16 __ovld __cnfn nan(uint16 nancode);
+#ifdef cl_khr_fp64
+double __ovld __cnfn nan(ulong nancode);
+double2 __ovld __cnfn nan(ulong2 nancode);
+double3 __ovld __cnfn nan(ulong3 nancode);
+double4 __ovld __cnfn nan(ulong4 nancode);
+double8 __ovld __cnfn nan(ulong8 nancode);
+double16 __ovld __cnfn nan(ulong16 nancode);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn nan(ushort nancode);
+half2 __ovld __cnfn nan(ushort2 nancode);
+half3 __ovld __cnfn nan(ushort3 nancode);
+half4 __ovld __cnfn nan(ushort4 nancode);
+half8 __ovld __cnfn nan(ushort8 nancode);
+half16 __ovld __cnfn nan(ushort16 nancode);
+#endif //cl_khr_fp16
+
+/**
+ * Computes the next representable single-precision
+ * floating-point value following x in the direction of
+ * y. Thus, if y is less than x, nextafter() returns the
+ * largest representable floating-point number less
+ * than x.
+ */
+float __ovld __cnfn nextafter(float x, float y);
+float2 __ovld __cnfn nextafter(float2 x, float2 y);
+float3 __ovld __cnfn nextafter(float3 x, float3 y);
+float4 __ovld __cnfn nextafter(float4 x, float4 y);
+float8 __ovld __cnfn nextafter(float8 x, float8 y);
+float16 __ovld __cnfn nextafter(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn nextafter(double x, double y);
+double2 __ovld __cnfn nextafter(double2 x, double2 y);
+double3 __ovld __cnfn nextafter(double3 x, double3 y);
+double4 __ovld __cnfn nextafter(double4 x, double4 y);
+double8 __ovld __cnfn nextafter(double8 x, double8 y);
+double16 __ovld __cnfn nextafter(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn nextafter(half x, half y);
+half2 __ovld __cnfn nextafter(half2 x, half2 y);
+half3 __ovld __cnfn nextafter(half3 x, half3 y);
+half4 __ovld __cnfn nextafter(half4 x, half4 y);
+half8 __ovld __cnfn nextafter(half8 x, half8 y);
+half16 __ovld __cnfn nextafter(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power y.
+ */
+float __ovld __cnfn pow(float x, float y);
+float2 __ovld __cnfn pow(float2 x, float2 y);
+float3 __ovld __cnfn pow(float3 x, float3 y);
+float4 __ovld __cnfn pow(float4 x, float4 y);
+float8 __ovld __cnfn pow(float8 x, float8 y);
+float16 __ovld __cnfn pow(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn pow(double x, double y);
+double2 __ovld __cnfn pow(double2 x, double2 y);
+double3 __ovld __cnfn pow(double3 x, double3 y);
+double4 __ovld __cnfn pow(double4 x, double4 y);
+double8 __ovld __cnfn pow(double8 x, double8 y);
+double16 __ovld __cnfn pow(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn pow(half x, half y);
+half2 __ovld __cnfn pow(half2 x, half2 y);
+half3 __ovld __cnfn pow(half3 x, half3 y);
+half4 __ovld __cnfn pow(half4 x, half4 y);
+half8 __ovld __cnfn pow(half8 x, half8 y);
+half16 __ovld __cnfn pow(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power y, where y is an integer.
+ */
+float __ovld __cnfn pown(float x, int y);
+float2 __ovld __cnfn pown(float2 x, int2 y);
+float3 __ovld __cnfn pown(float3 x, int3 y);
+float4 __ovld __cnfn pown(float4 x, int4 y);
+float8 __ovld __cnfn pown(float8 x, int8 y);
+float16 __ovld __cnfn pown(float16 x, int16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn pown(double x, int y);
+double2 __ovld __cnfn pown(double2 x, int2 y);
+double3 __ovld __cnfn pown(double3 x, int3 y);
+double4 __ovld __cnfn pown(double4 x, int4 y);
+double8 __ovld __cnfn pown(double8 x, int8 y);
+double16 __ovld __cnfn pown(double16 x, int16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn pown(half x, int y);
+half2 __ovld __cnfn pown(half2 x, int2 y);
+half3 __ovld __cnfn pown(half3 x, int3 y);
+half4 __ovld __cnfn pown(half4 x, int4 y);
+half8 __ovld __cnfn pown(half8 x, int8 y);
+half16 __ovld __cnfn pown(half16 x, int16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power y, where x is >= 0.
+ */
+float __ovld __cnfn powr(float x, float y);
+float2 __ovld __cnfn powr(float2 x, float2 y);
+float3 __ovld __cnfn powr(float3 x, float3 y);
+float4 __ovld __cnfn powr(float4 x, float4 y);
+float8 __ovld __cnfn powr(float8 x, float8 y);
+float16 __ovld __cnfn powr(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn powr(double x, double y);
+double2 __ovld __cnfn powr(double2 x, double2 y);
+double3 __ovld __cnfn powr(double3 x, double3 y);
+double4 __ovld __cnfn powr(double4 x, double4 y);
+double8 __ovld __cnfn powr(double8 x, double8 y);
+double16 __ovld __cnfn powr(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn powr(half x, half y);
+half2 __ovld __cnfn powr(half2 x, half2 y);
+half3 __ovld __cnfn powr(half3 x, half3 y);
+half4 __ovld __cnfn powr(half4 x, half4 y);
+half8 __ovld __cnfn powr(half8 x, half8 y);
+half16 __ovld __cnfn powr(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the value r such that r = x - n*y, where n
+ * is the integer nearest the exact value of x/y. If there
+ * are two integers closest to x/y, n shall be the even
+ * one. If r is zero, it is given the same sign as x.
+ */
+float __ovld __cnfn remainder(float x, float y);
+float2 __ovld __cnfn remainder(float2 x, float2 y);
+float3 __ovld __cnfn remainder(float3 x, float3 y);
+float4 __ovld __cnfn remainder(float4 x, float4 y);
+float8 __ovld __cnfn remainder(float8 x, float8 y);
+float16 __ovld __cnfn remainder(float16 x, float16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn remainder(double x, double y);
+double2 __ovld __cnfn remainder(double2 x, double2 y);
+double3 __ovld __cnfn remainder(double3 x, double3 y);
+double4 __ovld __cnfn remainder(double4 x, double4 y);
+double8 __ovld __cnfn remainder(double8 x, double8 y);
+double16 __ovld __cnfn remainder(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn remainder(half x, half y);
+half2 __ovld __cnfn remainder(half2 x, half2 y);
+half3 __ovld __cnfn remainder(half3 x, half3 y);
+half4 __ovld __cnfn remainder(half4 x, half4 y);
+half8 __ovld __cnfn remainder(half8 x, half8 y);
+half16 __ovld __cnfn remainder(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * The remquo function computes the value r such
+ * that r = x - n*y, where n is the integer nearest the
+ * exact value of x/y. If there are two integers closest
+ * to x/y, n shall be the even one. If r is zero, it is
+ * given the same sign as x. This is the same value
+ * that is returned by the remainder function.
+ * remquo also calculates the lower seven bits of the
+ * integral quotient x/y, and gives that value the same
+ * sign as x/y. It stores this signed value in the object
+ * pointed to by quo.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld remquo(float x, float y, int *quo);
+float2 __ovld remquo(float2 x, float2 y, int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, int16 *quo);
+#ifdef cl_khr_fp64
+double __ovld remquo(double x, double y, int *quo);
+double2 __ovld remquo(double2 x, double2 y, int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, int16 *quo);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld remquo(half x, half y, int *quo);
+half2 __ovld remquo(half2 x, half2 y, int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, int16 *quo);
+
+#endif //cl_khr_fp16
+#else
+float __ovld remquo(float x, float y, __global int *quo);
+float2 __ovld remquo(float2 x, float2 y, __global int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, __global int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, __global int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, __global int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, __global int16 *quo);
+float __ovld remquo(float x, float y, __local int *quo);
+float2 __ovld remquo(float2 x, float2 y, __local int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, __local int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, __local int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, __local int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, __local int16 *quo);
+float __ovld remquo(float x, float y, __private int *quo);
+float2 __ovld remquo(float2 x, float2 y, __private int2 *quo);
+float3 __ovld remquo(float3 x, float3 y, __private int3 *quo);
+float4 __ovld remquo(float4 x, float4 y, __private int4 *quo);
+float8 __ovld remquo(float8 x, float8 y, __private int8 *quo);
+float16 __ovld remquo(float16 x, float16 y, __private int16 *quo);
+#ifdef cl_khr_fp64
+double __ovld remquo(double x, double y, __global int *quo);
+double2 __ovld remquo(double2 x, double2 y, __global int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, __global int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, __global int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, __global int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, __global int16 *quo);
+double __ovld remquo(double x, double y, __local int *quo);
+double2 __ovld remquo(double2 x, double2 y, __local int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, __local int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, __local int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, __local int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, __local int16 *quo);
+double __ovld remquo(double x, double y, __private int *quo);
+double2 __ovld remquo(double2 x, double2 y, __private int2 *quo);
+double3 __ovld remquo(double3 x, double3 y, __private int3 *quo);
+double4 __ovld remquo(double4 x, double4 y, __private int4 *quo);
+double8 __ovld remquo(double8 x, double8 y, __private int8 *quo);
+double16 __ovld remquo(double16 x, double16 y, __private int16 *quo);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld remquo(half x, half y, __global int *quo);
+half2 __ovld remquo(half2 x, half2 y, __global int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, __global int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, __global int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, __global int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, __global int16 *quo);
+half __ovld remquo(half x, half y, __local int *quo);
+half2 __ovld remquo(half2 x, half2 y, __local int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, __local int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, __local int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, __local int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, __local int16 *quo);
+half __ovld remquo(half x, half y, __private int *quo);
+half2 __ovld remquo(half2 x, half2 y, __private int2 *quo);
+half3 __ovld remquo(half3 x, half3 y, __private int3 *quo);
+half4 __ovld remquo(half4 x, half4 y, __private int4 *quo);
+half8 __ovld remquo(half8 x, half8 y, __private int8 *quo);
+half16 __ovld remquo(half16 x, half16 y, __private int16 *quo);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+/**
+ * Round to integral value (using round to nearest
+ * even rounding mode) in floating-point format.
+ * Refer to section 7.1 for description of rounding
+ * modes.
+ */
+float __ovld __cnfn rint(float);
+float2 __ovld __cnfn rint(float2);
+float3 __ovld __cnfn rint(float3);
+float4 __ovld __cnfn rint(float4);
+float8 __ovld __cnfn rint(float8);
+float16 __ovld __cnfn rint(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn rint(double);
+double2 __ovld __cnfn rint(double2);
+double3 __ovld __cnfn rint(double3);
+double4 __ovld __cnfn rint(double4);
+double8 __ovld __cnfn rint(double8);
+double16 __ovld __cnfn rint(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn rint(half);
+half2 __ovld __cnfn rint(half2);
+half3 __ovld __cnfn rint(half3);
+half4 __ovld __cnfn rint(half4);
+half8 __ovld __cnfn rint(half8);
+half16 __ovld __cnfn rint(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute x to the power 1/y.
+ */
+float __ovld __cnfn rootn(float x, int y);
+float2 __ovld __cnfn rootn(float2 x, int2 y);
+float3 __ovld __cnfn rootn(float3 x, int3 y);
+float4 __ovld __cnfn rootn(float4 x, int4 y);
+float8 __ovld __cnfn rootn(float8 x, int8 y);
+float16 __ovld __cnfn rootn(float16 x, int16 y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn rootn(double x, int y);
+double2 __ovld __cnfn rootn(double2 x, int2 y);
+double3 __ovld __cnfn rootn(double3 x, int3 y);
+double4 __ovld __cnfn rootn(double4 x, int4 y);
+double8 __ovld __cnfn rootn(double8 x, int8 y);
+double16 __ovld __cnfn rootn(double16 x, int16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn rootn(half x, int y);
+half2 __ovld __cnfn rootn(half2 x, int2 y);
+half3 __ovld __cnfn rootn(half3 x, int3 y);
+half4 __ovld __cnfn rootn(half4 x, int4 y);
+half8 __ovld __cnfn rootn(half8 x, int8 y);
+half16 __ovld __cnfn rootn(half16 x, int16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Return the integral value nearest to x rounding
+ * halfway cases away from zero, regardless of the
+ * current rounding direction.
+ */
+float __ovld __cnfn round(float x);
+float2 __ovld __cnfn round(float2 x);
+float3 __ovld __cnfn round(float3 x);
+float4 __ovld __cnfn round(float4 x);
+float8 __ovld __cnfn round(float8 x);
+float16 __ovld __cnfn round(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn round(double x);
+double2 __ovld __cnfn round(double2 x);
+double3 __ovld __cnfn round(double3 x);
+double4 __ovld __cnfn round(double4 x);
+double8 __ovld __cnfn round(double8 x);
+double16 __ovld __cnfn round(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn round(half x);
+half2 __ovld __cnfn round(half2 x);
+half3 __ovld __cnfn round(half3 x);
+half4 __ovld __cnfn round(half4 x);
+half8 __ovld __cnfn round(half8 x);
+half16 __ovld __cnfn round(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute inverse square root.
+ */
+float __ovld __cnfn rsqrt(float);
+float2 __ovld __cnfn rsqrt(float2);
+float3 __ovld __cnfn rsqrt(float3);
+float4 __ovld __cnfn rsqrt(float4);
+float8 __ovld __cnfn rsqrt(float8);
+float16 __ovld __cnfn rsqrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn rsqrt(double);
+double2 __ovld __cnfn rsqrt(double2);
+double3 __ovld __cnfn rsqrt(double3);
+double4 __ovld __cnfn rsqrt(double4);
+double8 __ovld __cnfn rsqrt(double8);
+double16 __ovld __cnfn rsqrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn rsqrt(half);
+half2 __ovld __cnfn rsqrt(half2);
+half3 __ovld __cnfn rsqrt(half3);
+half4 __ovld __cnfn rsqrt(half4);
+half8 __ovld __cnfn rsqrt(half8);
+half16 __ovld __cnfn rsqrt(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute sine.
+ */
+float __ovld __cnfn sin(float);
+float2 __ovld __cnfn sin(float2);
+float3 __ovld __cnfn sin(float3);
+float4 __ovld __cnfn sin(float4);
+float8 __ovld __cnfn sin(float8);
+float16 __ovld __cnfn sin(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sin(double);
+double2 __ovld __cnfn sin(double2);
+double3 __ovld __cnfn sin(double3);
+double4 __ovld __cnfn sin(double4);
+double8 __ovld __cnfn sin(double8);
+double16 __ovld __cnfn sin(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sin(half);
+half2 __ovld __cnfn sin(half2);
+half3 __ovld __cnfn sin(half3);
+half4 __ovld __cnfn sin(half4);
+half8 __ovld __cnfn sin(half8);
+half16 __ovld __cnfn sin(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute sine and cosine of x. The computed sine
+ * is the return value and computed cosine is returned
+ * in cosval.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld sincos(float x, float *cosval);
+float2 __ovld sincos(float2 x, float2 *cosval);
+float3 __ovld sincos(float3 x, float3 *cosval);
+float4 __ovld sincos(float4 x, float4 *cosval);
+float8 __ovld sincos(float8 x, float8 *cosval);
+float16 __ovld sincos(float16 x, float16 *cosval);
+#ifdef cl_khr_fp64
+double __ovld sincos(double x, double *cosval);
+double2 __ovld sincos(double2 x, double2 *cosval);
+double3 __ovld sincos(double3 x, double3 *cosval);
+double4 __ovld sincos(double4 x, double4 *cosval);
+double8 __ovld sincos(double8 x, double8 *cosval);
+double16 __ovld sincos(double16 x, double16 *cosval);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld sincos(half x, half *cosval);
+half2 __ovld sincos(half2 x, half2 *cosval);
+half3 __ovld sincos(half3 x, half3 *cosval);
+half4 __ovld sincos(half4 x, half4 *cosval);
+half8 __ovld sincos(half8 x, half8 *cosval);
+half16 __ovld sincos(half16 x, half16 *cosval);
+#endif //cl_khr_fp16
+#else
+float __ovld sincos(float x, __global float *cosval);
+float2 __ovld sincos(float2 x, __global float2 *cosval);
+float3 __ovld sincos(float3 x, __global float3 *cosval);
+float4 __ovld sincos(float4 x, __global float4 *cosval);
+float8 __ovld sincos(float8 x, __global float8 *cosval);
+float16 __ovld sincos(float16 x, __global float16 *cosval);
+float __ovld sincos(float x, __local float *cosval);
+float2 __ovld sincos(float2 x, __local float2 *cosval);
+float3 __ovld sincos(float3 x, __local float3 *cosval);
+float4 __ovld sincos(float4 x, __local float4 *cosval);
+float8 __ovld sincos(float8 x, __local float8 *cosval);
+float16 __ovld sincos(float16 x, __local float16 *cosval);
+float __ovld sincos(float x, __private float *cosval);
+float2 __ovld sincos(float2 x, __private float2 *cosval);
+float3 __ovld sincos(float3 x, __private float3 *cosval);
+float4 __ovld sincos(float4 x, __private float4 *cosval);
+float8 __ovld sincos(float8 x, __private float8 *cosval);
+float16 __ovld sincos(float16 x, __private float16 *cosval);
+#ifdef cl_khr_fp64
+double __ovld sincos(double x, __global double *cosval);
+double2 __ovld sincos(double2 x, __global double2 *cosval);
+double3 __ovld sincos(double3 x, __global double3 *cosval);
+double4 __ovld sincos(double4 x, __global double4 *cosval);
+double8 __ovld sincos(double8 x, __global double8 *cosval);
+double16 __ovld sincos(double16 x, __global double16 *cosval);
+double __ovld sincos(double x, __local double *cosval);
+double2 __ovld sincos(double2 x, __local double2 *cosval);
+double3 __ovld sincos(double3 x, __local double3 *cosval);
+double4 __ovld sincos(double4 x, __local double4 *cosval);
+double8 __ovld sincos(double8 x, __local double8 *cosval);
+double16 __ovld sincos(double16 x, __local double16 *cosval);
+double __ovld sincos(double x, __private double *cosval);
+double2 __ovld sincos(double2 x, __private double2 *cosval);
+double3 __ovld sincos(double3 x, __private double3 *cosval);
+double4 __ovld sincos(double4 x, __private double4 *cosval);
+double8 __ovld sincos(double8 x, __private double8 *cosval);
+double16 __ovld sincos(double16 x, __private double16 *cosval);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld sincos(half x, __global half *cosval);
+half2 __ovld sincos(half2 x, __global half2 *cosval);
+half3 __ovld sincos(half3 x, __global half3 *cosval);
+half4 __ovld sincos(half4 x, __global half4 *cosval);
+half8 __ovld sincos(half8 x, __global half8 *cosval);
+half16 __ovld sincos(half16 x, __global half16 *cosval);
+half __ovld sincos(half x, __local half *cosval);
+half2 __ovld sincos(half2 x, __local half2 *cosval);
+half3 __ovld sincos(half3 x, __local half3 *cosval);
+half4 __ovld sincos(half4 x, __local half4 *cosval);
+half8 __ovld sincos(half8 x, __local half8 *cosval);
+half16 __ovld sincos(half16 x, __local half16 *cosval);
+half __ovld sincos(half x, __private half *cosval);
+half2 __ovld sincos(half2 x, __private half2 *cosval);
+half3 __ovld sincos(half3 x, __private half3 *cosval);
+half4 __ovld sincos(half4 x, __private half4 *cosval);
+half8 __ovld sincos(half8 x, __private half8 *cosval);
+half16 __ovld sincos(half16 x, __private half16 *cosval);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Compute hyperbolic sine.
+ */
+float __ovld __cnfn sinh(float);
+float2 __ovld __cnfn sinh(float2);
+float3 __ovld __cnfn sinh(float3);
+float4 __ovld __cnfn sinh(float4);
+float8 __ovld __cnfn sinh(float8);
+float16 __ovld __cnfn sinh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sinh(double);
+double2 __ovld __cnfn sinh(double2);
+double3 __ovld __cnfn sinh(double3);
+double4 __ovld __cnfn sinh(double4);
+double8 __ovld __cnfn sinh(double8);
+double16 __ovld __cnfn sinh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sinh(half);
+half2 __ovld __cnfn sinh(half2);
+half3 __ovld __cnfn sinh(half3);
+half4 __ovld __cnfn sinh(half4);
+half8 __ovld __cnfn sinh(half8);
+half16 __ovld __cnfn sinh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute sin (PI * x).
+ */
+float __ovld __cnfn sinpi(float x);
+float2 __ovld __cnfn sinpi(float2 x);
+float3 __ovld __cnfn sinpi(float3 x);
+float4 __ovld __cnfn sinpi(float4 x);
+float8 __ovld __cnfn sinpi(float8 x);
+float16 __ovld __cnfn sinpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sinpi(double x);
+double2 __ovld __cnfn sinpi(double2 x);
+double3 __ovld __cnfn sinpi(double3 x);
+double4 __ovld __cnfn sinpi(double4 x);
+double8 __ovld __cnfn sinpi(double8 x);
+double16 __ovld __cnfn sinpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sinpi(half x);
+half2 __ovld __cnfn sinpi(half2 x);
+half3 __ovld __cnfn sinpi(half3 x);
+half4 __ovld __cnfn sinpi(half4 x);
+half8 __ovld __cnfn sinpi(half8 x);
+half16 __ovld __cnfn sinpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute square root.
+ */
+float __ovld __cnfn sqrt(float);
+float2 __ovld __cnfn sqrt(float2);
+float3 __ovld __cnfn sqrt(float3);
+float4 __ovld __cnfn sqrt(float4);
+float8 __ovld __cnfn sqrt(float8);
+float16 __ovld __cnfn sqrt(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sqrt(double);
+double2 __ovld __cnfn sqrt(double2);
+double3 __ovld __cnfn sqrt(double3);
+double4 __ovld __cnfn sqrt(double4);
+double8 __ovld __cnfn sqrt(double8);
+double16 __ovld __cnfn sqrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sqrt(half);
+half2 __ovld __cnfn sqrt(half2);
+half3 __ovld __cnfn sqrt(half3);
+half4 __ovld __cnfn sqrt(half4);
+half8 __ovld __cnfn sqrt(half8);
+half16 __ovld __cnfn sqrt(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute tangent.
+ */
+float __ovld __cnfn tan(float);
+float2 __ovld __cnfn tan(float2);
+float3 __ovld __cnfn tan(float3);
+float4 __ovld __cnfn tan(float4);
+float8 __ovld __cnfn tan(float8);
+float16 __ovld __cnfn tan(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tan(double);
+double2 __ovld __cnfn tan(double2);
+double3 __ovld __cnfn tan(double3);
+double4 __ovld __cnfn tan(double4);
+double8 __ovld __cnfn tan(double8);
+double16 __ovld __cnfn tan(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tan(half);
+half2 __ovld __cnfn tan(half2);
+half3 __ovld __cnfn tan(half3);
+half4 __ovld __cnfn tan(half4);
+half8 __ovld __cnfn tan(half8);
+half16 __ovld __cnfn tan(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute hyperbolic tangent.
+ */
+float __ovld __cnfn tanh(float);
+float2 __ovld __cnfn tanh(float2);
+float3 __ovld __cnfn tanh(float3);
+float4 __ovld __cnfn tanh(float4);
+float8 __ovld __cnfn tanh(float8);
+float16 __ovld __cnfn tanh(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tanh(double);
+double2 __ovld __cnfn tanh(double2);
+double3 __ovld __cnfn tanh(double3);
+double4 __ovld __cnfn tanh(double4);
+double8 __ovld __cnfn tanh(double8);
+double16 __ovld __cnfn tanh(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tanh(half);
+half2 __ovld __cnfn tanh(half2);
+half3 __ovld __cnfn tanh(half3);
+half4 __ovld __cnfn tanh(half4);
+half8 __ovld __cnfn tanh(half8);
+half16 __ovld __cnfn tanh(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute tan (PI * x).
+ */
+float __ovld __cnfn tanpi(float x);
+float2 __ovld __cnfn tanpi(float2 x);
+float3 __ovld __cnfn tanpi(float3 x);
+float4 __ovld __cnfn tanpi(float4 x);
+float8 __ovld __cnfn tanpi(float8 x);
+float16 __ovld __cnfn tanpi(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tanpi(double x);
+double2 __ovld __cnfn tanpi(double2 x);
+double3 __ovld __cnfn tanpi(double3 x);
+double4 __ovld __cnfn tanpi(double4 x);
+double8 __ovld __cnfn tanpi(double8 x);
+double16 __ovld __cnfn tanpi(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tanpi(half x);
+half2 __ovld __cnfn tanpi(half2 x);
+half3 __ovld __cnfn tanpi(half3 x);
+half4 __ovld __cnfn tanpi(half4 x);
+half8 __ovld __cnfn tanpi(half8 x);
+half16 __ovld __cnfn tanpi(half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Compute the gamma function.
+ */
+float __ovld __cnfn tgamma(float);
+float2 __ovld __cnfn tgamma(float2);
+float3 __ovld __cnfn tgamma(float3);
+float4 __ovld __cnfn tgamma(float4);
+float8 __ovld __cnfn tgamma(float8);
+float16 __ovld __cnfn tgamma(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn tgamma(double);
+double2 __ovld __cnfn tgamma(double2);
+double3 __ovld __cnfn tgamma(double3);
+double4 __ovld __cnfn tgamma(double4);
+double8 __ovld __cnfn tgamma(double8);
+double16 __ovld __cnfn tgamma(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn tgamma(half);
+half2 __ovld __cnfn tgamma(half2);
+half3 __ovld __cnfn tgamma(half3);
+half4 __ovld __cnfn tgamma(half4);
+half8 __ovld __cnfn tgamma(half8);
+half16 __ovld __cnfn tgamma(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Round to integral value using the round to zero
+ * rounding mode.
+ */
+float __ovld __cnfn trunc(float);
+float2 __ovld __cnfn trunc(float2);
+float3 __ovld __cnfn trunc(float3);
+float4 __ovld __cnfn trunc(float4);
+float8 __ovld __cnfn trunc(float8);
+float16 __ovld __cnfn trunc(float16);
+#ifdef cl_khr_fp64
+double __ovld __cnfn trunc(double);
+double2 __ovld __cnfn trunc(double2);
+double3 __ovld __cnfn trunc(double3);
+double4 __ovld __cnfn trunc(double4);
+double8 __ovld __cnfn trunc(double8);
+double16 __ovld __cnfn trunc(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn trunc(half);
+half2 __ovld __cnfn trunc(half2);
+half3 __ovld __cnfn trunc(half3);
+half4 __ovld __cnfn trunc(half4);
+half8 __ovld __cnfn trunc(half8);
+half16 __ovld __cnfn trunc(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Compute cosine. x must be in the range -2^16 ... +2^16.
+ */
+float __ovld __cnfn half_cos(float x);
+float2 __ovld __cnfn half_cos(float2 x);
+float3 __ovld __cnfn half_cos(float3 x);
+float4 __ovld __cnfn half_cos(float4 x);
+float8 __ovld __cnfn half_cos(float8 x);
+float16 __ovld __cnfn half_cos(float16 x);
+
+/**
+ * Compute x / y.
+ */
+float __ovld __cnfn half_divide(float x, float y);
+float2 __ovld __cnfn half_divide(float2 x, float2 y);
+float3 __ovld __cnfn half_divide(float3 x, float3 y);
+float4 __ovld __cnfn half_divide(float4 x, float4 y);
+float8 __ovld __cnfn half_divide(float8 x, float8 y);
+float16 __ovld __cnfn half_divide(float16 x, float16 y);
+
+/**
+ * Compute the base- e exponential of x.
+ */
+float __ovld __cnfn half_exp(float x);
+float2 __ovld __cnfn half_exp(float2 x);
+float3 __ovld __cnfn half_exp(float3 x);
+float4 __ovld __cnfn half_exp(float4 x);
+float8 __ovld __cnfn half_exp(float8 x);
+float16 __ovld __cnfn half_exp(float16 x);
+
+/**
+ * Compute the base- 2 exponential of x.
+ */
+float __ovld __cnfn half_exp2(float x);
+float2 __ovld __cnfn half_exp2(float2 x);
+float3 __ovld __cnfn half_exp2(float3 x);
+float4 __ovld __cnfn half_exp2(float4 x);
+float8 __ovld __cnfn half_exp2(float8 x);
+float16 __ovld __cnfn half_exp2(float16 x);
+
+/**
+ * Compute the base- 10 exponential of x.
+ */
+float __ovld __cnfn half_exp10(float x);
+float2 __ovld __cnfn half_exp10(float2 x);
+float3 __ovld __cnfn half_exp10(float3 x);
+float4 __ovld __cnfn half_exp10(float4 x);
+float8 __ovld __cnfn half_exp10(float8 x);
+float16 __ovld __cnfn half_exp10(float16 x);
+
+/**
+ * Compute natural logarithm.
+ */
+float __ovld __cnfn half_log(float x);
+float2 __ovld __cnfn half_log(float2 x);
+float3 __ovld __cnfn half_log(float3 x);
+float4 __ovld __cnfn half_log(float4 x);
+float8 __ovld __cnfn half_log(float8 x);
+float16 __ovld __cnfn half_log(float16 x);
+
+/**
+ * Compute a base 2 logarithm.
+ */
+float __ovld __cnfn half_log2(float x);
+float2 __ovld __cnfn half_log2(float2 x);
+float3 __ovld __cnfn half_log2(float3 x);
+float4 __ovld __cnfn half_log2(float4 x);
+float8 __ovld __cnfn half_log2(float8 x);
+float16 __ovld __cnfn half_log2(float16 x);
+
+/**
+ * Compute a base 10 logarithm.
+ */
+float __ovld __cnfn half_log10(float x);
+float2 __ovld __cnfn half_log10(float2 x);
+float3 __ovld __cnfn half_log10(float3 x);
+float4 __ovld __cnfn half_log10(float4 x);
+float8 __ovld __cnfn half_log10(float8 x);
+float16 __ovld __cnfn half_log10(float16 x);
+
+/**
+ * Compute x to the power y, where x is >= 0.
+ */
+float __ovld __cnfn half_powr(float x, float y);
+float2 __ovld __cnfn half_powr(float2 x, float2 y);
+float3 __ovld __cnfn half_powr(float3 x, float3 y);
+float4 __ovld __cnfn half_powr(float4 x, float4 y);
+float8 __ovld __cnfn half_powr(float8 x, float8 y);
+float16 __ovld __cnfn half_powr(float16 x, float16 y);
+
+/**
+ * Compute reciprocal.
+ */
+float __ovld __cnfn half_recip(float x);
+float2 __ovld __cnfn half_recip(float2 x);
+float3 __ovld __cnfn half_recip(float3 x);
+float4 __ovld __cnfn half_recip(float4 x);
+float8 __ovld __cnfn half_recip(float8 x);
+float16 __ovld __cnfn half_recip(float16 x);
+
+/**
+ * Compute inverse square root.
+ */
+float __ovld __cnfn half_rsqrt(float x);
+float2 __ovld __cnfn half_rsqrt(float2 x);
+float3 __ovld __cnfn half_rsqrt(float3 x);
+float4 __ovld __cnfn half_rsqrt(float4 x);
+float8 __ovld __cnfn half_rsqrt(float8 x);
+float16 __ovld __cnfn half_rsqrt(float16 x);
+
+/**
+ * Compute sine. x must be in the range -2^16 ... +2^16.
+ */
+float __ovld __cnfn half_sin(float x);
+float2 __ovld __cnfn half_sin(float2 x);
+float3 __ovld __cnfn half_sin(float3 x);
+float4 __ovld __cnfn half_sin(float4 x);
+float8 __ovld __cnfn half_sin(float8 x);
+float16 __ovld __cnfn half_sin(float16 x);
+
+/**
+ * Compute square root.
+ */
+float __ovld __cnfn half_sqrt(float x);
+float2 __ovld __cnfn half_sqrt(float2 x);
+float3 __ovld __cnfn half_sqrt(float3 x);
+float4 __ovld __cnfn half_sqrt(float4 x);
+float8 __ovld __cnfn half_sqrt(float8 x);
+float16 __ovld __cnfn half_sqrt(float16 x);
+
+/**
+ * Compute tangent. x must be in the range -216 ... +216.
+ */
+float __ovld __cnfn half_tan(float x);
+float2 __ovld __cnfn half_tan(float2 x);
+float3 __ovld __cnfn half_tan(float3 x);
+float4 __ovld __cnfn half_tan(float4 x);
+float8 __ovld __cnfn half_tan(float8 x);
+float16 __ovld __cnfn half_tan(float16 x);
+
+/**
+ * Compute cosine over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_cos(float x);
+float2 __ovld __cnfn native_cos(float2 x);
+float3 __ovld __cnfn native_cos(float3 x);
+float4 __ovld __cnfn native_cos(float4 x);
+float8 __ovld __cnfn native_cos(float8 x);
+float16 __ovld __cnfn native_cos(float16 x);
+
+/**
+ * Compute x / y over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_divide(float x, float y);
+float2 __ovld __cnfn native_divide(float2 x, float2 y);
+float3 __ovld __cnfn native_divide(float3 x, float3 y);
+float4 __ovld __cnfn native_divide(float4 x, float4 y);
+float8 __ovld __cnfn native_divide(float8 x, float8 y);
+float16 __ovld __cnfn native_divide(float16 x, float16 y);
+
+/**
+ * Compute the base- e exponential of x over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_exp(float x);
+float2 __ovld __cnfn native_exp(float2 x);
+float3 __ovld __cnfn native_exp(float3 x);
+float4 __ovld __cnfn native_exp(float4 x);
+float8 __ovld __cnfn native_exp(float8 x);
+float16 __ovld __cnfn native_exp(float16 x);
+
+/**
+ * Compute the base- 2 exponential of x over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_exp2(float x);
+float2 __ovld __cnfn native_exp2(float2 x);
+float3 __ovld __cnfn native_exp2(float3 x);
+float4 __ovld __cnfn native_exp2(float4 x);
+float8 __ovld __cnfn native_exp2(float8 x);
+float16 __ovld __cnfn native_exp2(float16 x);
+
+/**
+ * Compute the base- 10 exponential of x over an
+ * implementation-defined range. The maximum error is
+ * implementation-defined.
+ */
+float __ovld __cnfn native_exp10(float x);
+float2 __ovld __cnfn native_exp10(float2 x);
+float3 __ovld __cnfn native_exp10(float3 x);
+float4 __ovld __cnfn native_exp10(float4 x);
+float8 __ovld __cnfn native_exp10(float8 x);
+float16 __ovld __cnfn native_exp10(float16 x);
+
+/**
+ * Compute natural logarithm over an implementationdefined
+ * range. The maximum error is implementation
+ * defined.
+ */
+float __ovld __cnfn native_log(float x);
+float2 __ovld __cnfn native_log(float2 x);
+float3 __ovld __cnfn native_log(float3 x);
+float4 __ovld __cnfn native_log(float4 x);
+float8 __ovld __cnfn native_log(float8 x);
+float16 __ovld __cnfn native_log(float16 x);
+
+/**
+ * Compute a base 2 logarithm over an implementationdefined
+ * range. The maximum error is implementationdefined.
+ */
+float __ovld __cnfn native_log2(float x);
+float2 __ovld __cnfn native_log2(float2 x);
+float3 __ovld __cnfn native_log2(float3 x);
+float4 __ovld __cnfn native_log2(float4 x);
+float8 __ovld __cnfn native_log2(float8 x);
+float16 __ovld __cnfn native_log2(float16 x);
+
+/**
+ * Compute a base 10 logarithm over an implementationdefined
+ * range. The maximum error is implementationdefined.
+ */
+float __ovld __cnfn native_log10(float x);
+float2 __ovld __cnfn native_log10(float2 x);
+float3 __ovld __cnfn native_log10(float3 x);
+float4 __ovld __cnfn native_log10(float4 x);
+float8 __ovld __cnfn native_log10(float8 x);
+float16 __ovld __cnfn native_log10(float16 x);
+
+/**
+ * Compute x to the power y, where x is >= 0. The range of
+ * x and y are implementation-defined. The maximum error
+ * is implementation-defined.
+ */
+float __ovld __cnfn native_powr(float x, float y);
+float2 __ovld __cnfn native_powr(float2 x, float2 y);
+float3 __ovld __cnfn native_powr(float3 x, float3 y);
+float4 __ovld __cnfn native_powr(float4 x, float4 y);
+float8 __ovld __cnfn native_powr(float8 x, float8 y);
+float16 __ovld __cnfn native_powr(float16 x, float16 y);
+
+/**
+ * Compute reciprocal over an implementation-defined
+ * range. The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_recip(float x);
+float2 __ovld __cnfn native_recip(float2 x);
+float3 __ovld __cnfn native_recip(float3 x);
+float4 __ovld __cnfn native_recip(float4 x);
+float8 __ovld __cnfn native_recip(float8 x);
+float16 __ovld __cnfn native_recip(float16 x);
+
+/**
+ * Compute inverse square root over an implementationdefined
+ * range. The maximum error is implementationdefined.
+ */
+float __ovld __cnfn native_rsqrt(float x);
+float2 __ovld __cnfn native_rsqrt(float2 x);
+float3 __ovld __cnfn native_rsqrt(float3 x);
+float4 __ovld __cnfn native_rsqrt(float4 x);
+float8 __ovld __cnfn native_rsqrt(float8 x);
+float16 __ovld __cnfn native_rsqrt(float16 x);
+
+/**
+ * Compute sine over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_sin(float x);
+float2 __ovld __cnfn native_sin(float2 x);
+float3 __ovld __cnfn native_sin(float3 x);
+float4 __ovld __cnfn native_sin(float4 x);
+float8 __ovld __cnfn native_sin(float8 x);
+float16 __ovld __cnfn native_sin(float16 x);
+
+/**
+ * Compute square root over an implementation-defined
+ * range. The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_sqrt(float x);
+float2 __ovld __cnfn native_sqrt(float2 x);
+float3 __ovld __cnfn native_sqrt(float3 x);
+float4 __ovld __cnfn native_sqrt(float4 x);
+float8 __ovld __cnfn native_sqrt(float8 x);
+float16 __ovld __cnfn native_sqrt(float16 x);
+
+/**
+ * Compute tangent over an implementation-defined range.
+ * The maximum error is implementation-defined.
+ */
+float __ovld __cnfn native_tan(float x);
+float2 __ovld __cnfn native_tan(float2 x);
+float3 __ovld __cnfn native_tan(float3 x);
+float4 __ovld __cnfn native_tan(float4 x);
+float8 __ovld __cnfn native_tan(float8 x);
+float16 __ovld __cnfn native_tan(float16 x);
+
+// OpenCL v1.1 s6.11.3, v1.2 s6.12.3, v2.0 s6.13.3 - Integer Functions
+
+/**
+ * Returns | x |.
+ */
+uchar __ovld __cnfn abs(char x);
+uchar __ovld __cnfn abs(uchar x);
+uchar2 __ovld __cnfn abs(char2 x);
+uchar2 __ovld __cnfn abs(uchar2 x);
+uchar3 __ovld __cnfn abs(char3 x);
+uchar3 __ovld __cnfn abs(uchar3 x);
+uchar4 __ovld __cnfn abs(char4 x);
+uchar4 __ovld __cnfn abs(uchar4 x);
+uchar8 __ovld __cnfn abs(char8 x);
+uchar8 __ovld __cnfn abs(uchar8 x);
+uchar16 __ovld __cnfn abs(char16 x);
+uchar16 __ovld __cnfn abs(uchar16 x);
+ushort __ovld __cnfn abs(short x);
+ushort __ovld __cnfn abs(ushort x);
+ushort2 __ovld __cnfn abs(short2 x);
+ushort2 __ovld __cnfn abs(ushort2 x);
+ushort3 __ovld __cnfn abs(short3 x);
+ushort3 __ovld __cnfn abs(ushort3 x);
+ushort4 __ovld __cnfn abs(short4 x);
+ushort4 __ovld __cnfn abs(ushort4 x);
+ushort8 __ovld __cnfn abs(short8 x);
+ushort8 __ovld __cnfn abs(ushort8 x);
+ushort16 __ovld __cnfn abs(short16 x);
+ushort16 __ovld __cnfn abs(ushort16 x);
+uint __ovld __cnfn abs(int x);
+uint __ovld __cnfn abs(uint x);
+uint2 __ovld __cnfn abs(int2 x);
+uint2 __ovld __cnfn abs(uint2 x);
+uint3 __ovld __cnfn abs(int3 x);
+uint3 __ovld __cnfn abs(uint3 x);
+uint4 __ovld __cnfn abs(int4 x);
+uint4 __ovld __cnfn abs(uint4 x);
+uint8 __ovld __cnfn abs(int8 x);
+uint8 __ovld __cnfn abs(uint8 x);
+uint16 __ovld __cnfn abs(int16 x);
+uint16 __ovld __cnfn abs(uint16 x);
+ulong __ovld __cnfn abs(long x);
+ulong __ovld __cnfn abs(ulong x);
+ulong2 __ovld __cnfn abs(long2 x);
+ulong2 __ovld __cnfn abs(ulong2 x);
+ulong3 __ovld __cnfn abs(long3 x);
+ulong3 __ovld __cnfn abs(ulong3 x);
+ulong4 __ovld __cnfn abs(long4 x);
+ulong4 __ovld __cnfn abs(ulong4 x);
+ulong8 __ovld __cnfn abs(long8 x);
+ulong8 __ovld __cnfn abs(ulong8 x);
+ulong16 __ovld __cnfn abs(long16 x);
+ulong16 __ovld __cnfn abs(ulong16 x);
+
+/**
+ * Returns | x - y | without modulo overflow.
+ */
+uchar __ovld __cnfn abs_diff(char x, char y);
+uchar __ovld __cnfn abs_diff(uchar x, uchar y);
+uchar2 __ovld __cnfn abs_diff(char2 x, char2 y);
+uchar2 __ovld __cnfn abs_diff(uchar2 x, uchar2 y);
+uchar3 __ovld __cnfn abs_diff(char3 x, char3 y);
+uchar3 __ovld __cnfn abs_diff(uchar3 x, uchar3 y);
+uchar4 __ovld __cnfn abs_diff(char4 x, char4 y);
+uchar4 __ovld __cnfn abs_diff(uchar4 x, uchar4 y);
+uchar8 __ovld __cnfn abs_diff(char8 x, char8 y);
+uchar8 __ovld __cnfn abs_diff(uchar8 x, uchar8 y);
+uchar16 __ovld __cnfn abs_diff(char16 x, char16 y);
+uchar16 __ovld __cnfn abs_diff(uchar16 x, uchar16 y);
+ushort __ovld __cnfn abs_diff(short x, short y);
+ushort __ovld __cnfn abs_diff(ushort x, ushort y);
+ushort2 __ovld __cnfn abs_diff(short2 x, short2 y);
+ushort2 __ovld __cnfn abs_diff(ushort2 x, ushort2 y);
+ushort3 __ovld __cnfn abs_diff(short3 x, short3 y);
+ushort3 __ovld __cnfn abs_diff(ushort3 x, ushort3 y);
+ushort4 __ovld __cnfn abs_diff(short4 x, short4 y);
+ushort4 __ovld __cnfn abs_diff(ushort4 x, ushort4 y);
+ushort8 __ovld __cnfn abs_diff(short8 x, short8 y);
+ushort8 __ovld __cnfn abs_diff(ushort8 x, ushort8 y);
+ushort16 __ovld __cnfn abs_diff(short16 x, short16 y);
+ushort16 __ovld __cnfn abs_diff(ushort16 x, ushort16 y);
+uint __ovld __cnfn abs_diff(int x, int y);
+uint __ovld __cnfn abs_diff(uint x, uint y);
+uint2 __ovld __cnfn abs_diff(int2 x, int2 y);
+uint2 __ovld __cnfn abs_diff(uint2 x, uint2 y);
+uint3 __ovld __cnfn abs_diff(int3 x, int3 y);
+uint3 __ovld __cnfn abs_diff(uint3 x, uint3 y);
+uint4 __ovld __cnfn abs_diff(int4 x, int4 y);
+uint4 __ovld __cnfn abs_diff(uint4 x, uint4 y);
+uint8 __ovld __cnfn abs_diff(int8 x, int8 y);
+uint8 __ovld __cnfn abs_diff(uint8 x, uint8 y);
+uint16 __ovld __cnfn abs_diff(int16 x, int16 y);
+uint16 __ovld __cnfn abs_diff(uint16 x, uint16 y);
+ulong __ovld __cnfn abs_diff(long x, long y);
+ulong __ovld __cnfn abs_diff(ulong x, ulong y);
+ulong2 __ovld __cnfn abs_diff(long2 x, long2 y);
+ulong2 __ovld __cnfn abs_diff(ulong2 x, ulong2 y);
+ulong3 __ovld __cnfn abs_diff(long3 x, long3 y);
+ulong3 __ovld __cnfn abs_diff(ulong3 x, ulong3 y);
+ulong4 __ovld __cnfn abs_diff(long4 x, long4 y);
+ulong4 __ovld __cnfn abs_diff(ulong4 x, ulong4 y);
+ulong8 __ovld __cnfn abs_diff(long8 x, long8 y);
+ulong8 __ovld __cnfn abs_diff(ulong8 x, ulong8 y);
+ulong16 __ovld __cnfn abs_diff(long16 x, long16 y);
+ulong16 __ovld __cnfn abs_diff(ulong16 x, ulong16 y);
+
+/**
+ * Returns x + y and saturates the result.
+ */
+char __ovld __cnfn add_sat(char x, char y);
+uchar __ovld __cnfn add_sat(uchar x, uchar y);
+char2 __ovld __cnfn add_sat(char2 x, char2 y);
+uchar2 __ovld __cnfn add_sat(uchar2 x, uchar2 y);
+char3 __ovld __cnfn add_sat(char3 x, char3 y);
+uchar3 __ovld __cnfn add_sat(uchar3 x, uchar3 y);
+char4 __ovld __cnfn add_sat(char4 x, char4 y);
+uchar4 __ovld __cnfn add_sat(uchar4 x, uchar4 y);
+char8 __ovld __cnfn add_sat(char8 x, char8 y);
+uchar8 __ovld __cnfn add_sat(uchar8 x, uchar8 y);
+char16 __ovld __cnfn add_sat(char16 x, char16 y);
+uchar16 __ovld __cnfn add_sat(uchar16 x, uchar16 y);
+short __ovld __cnfn add_sat(short x, short y);
+ushort __ovld __cnfn add_sat(ushort x, ushort y);
+short2 __ovld __cnfn add_sat(short2 x, short2 y);
+ushort2 __ovld __cnfn add_sat(ushort2 x, ushort2 y);
+short3 __ovld __cnfn add_sat(short3 x, short3 y);
+ushort3 __ovld __cnfn add_sat(ushort3 x, ushort3 y);
+short4 __ovld __cnfn add_sat(short4 x, short4 y);
+ushort4 __ovld __cnfn add_sat(ushort4 x, ushort4 y);
+short8 __ovld __cnfn add_sat(short8 x, short8 y);
+ushort8 __ovld __cnfn add_sat(ushort8 x, ushort8 y);
+short16 __ovld __cnfn add_sat(short16 x, short16 y);
+ushort16 __ovld __cnfn add_sat(ushort16 x, ushort16 y);
+int __ovld __cnfn add_sat(int x, int y);
+uint __ovld __cnfn add_sat(uint x, uint y);
+int2 __ovld __cnfn add_sat(int2 x, int2 y);
+uint2 __ovld __cnfn add_sat(uint2 x, uint2 y);
+int3 __ovld __cnfn add_sat(int3 x, int3 y);
+uint3 __ovld __cnfn add_sat(uint3 x, uint3 y);
+int4 __ovld __cnfn add_sat(int4 x, int4 y);
+uint4 __ovld __cnfn add_sat(uint4 x, uint4 y);
+int8 __ovld __cnfn add_sat(int8 x, int8 y);
+uint8 __ovld __cnfn add_sat(uint8 x, uint8 y);
+int16 __ovld __cnfn add_sat(int16 x, int16 y);
+uint16 __ovld __cnfn add_sat(uint16 x, uint16 y);
+long __ovld __cnfn add_sat(long x, long y);
+ulong __ovld __cnfn add_sat(ulong x, ulong y);
+long2 __ovld __cnfn add_sat(long2 x, long2 y);
+ulong2 __ovld __cnfn add_sat(ulong2 x, ulong2 y);
+long3 __ovld __cnfn add_sat(long3 x, long3 y);
+ulong3 __ovld __cnfn add_sat(ulong3 x, ulong3 y);
+long4 __ovld __cnfn add_sat(long4 x, long4 y);
+ulong4 __ovld __cnfn add_sat(ulong4 x, ulong4 y);
+long8 __ovld __cnfn add_sat(long8 x, long8 y);
+ulong8 __ovld __cnfn add_sat(ulong8 x, ulong8 y);
+long16 __ovld __cnfn add_sat(long16 x, long16 y);
+ulong16 __ovld __cnfn add_sat(ulong16 x, ulong16 y);
+
+/**
+ * Returns (x + y) >> 1. The intermediate sum does
+ * not modulo overflow.
+ */
+char __ovld __cnfn hadd(char x, char y);
+uchar __ovld __cnfn hadd(uchar x, uchar y);
+char2 __ovld __cnfn hadd(char2 x, char2 y);
+uchar2 __ovld __cnfn hadd(uchar2 x, uchar2 y);
+char3 __ovld __cnfn hadd(char3 x, char3 y);
+uchar3 __ovld __cnfn hadd(uchar3 x, uchar3 y);
+char4 __ovld __cnfn hadd(char4 x, char4 y);
+uchar4 __ovld __cnfn hadd(uchar4 x, uchar4 y);
+char8 __ovld __cnfn hadd(char8 x, char8 y);
+uchar8 __ovld __cnfn hadd(uchar8 x, uchar8 y);
+char16 __ovld __cnfn hadd(char16 x, char16 y);
+uchar16 __ovld __cnfn hadd(uchar16 x, uchar16 y);
+short __ovld __cnfn hadd(short x, short y);
+ushort __ovld __cnfn hadd(ushort x, ushort y);
+short2 __ovld __cnfn hadd(short2 x, short2 y);
+ushort2 __ovld __cnfn hadd(ushort2 x, ushort2 y);
+short3 __ovld __cnfn hadd(short3 x, short3 y);
+ushort3 __ovld __cnfn hadd(ushort3 x, ushort3 y);
+short4 __ovld __cnfn hadd(short4 x, short4 y);
+ushort4 __ovld __cnfn hadd(ushort4 x, ushort4 y);
+short8 __ovld __cnfn hadd(short8 x, short8 y);
+ushort8 __ovld __cnfn hadd(ushort8 x, ushort8 y);
+short16 __ovld __cnfn hadd(short16 x, short16 y);
+ushort16 __ovld __cnfn hadd(ushort16 x, ushort16 y);
+int __ovld __cnfn hadd(int x, int y);
+uint __ovld __cnfn hadd(uint x, uint y);
+int2 __ovld __cnfn hadd(int2 x, int2 y);
+uint2 __ovld __cnfn hadd(uint2 x, uint2 y);
+int3 __ovld __cnfn hadd(int3 x, int3 y);
+uint3 __ovld __cnfn hadd(uint3 x, uint3 y);
+int4 __ovld __cnfn hadd(int4 x, int4 y);
+uint4 __ovld __cnfn hadd(uint4 x, uint4 y);
+int8 __ovld __cnfn hadd(int8 x, int8 y);
+uint8 __ovld __cnfn hadd(uint8 x, uint8 y);
+int16 __ovld __cnfn hadd(int16 x, int16 y);
+uint16 __ovld __cnfn hadd(uint16 x, uint16 y);
+long __ovld __cnfn hadd(long x, long y);
+ulong __ovld __cnfn hadd(ulong x, ulong y);
+long2 __ovld __cnfn hadd(long2 x, long2 y);
+ulong2 __ovld __cnfn hadd(ulong2 x, ulong2 y);
+long3 __ovld __cnfn hadd(long3 x, long3 y);
+ulong3 __ovld __cnfn hadd(ulong3 x, ulong3 y);
+long4 __ovld __cnfn hadd(long4 x, long4 y);
+ulong4 __ovld __cnfn hadd(ulong4 x, ulong4 y);
+long8 __ovld __cnfn hadd(long8 x, long8 y);
+ulong8 __ovld __cnfn hadd(ulong8 x, ulong8 y);
+long16 __ovld __cnfn hadd(long16 x, long16 y);
+ulong16 __ovld __cnfn hadd(ulong16 x, ulong16 y);
+
+/**
+ * Returns (x + y + 1) >> 1. The intermediate sum
+ * does not modulo overflow.
+ */
+char __ovld __cnfn rhadd(char x, char y);
+uchar __ovld __cnfn rhadd(uchar x, uchar y);
+char2 __ovld __cnfn rhadd(char2 x, char2 y);
+uchar2 __ovld __cnfn rhadd(uchar2 x, uchar2 y);
+char3 __ovld __cnfn rhadd(char3 x, char3 y);
+uchar3 __ovld __cnfn rhadd(uchar3 x, uchar3 y);
+char4 __ovld __cnfn rhadd(char4 x, char4 y);
+uchar4 __ovld __cnfn rhadd(uchar4 x, uchar4 y);
+char8 __ovld __cnfn rhadd(char8 x, char8 y);
+uchar8 __ovld __cnfn rhadd(uchar8 x, uchar8 y);
+char16 __ovld __cnfn rhadd(char16 x, char16 y);
+uchar16 __ovld __cnfn rhadd(uchar16 x, uchar16 y);
+short __ovld __cnfn rhadd(short x, short y);
+ushort __ovld __cnfn rhadd(ushort x, ushort y);
+short2 __ovld __cnfn rhadd(short2 x, short2 y);
+ushort2 __ovld __cnfn rhadd(ushort2 x, ushort2 y);
+short3 __ovld __cnfn rhadd(short3 x, short3 y);
+ushort3 __ovld __cnfn rhadd(ushort3 x, ushort3 y);
+short4 __ovld __cnfn rhadd(short4 x, short4 y);
+ushort4 __ovld __cnfn rhadd(ushort4 x, ushort4 y);
+short8 __ovld __cnfn rhadd(short8 x, short8 y);
+ushort8 __ovld __cnfn rhadd(ushort8 x, ushort8 y);
+short16 __ovld __cnfn rhadd(short16 x, short16 y);
+ushort16 __ovld __cnfn rhadd(ushort16 x, ushort16 y);
+int __ovld __cnfn rhadd(int x, int y);
+uint __ovld __cnfn rhadd(uint x, uint y);
+int2 __ovld __cnfn rhadd(int2 x, int2 y);
+uint2 __ovld __cnfn rhadd(uint2 x, uint2 y);
+int3 __ovld __cnfn rhadd(int3 x, int3 y);
+uint3 __ovld __cnfn rhadd(uint3 x, uint3 y);
+int4 __ovld __cnfn rhadd(int4 x, int4 y);
+uint4 __ovld __cnfn rhadd(uint4 x, uint4 y);
+int8 __ovld __cnfn rhadd(int8 x, int8 y);
+uint8 __ovld __cnfn rhadd(uint8 x, uint8 y);
+int16 __ovld __cnfn rhadd(int16 x, int16 y);
+uint16 __ovld __cnfn rhadd(uint16 x, uint16 y);
+long __ovld __cnfn rhadd(long x, long y);
+ulong __ovld __cnfn rhadd(ulong x, ulong y);
+long2 __ovld __cnfn rhadd(long2 x, long2 y);
+ulong2 __ovld __cnfn rhadd(ulong2 x, ulong2 y);
+long3 __ovld __cnfn rhadd(long3 x, long3 y);
+ulong3 __ovld __cnfn rhadd(ulong3 x, ulong3 y);
+long4 __ovld __cnfn rhadd(long4 x, long4 y);
+ulong4 __ovld __cnfn rhadd(ulong4 x, ulong4 y);
+long8 __ovld __cnfn rhadd(long8 x, long8 y);
+ulong8 __ovld __cnfn rhadd(ulong8 x, ulong8 y);
+long16 __ovld __cnfn rhadd(long16 x, long16 y);
+ulong16 __ovld __cnfn rhadd(ulong16 x, ulong16 y);
+
+/**
+ * Returns min(max(x, minval), maxval).
+ * Results are undefined if minval > maxval.
+ */
+char __ovld __cnfn clamp(char x, char minval, char maxval);
+uchar __ovld __cnfn clamp(uchar x, uchar minval, uchar maxval);
+char2 __ovld __cnfn clamp(char2 x, char2 minval, char2 maxval);
+uchar2 __ovld __cnfn clamp(uchar2 x, uchar2 minval, uchar2 maxval);
+char3 __ovld __cnfn clamp(char3 x, char3 minval, char3 maxval);
+uchar3 __ovld __cnfn clamp(uchar3 x, uchar3 minval, uchar3 maxval);
+char4 __ovld __cnfn clamp(char4 x, char4 minval, char4 maxval);
+uchar4 __ovld __cnfn clamp(uchar4 x, uchar4 minval, uchar4 maxval);
+char8 __ovld __cnfn clamp(char8 x, char8 minval, char8 maxval);
+uchar8 __ovld __cnfn clamp(uchar8 x, uchar8 minval, uchar8 maxval);
+char16 __ovld __cnfn clamp(char16 x, char16 minval, char16 maxval);
+uchar16 __ovld __cnfn clamp(uchar16 x, uchar16 minval, uchar16 maxval);
+short __ovld __cnfn clamp(short x, short minval, short maxval);
+ushort __ovld __cnfn clamp(ushort x, ushort minval, ushort maxval);
+short2 __ovld __cnfn clamp(short2 x, short2 minval, short2 maxval);
+ushort2 __ovld __cnfn clamp(ushort2 x, ushort2 minval, ushort2 maxval);
+short3 __ovld __cnfn clamp(short3 x, short3 minval, short3 maxval);
+ushort3 __ovld __cnfn clamp(ushort3 x, ushort3 minval, ushort3 maxval);
+short4 __ovld __cnfn clamp(short4 x, short4 minval, short4 maxval);
+ushort4 __ovld __cnfn clamp(ushort4 x, ushort4 minval, ushort4 maxval);
+short8 __ovld __cnfn clamp(short8 x, short8 minval, short8 maxval);
+ushort8 __ovld __cnfn clamp(ushort8 x, ushort8 minval, ushort8 maxval);
+short16 __ovld __cnfn clamp(short16 x, short16 minval, short16 maxval);
+ushort16 __ovld __cnfn clamp(ushort16 x, ushort16 minval, ushort16 maxval);
+int __ovld __cnfn clamp(int x, int minval, int maxval);
+uint __ovld __cnfn clamp(uint x, uint minval, uint maxval);
+int2 __ovld __cnfn clamp(int2 x, int2 minval, int2 maxval);
+uint2 __ovld __cnfn clamp(uint2 x, uint2 minval, uint2 maxval);
+int3 __ovld __cnfn clamp(int3 x, int3 minval, int3 maxval);
+uint3 __ovld __cnfn clamp(uint3 x, uint3 minval, uint3 maxval);
+int4 __ovld __cnfn clamp(int4 x, int4 minval, int4 maxval);
+uint4 __ovld __cnfn clamp(uint4 x, uint4 minval, uint4 maxval);
+int8 __ovld __cnfn clamp(int8 x, int8 minval, int8 maxval);
+uint8 __ovld __cnfn clamp(uint8 x, uint8 minval, uint8 maxval);
+int16 __ovld __cnfn clamp(int16 x, int16 minval, int16 maxval);
+uint16 __ovld __cnfn clamp(uint16 x, uint16 minval, uint16 maxval);
+long __ovld __cnfn clamp(long x, long minval, long maxval);
+ulong __ovld __cnfn clamp(ulong x, ulong minval, ulong maxval);
+long2 __ovld __cnfn clamp(long2 x, long2 minval, long2 maxval);
+ulong2 __ovld __cnfn clamp(ulong2 x, ulong2 minval, ulong2 maxval);
+long3 __ovld __cnfn clamp(long3 x, long3 minval, long3 maxval);
+ulong3 __ovld __cnfn clamp(ulong3 x, ulong3 minval, ulong3 maxval);
+long4 __ovld __cnfn clamp(long4 x, long4 minval, long4 maxval);
+ulong4 __ovld __cnfn clamp(ulong4 x, ulong4 minval, ulong4 maxval);
+long8 __ovld __cnfn clamp(long8 x, long8 minval, long8 maxval);
+ulong8 __ovld __cnfn clamp(ulong8 x, ulong8 minval, ulong8 maxval);
+long16 __ovld __cnfn clamp(long16 x, long16 minval, long16 maxval);
+ulong16 __ovld __cnfn clamp(ulong16 x, ulong16 minval, ulong16 maxval);
+char __ovld __cnfn clamp(char x, char minval, char maxval);
+uchar __ovld __cnfn clamp(uchar x, uchar minval, uchar maxval);
+char2 __ovld __cnfn clamp(char2 x, char minval, char maxval);
+uchar2 __ovld __cnfn clamp(uchar2 x, uchar minval, uchar maxval);
+char3 __ovld __cnfn clamp(char3 x, char minval, char maxval);
+uchar3 __ovld __cnfn clamp(uchar3 x, uchar minval, uchar maxval);
+char4 __ovld __cnfn clamp(char4 x, char minval, char maxval);
+uchar4 __ovld __cnfn clamp(uchar4 x, uchar minval, uchar maxval);
+char8 __ovld __cnfn clamp(char8 x, char minval, char maxval);
+uchar8 __ovld __cnfn clamp(uchar8 x, uchar minval, uchar maxval);
+char16 __ovld __cnfn clamp(char16 x, char minval, char maxval);
+uchar16 __ovld __cnfn clamp(uchar16 x, uchar minval, uchar maxval);
+short __ovld __cnfn clamp(short x, short minval, short maxval);
+ushort __ovld __cnfn clamp(ushort x, ushort minval, ushort maxval);
+short2 __ovld __cnfn clamp(short2 x, short minval, short maxval);
+ushort2 __ovld __cnfn clamp(ushort2 x, ushort minval, ushort maxval);
+short3 __ovld __cnfn clamp(short3 x, short minval, short maxval);
+ushort3 __ovld __cnfn clamp(ushort3 x, ushort minval, ushort maxval);
+short4 __ovld __cnfn clamp(short4 x, short minval, short maxval);
+ushort4 __ovld __cnfn clamp(ushort4 x, ushort minval, ushort maxval);
+short8 __ovld __cnfn clamp(short8 x, short minval, short maxval);
+ushort8 __ovld __cnfn clamp(ushort8 x, ushort minval, ushort maxval);
+short16 __ovld __cnfn clamp(short16 x, short minval, short maxval);
+ushort16 __ovld __cnfn clamp(ushort16 x, ushort minval, ushort maxval);
+int __ovld __cnfn clamp(int x, int minval, int maxval);
+uint __ovld __cnfn clamp(uint x, uint minval, uint maxval);
+int2 __ovld __cnfn clamp(int2 x, int minval, int maxval);
+uint2 __ovld __cnfn clamp(uint2 x, uint minval, uint maxval);
+int3 __ovld __cnfn clamp(int3 x, int minval, int maxval);
+uint3 __ovld __cnfn clamp(uint3 x, uint minval, uint maxval);
+int4 __ovld __cnfn clamp(int4 x, int minval, int maxval);
+uint4 __ovld __cnfn clamp(uint4 x, uint minval, uint maxval);
+int8 __ovld __cnfn clamp(int8 x, int minval, int maxval);
+uint8 __ovld __cnfn clamp(uint8 x, uint minval, uint maxval);
+int16 __ovld __cnfn clamp(int16 x, int minval, int maxval);
+uint16 __ovld __cnfn clamp(uint16 x, uint minval, uint maxval);
+long __ovld __cnfn clamp(long x, long minval, long maxval);
+ulong __ovld __cnfn clamp(ulong x, ulong minval, ulong maxval);
+long2 __ovld __cnfn clamp(long2 x, long minval, long maxval);
+ulong2 __ovld __cnfn clamp(ulong2 x, ulong minval, ulong maxval);
+long3 __ovld __cnfn clamp(long3 x, long minval, long maxval);
+ulong3 __ovld __cnfn clamp(ulong3 x, ulong minval, ulong maxval);
+long4 __ovld __cnfn clamp(long4 x, long minval, long maxval);
+ulong4 __ovld __cnfn clamp(ulong4 x, ulong minval, ulong maxval);
+long8 __ovld __cnfn clamp(long8 x, long minval, long maxval);
+ulong8 __ovld __cnfn clamp(ulong8 x, ulong minval, ulong maxval);
+long16 __ovld __cnfn clamp(long16 x, long minval, long maxval);
+ulong16 __ovld __cnfn clamp(ulong16 x, ulong minval, ulong maxval);
+
+/**
+ * Returns the number of leading 0-bits in x, starting
+ * at the most significant bit position.
+ */
+char __ovld __cnfn clz(char x);
+uchar __ovld __cnfn clz(uchar x);
+char2 __ovld __cnfn clz(char2 x);
+uchar2 __ovld __cnfn clz(uchar2 x);
+char3 __ovld __cnfn clz(char3 x);
+uchar3 __ovld __cnfn clz(uchar3 x);
+char4 __ovld __cnfn clz(char4 x);
+uchar4 __ovld __cnfn clz(uchar4 x);
+char8 __ovld __cnfn clz(char8 x);
+uchar8 __ovld __cnfn clz(uchar8 x);
+char16 __ovld __cnfn clz(char16 x);
+uchar16 __ovld __cnfn clz(uchar16 x);
+short __ovld __cnfn clz(short x);
+ushort __ovld __cnfn clz(ushort x);
+short2 __ovld __cnfn clz(short2 x);
+ushort2 __ovld __cnfn clz(ushort2 x);
+short3 __ovld __cnfn clz(short3 x);
+ushort3 __ovld __cnfn clz(ushort3 x);
+short4 __ovld __cnfn clz(short4 x);
+ushort4 __ovld __cnfn clz(ushort4 x);
+short8 __ovld __cnfn clz(short8 x);
+ushort8 __ovld __cnfn clz(ushort8 x);
+short16 __ovld __cnfn clz(short16 x);
+ushort16 __ovld __cnfn clz(ushort16 x);
+int __ovld __cnfn clz(int x);
+uint __ovld __cnfn clz(uint x);
+int2 __ovld __cnfn clz(int2 x);
+uint2 __ovld __cnfn clz(uint2 x);
+int3 __ovld __cnfn clz(int3 x);
+uint3 __ovld __cnfn clz(uint3 x);
+int4 __ovld __cnfn clz(int4 x);
+uint4 __ovld __cnfn clz(uint4 x);
+int8 __ovld __cnfn clz(int8 x);
+uint8 __ovld __cnfn clz(uint8 x);
+int16 __ovld __cnfn clz(int16 x);
+uint16 __ovld __cnfn clz(uint16 x);
+long __ovld __cnfn clz(long x);
+ulong __ovld __cnfn clz(ulong x);
+long2 __ovld __cnfn clz(long2 x);
+ulong2 __ovld __cnfn clz(ulong2 x);
+long3 __ovld __cnfn clz(long3 x);
+ulong3 __ovld __cnfn clz(ulong3 x);
+long4 __ovld __cnfn clz(long4 x);
+ulong4 __ovld __cnfn clz(ulong4 x);
+long8 __ovld __cnfn clz(long8 x);
+ulong8 __ovld __cnfn clz(ulong8 x);
+long16 __ovld __cnfn clz(long16 x);
+ulong16 __ovld __cnfn clz(ulong16 x);
+
+/**
+ * Returns the count of trailing 0-bits in x. If x is 0,
+ * returns the size in bits of the type of x or
+ * component type of x, if x is a vector.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+char __ovld ctz(char x);
+uchar __ovld ctz(uchar x);
+char2 __ovld ctz(char2 x);
+uchar2 __ovld ctz(uchar2 x);
+char3 __ovld ctz(char3 x);
+uchar3 __ovld ctz(uchar3 x);
+char4 __ovld ctz(char4 x);
+uchar4 __ovld ctz(uchar4 x);
+char8 __ovld ctz(char8 x);
+uchar8 __ovld ctz(uchar8 x);
+char16 __ovld ctz(char16 x);
+uchar16 __ovld ctz(uchar16 x);
+short __ovld ctz(short x);
+ushort __ovld ctz(ushort x);
+short2 __ovld ctz(short2 x);
+ushort2 __ovld ctz(ushort2 x);
+short3 __ovld ctz(short3 x);
+ushort3 __ovld ctz(ushort3 x);
+short4 __ovld ctz(short4 x);
+ushort4 __ovld ctz(ushort4 x);
+short8 __ovld ctz(short8 x);
+ushort8 __ovld ctz(ushort8 x);
+short16 __ovld ctz(short16 x);
+ushort16 __ovld ctz(ushort16 x);
+int __ovld ctz(int x);
+uint __ovld ctz(uint x);
+int2 __ovld ctz(int2 x);
+uint2 __ovld ctz(uint2 x);
+int3 __ovld ctz(int3 x);
+uint3 __ovld ctz(uint3 x);
+int4 __ovld ctz(int4 x);
+uint4 __ovld ctz(uint4 x);
+int8 __ovld ctz(int8 x);
+uint8 __ovld ctz(uint8 x);
+int16 __ovld ctz(int16 x);
+uint16 __ovld ctz(uint16 x);
+long __ovld ctz(long x);
+ulong __ovld ctz(ulong x);
+long2 __ovld ctz(long2 x);
+ulong2 __ovld ctz(ulong2 x);
+long3 __ovld ctz(long3 x);
+ulong3 __ovld ctz(ulong3 x);
+long4 __ovld ctz(long4 x);
+ulong4 __ovld ctz(ulong4 x);
+long8 __ovld ctz(long8 x);
+ulong8 __ovld ctz(ulong8 x);
+long16 __ovld ctz(long16 x);
+ulong16 __ovld ctz(ulong16 x);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Returns mul_hi(a, b) + c.
+ */
+char __ovld __cnfn mad_hi(char a, char b, char c);
+uchar __ovld __cnfn mad_hi(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn mad_hi(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn mad_hi(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn mad_hi(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn mad_hi(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn mad_hi(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn mad_hi(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn mad_hi(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn mad_hi(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn mad_hi(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn mad_hi(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn mad_hi(short a, short b, short c);
+ushort __ovld __cnfn mad_hi(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn mad_hi(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn mad_hi(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn mad_hi(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn mad_hi(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn mad_hi(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn mad_hi(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn mad_hi(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn mad_hi(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn mad_hi(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn mad_hi(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn mad_hi(int a, int b, int c);
+uint __ovld __cnfn mad_hi(uint a, uint b, uint c);
+int2 __ovld __cnfn mad_hi(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn mad_hi(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn mad_hi(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn mad_hi(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn mad_hi(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn mad_hi(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn mad_hi(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn mad_hi(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn mad_hi(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn mad_hi(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn mad_hi(long a, long b, long c);
+ulong __ovld __cnfn mad_hi(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn mad_hi(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn mad_hi(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn mad_hi(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn mad_hi(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn mad_hi(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn mad_hi(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn mad_hi(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn mad_hi(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn mad_hi(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn mad_hi(ulong16 a, ulong16 b, ulong16 c);
+
+/**
+ * Returns a * b + c and saturates the result.
+ */
+char __ovld __cnfn mad_sat(char a, char b, char c);
+uchar __ovld __cnfn mad_sat(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn mad_sat(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn mad_sat(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn mad_sat(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn mad_sat(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn mad_sat(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn mad_sat(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn mad_sat(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn mad_sat(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn mad_sat(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn mad_sat(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn mad_sat(short a, short b, short c);
+ushort __ovld __cnfn mad_sat(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn mad_sat(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn mad_sat(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn mad_sat(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn mad_sat(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn mad_sat(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn mad_sat(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn mad_sat(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn mad_sat(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn mad_sat(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn mad_sat(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn mad_sat(int a, int b, int c);
+uint __ovld __cnfn mad_sat(uint a, uint b, uint c);
+int2 __ovld __cnfn mad_sat(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn mad_sat(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn mad_sat(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn mad_sat(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn mad_sat(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn mad_sat(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn mad_sat(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn mad_sat(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn mad_sat(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn mad_sat(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn mad_sat(long a, long b, long c);
+ulong __ovld __cnfn mad_sat(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn mad_sat(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn mad_sat(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn mad_sat(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn mad_sat(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn mad_sat(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn mad_sat(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn mad_sat(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn mad_sat(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn mad_sat(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn mad_sat(ulong16 a, ulong16 b, ulong16 c);
+
+/**
+ * Returns y if x < y, otherwise it returns x.
+ */
+char __ovld __cnfn max(char x, char y);
+uchar __ovld __cnfn max(uchar x, uchar y);
+char2 __ovld __cnfn max(char2 x, char2 y);
+uchar2 __ovld __cnfn max(uchar2 x, uchar2 y);
+char3 __ovld __cnfn max(char3 x, char3 y);
+uchar3 __ovld __cnfn max(uchar3 x, uchar3 y);
+char4 __ovld __cnfn max(char4 x, char4 y);
+uchar4 __ovld __cnfn max(uchar4 x, uchar4 y);
+char8 __ovld __cnfn max(char8 x, char8 y);
+uchar8 __ovld __cnfn max(uchar8 x, uchar8 y);
+char16 __ovld __cnfn max(char16 x, char16 y);
+uchar16 __ovld __cnfn max(uchar16 x, uchar16 y);
+short __ovld __cnfn max(short x, short y);
+ushort __ovld __cnfn max(ushort x, ushort y);
+short2 __ovld __cnfn max(short2 x, short2 y);
+ushort2 __ovld __cnfn max(ushort2 x, ushort2 y);
+short3 __ovld __cnfn max(short3 x, short3 y);
+ushort3 __ovld __cnfn max(ushort3 x, ushort3 y);
+short4 __ovld __cnfn max(short4 x, short4 y);
+ushort4 __ovld __cnfn max(ushort4 x, ushort4 y);
+short8 __ovld __cnfn max(short8 x, short8 y);
+ushort8 __ovld __cnfn max(ushort8 x, ushort8 y);
+short16 __ovld __cnfn max(short16 x, short16 y);
+ushort16 __ovld __cnfn max(ushort16 x, ushort16 y);
+int __ovld __cnfn max(int x, int y);
+uint __ovld __cnfn max(uint x, uint y);
+int2 __ovld __cnfn max(int2 x, int2 y);
+uint2 __ovld __cnfn max(uint2 x, uint2 y);
+int3 __ovld __cnfn max(int3 x, int3 y);
+uint3 __ovld __cnfn max(uint3 x, uint3 y);
+int4 __ovld __cnfn max(int4 x, int4 y);
+uint4 __ovld __cnfn max(uint4 x, uint4 y);
+int8 __ovld __cnfn max(int8 x, int8 y);
+uint8 __ovld __cnfn max(uint8 x, uint8 y);
+int16 __ovld __cnfn max(int16 x, int16 y);
+uint16 __ovld __cnfn max(uint16 x, uint16 y);
+long __ovld __cnfn max(long x, long y);
+ulong __ovld __cnfn max(ulong x, ulong y);
+long2 __ovld __cnfn max(long2 x, long2 y);
+ulong2 __ovld __cnfn max(ulong2 x, ulong2 y);
+long3 __ovld __cnfn max(long3 x, long3 y);
+ulong3 __ovld __cnfn max(ulong3 x, ulong3 y);
+long4 __ovld __cnfn max(long4 x, long4 y);
+ulong4 __ovld __cnfn max(ulong4 x, ulong4 y);
+long8 __ovld __cnfn max(long8 x, long8 y);
+ulong8 __ovld __cnfn max(ulong8 x, ulong8 y);
+long16 __ovld __cnfn max(long16 x, long16 y);
+ulong16 __ovld __cnfn max(ulong16 x, ulong16 y);
+char __ovld __cnfn max(char x, char y);
+uchar __ovld __cnfn max(uchar x, uchar y);
+char2 __ovld __cnfn max(char2 x, char y);
+uchar2 __ovld __cnfn max(uchar2 x, uchar y);
+char3 __ovld __cnfn max(char3 x, char y);
+uchar3 __ovld __cnfn max(uchar3 x, uchar y);
+char4 __ovld __cnfn max(char4 x, char y);
+uchar4 __ovld __cnfn max(uchar4 x, uchar y);
+char8 __ovld __cnfn max(char8 x, char y);
+uchar8 __ovld __cnfn max(uchar8 x, uchar y);
+char16 __ovld __cnfn max(char16 x, char y);
+uchar16 __ovld __cnfn max(uchar16 x, uchar y);
+short __ovld __cnfn max(short x, short y);
+ushort __ovld __cnfn max(ushort x, ushort y);
+short2 __ovld __cnfn max(short2 x, short y);
+ushort2 __ovld __cnfn max(ushort2 x, ushort y);
+short3 __ovld __cnfn max(short3 x, short y);
+ushort3 __ovld __cnfn max(ushort3 x, ushort y);
+short4 __ovld __cnfn max(short4 x, short y);
+ushort4 __ovld __cnfn max(ushort4 x, ushort y);
+short8 __ovld __cnfn max(short8 x, short y);
+ushort8 __ovld __cnfn max(ushort8 x, ushort y);
+short16 __ovld __cnfn max(short16 x, short y);
+ushort16 __ovld __cnfn max(ushort16 x, ushort y);
+int __ovld __cnfn max(int x, int y);
+uint __ovld __cnfn max(uint x, uint y);
+int2 __ovld __cnfn max(int2 x, int y);
+uint2 __ovld __cnfn max(uint2 x, uint y);
+int3 __ovld __cnfn max(int3 x, int y);
+uint3 __ovld __cnfn max(uint3 x, uint y);
+int4 __ovld __cnfn max(int4 x, int y);
+uint4 __ovld __cnfn max(uint4 x, uint y);
+int8 __ovld __cnfn max(int8 x, int y);
+uint8 __ovld __cnfn max(uint8 x, uint y);
+int16 __ovld __cnfn max(int16 x, int y);
+uint16 __ovld __cnfn max(uint16 x, uint y);
+long __ovld __cnfn max(long x, long y);
+ulong __ovld __cnfn max(ulong x, ulong y);
+long2 __ovld __cnfn max(long2 x, long y);
+ulong2 __ovld __cnfn max(ulong2 x, ulong y);
+long3 __ovld __cnfn max(long3 x, long y);
+ulong3 __ovld __cnfn max(ulong3 x, ulong y);
+long4 __ovld __cnfn max(long4 x, long y);
+ulong4 __ovld __cnfn max(ulong4 x, ulong y);
+long8 __ovld __cnfn max(long8 x, long y);
+ulong8 __ovld __cnfn max(ulong8 x, ulong y);
+long16 __ovld __cnfn max(long16 x, long y);
+ulong16 __ovld __cnfn max(ulong16 x, ulong y);
+
+/**
+ * Returns y if y < x, otherwise it returns x.
+ */
+char __ovld __cnfn min(char x, char y);
+uchar __ovld __cnfn min(uchar x, uchar y);
+char2 __ovld __cnfn min(char2 x, char2 y);
+uchar2 __ovld __cnfn min(uchar2 x, uchar2 y);
+char3 __ovld __cnfn min(char3 x, char3 y);
+uchar3 __ovld __cnfn min(uchar3 x, uchar3 y);
+char4 __ovld __cnfn min(char4 x, char4 y);
+uchar4 __ovld __cnfn min(uchar4 x, uchar4 y);
+char8 __ovld __cnfn min(char8 x, char8 y);
+uchar8 __ovld __cnfn min(uchar8 x, uchar8 y);
+char16 __ovld __cnfn min(char16 x, char16 y);
+uchar16 __ovld __cnfn min(uchar16 x, uchar16 y);
+short __ovld __cnfn min(short x, short y);
+ushort __ovld __cnfn min(ushort x, ushort y);
+short2 __ovld __cnfn min(short2 x, short2 y);
+ushort2 __ovld __cnfn min(ushort2 x, ushort2 y);
+short3 __ovld __cnfn min(short3 x, short3 y);
+ushort3 __ovld __cnfn min(ushort3 x, ushort3 y);
+short4 __ovld __cnfn min(short4 x, short4 y);
+ushort4 __ovld __cnfn min(ushort4 x, ushort4 y);
+short8 __ovld __cnfn min(short8 x, short8 y);
+ushort8 __ovld __cnfn min(ushort8 x, ushort8 y);
+short16 __ovld __cnfn min(short16 x, short16 y);
+ushort16 __ovld __cnfn min(ushort16 x, ushort16 y);
+int __ovld __cnfn min(int x, int y);
+uint __ovld __cnfn min(uint x, uint y);
+int2 __ovld __cnfn min(int2 x, int2 y);
+uint2 __ovld __cnfn min(uint2 x, uint2 y);
+int3 __ovld __cnfn min(int3 x, int3 y);
+uint3 __ovld __cnfn min(uint3 x, uint3 y);
+int4 __ovld __cnfn min(int4 x, int4 y);
+uint4 __ovld __cnfn min(uint4 x, uint4 y);
+int8 __ovld __cnfn min(int8 x, int8 y);
+uint8 __ovld __cnfn min(uint8 x, uint8 y);
+int16 __ovld __cnfn min(int16 x, int16 y);
+uint16 __ovld __cnfn min(uint16 x, uint16 y);
+long __ovld __cnfn min(long x, long y);
+ulong __ovld __cnfn min(ulong x, ulong y);
+long2 __ovld __cnfn min(long2 x, long2 y);
+ulong2 __ovld __cnfn min(ulong2 x, ulong2 y);
+long3 __ovld __cnfn min(long3 x, long3 y);
+ulong3 __ovld __cnfn min(ulong3 x, ulong3 y);
+long4 __ovld __cnfn min(long4 x, long4 y);
+ulong4 __ovld __cnfn min(ulong4 x, ulong4 y);
+long8 __ovld __cnfn min(long8 x, long8 y);
+ulong8 __ovld __cnfn min(ulong8 x, ulong8 y);
+long16 __ovld __cnfn min(long16 x, long16 y);
+ulong16 __ovld __cnfn min(ulong16 x, ulong16 y);
+char __ovld __cnfn min(char x, char y);
+uchar __ovld __cnfn min(uchar x, uchar y);
+char2 __ovld __cnfn min(char2 x, char y);
+uchar2 __ovld __cnfn min(uchar2 x, uchar y);
+char3 __ovld __cnfn min(char3 x, char y);
+uchar3 __ovld __cnfn min(uchar3 x, uchar y);
+char4 __ovld __cnfn min(char4 x, char y);
+uchar4 __ovld __cnfn min(uchar4 x, uchar y);
+char8 __ovld __cnfn min(char8 x, char y);
+uchar8 __ovld __cnfn min(uchar8 x, uchar y);
+char16 __ovld __cnfn min(char16 x, char y);
+uchar16 __ovld __cnfn min(uchar16 x, uchar y);
+short __ovld __cnfn min(short x, short y);
+ushort __ovld __cnfn min(ushort x, ushort y);
+short2 __ovld __cnfn min(short2 x, short y);
+ushort2 __ovld __cnfn min(ushort2 x, ushort y);
+short3 __ovld __cnfn min(short3 x, short y);
+ushort3 __ovld __cnfn min(ushort3 x, ushort y);
+short4 __ovld __cnfn min(short4 x, short y);
+ushort4 __ovld __cnfn min(ushort4 x, ushort y);
+short8 __ovld __cnfn min(short8 x, short y);
+ushort8 __ovld __cnfn min(ushort8 x, ushort y);
+short16 __ovld __cnfn min(short16 x, short y);
+ushort16 __ovld __cnfn min(ushort16 x, ushort y);
+int __ovld __cnfn min(int x, int y);
+uint __ovld __cnfn min(uint x, uint y);
+int2 __ovld __cnfn min(int2 x, int y);
+uint2 __ovld __cnfn min(uint2 x, uint y);
+int3 __ovld __cnfn min(int3 x, int y);
+uint3 __ovld __cnfn min(uint3 x, uint y);
+int4 __ovld __cnfn min(int4 x, int y);
+uint4 __ovld __cnfn min(uint4 x, uint y);
+int8 __ovld __cnfn min(int8 x, int y);
+uint8 __ovld __cnfn min(uint8 x, uint y);
+int16 __ovld __cnfn min(int16 x, int y);
+uint16 __ovld __cnfn min(uint16 x, uint y);
+long __ovld __cnfn min(long x, long y);
+ulong __ovld __cnfn min(ulong x, ulong y);
+long2 __ovld __cnfn min(long2 x, long y);
+ulong2 __ovld __cnfn min(ulong2 x, ulong y);
+long3 __ovld __cnfn min(long3 x, long y);
+ulong3 __ovld __cnfn min(ulong3 x, ulong y);
+long4 __ovld __cnfn min(long4 x, long y);
+ulong4 __ovld __cnfn min(ulong4 x, ulong y);
+long8 __ovld __cnfn min(long8 x, long y);
+ulong8 __ovld __cnfn min(ulong8 x, ulong y);
+long16 __ovld __cnfn min(long16 x, long y);
+ulong16 __ovld __cnfn min(ulong16 x, ulong y);
+
+/**
+ * Computes x * y and returns the high half of the
+ * product of x and y.
+ */
+char __ovld __cnfn mul_hi(char x, char y);
+uchar __ovld __cnfn mul_hi(uchar x, uchar y);
+char2 __ovld __cnfn mul_hi(char2 x, char2 y);
+uchar2 __ovld __cnfn mul_hi(uchar2 x, uchar2 y);
+char3 __ovld __cnfn mul_hi(char3 x, char3 y);
+uchar3 __ovld __cnfn mul_hi(uchar3 x, uchar3 y);
+char4 __ovld __cnfn mul_hi(char4 x, char4 y);
+uchar4 __ovld __cnfn mul_hi(uchar4 x, uchar4 y);
+char8 __ovld __cnfn mul_hi(char8 x, char8 y);
+uchar8 __ovld __cnfn mul_hi(uchar8 x, uchar8 y);
+char16 __ovld __cnfn mul_hi(char16 x, char16 y);
+uchar16 __ovld __cnfn mul_hi(uchar16 x, uchar16 y);
+short __ovld __cnfn mul_hi(short x, short y);
+ushort __ovld __cnfn mul_hi(ushort x, ushort y);
+short2 __ovld __cnfn mul_hi(short2 x, short2 y);
+ushort2 __ovld __cnfn mul_hi(ushort2 x, ushort2 y);
+short3 __ovld __cnfn mul_hi(short3 x, short3 y);
+ushort3 __ovld __cnfn mul_hi(ushort3 x, ushort3 y);
+short4 __ovld __cnfn mul_hi(short4 x, short4 y);
+ushort4 __ovld __cnfn mul_hi(ushort4 x, ushort4 y);
+short8 __ovld __cnfn mul_hi(short8 x, short8 y);
+ushort8 __ovld __cnfn mul_hi(ushort8 x, ushort8 y);
+short16 __ovld __cnfn mul_hi(short16 x, short16 y);
+ushort16 __ovld __cnfn mul_hi(ushort16 x, ushort16 y);
+int __ovld __cnfn mul_hi(int x, int y);
+uint __ovld __cnfn mul_hi(uint x, uint y);
+int2 __ovld __cnfn mul_hi(int2 x, int2 y);
+uint2 __ovld __cnfn mul_hi(uint2 x, uint2 y);
+int3 __ovld __cnfn mul_hi(int3 x, int3 y);
+uint3 __ovld __cnfn mul_hi(uint3 x, uint3 y);
+int4 __ovld __cnfn mul_hi(int4 x, int4 y);
+uint4 __ovld __cnfn mul_hi(uint4 x, uint4 y);
+int8 __ovld __cnfn mul_hi(int8 x, int8 y);
+uint8 __ovld __cnfn mul_hi(uint8 x, uint8 y);
+int16 __ovld __cnfn mul_hi(int16 x, int16 y);
+uint16 __ovld __cnfn mul_hi(uint16 x, uint16 y);
+long __ovld __cnfn mul_hi(long x, long y);
+ulong __ovld __cnfn mul_hi(ulong x, ulong y);
+long2 __ovld __cnfn mul_hi(long2 x, long2 y);
+ulong2 __ovld __cnfn mul_hi(ulong2 x, ulong2 y);
+long3 __ovld __cnfn mul_hi(long3 x, long3 y);
+ulong3 __ovld __cnfn mul_hi(ulong3 x, ulong3 y);
+long4 __ovld __cnfn mul_hi(long4 x, long4 y);
+ulong4 __ovld __cnfn mul_hi(ulong4 x, ulong4 y);
+long8 __ovld __cnfn mul_hi(long8 x, long8 y);
+ulong8 __ovld __cnfn mul_hi(ulong8 x, ulong8 y);
+long16 __ovld __cnfn mul_hi(long16 x, long16 y);
+ulong16 __ovld __cnfn mul_hi(ulong16 x, ulong16 y);
+
+/**
+ * For each element in v, the bits are shifted left by
+ * the number of bits given by the corresponding
+ * element in i (subject to usual shift modulo rules
+ * described in section 6.3). Bits shifted off the left
+ * side of the element are shifted back in from the
+ * right.
+ */
+char __ovld __cnfn rotate(char v, char i);
+uchar __ovld __cnfn rotate(uchar v, uchar i);
+char2 __ovld __cnfn rotate(char2 v, char2 i);
+uchar2 __ovld __cnfn rotate(uchar2 v, uchar2 i);
+char3 __ovld __cnfn rotate(char3 v, char3 i);
+uchar3 __ovld __cnfn rotate(uchar3 v, uchar3 i);
+char4 __ovld __cnfn rotate(char4 v, char4 i);
+uchar4 __ovld __cnfn rotate(uchar4 v, uchar4 i);
+char8 __ovld __cnfn rotate(char8 v, char8 i);
+uchar8 __ovld __cnfn rotate(uchar8 v, uchar8 i);
+char16 __ovld __cnfn rotate(char16 v, char16 i);
+uchar16 __ovld __cnfn rotate(uchar16 v, uchar16 i);
+short __ovld __cnfn rotate(short v, short i);
+ushort __ovld __cnfn rotate(ushort v, ushort i);
+short2 __ovld __cnfn rotate(short2 v, short2 i);
+ushort2 __ovld __cnfn rotate(ushort2 v, ushort2 i);
+short3 __ovld __cnfn rotate(short3 v, short3 i);
+ushort3 __ovld __cnfn rotate(ushort3 v, ushort3 i);
+short4 __ovld __cnfn rotate(short4 v, short4 i);
+ushort4 __ovld __cnfn rotate(ushort4 v, ushort4 i);
+short8 __ovld __cnfn rotate(short8 v, short8 i);
+ushort8 __ovld __cnfn rotate(ushort8 v, ushort8 i);
+short16 __ovld __cnfn rotate(short16 v, short16 i);
+ushort16 __ovld __cnfn rotate(ushort16 v, ushort16 i);
+int __ovld __cnfn rotate(int v, int i);
+uint __ovld __cnfn rotate(uint v, uint i);
+int2 __ovld __cnfn rotate(int2 v, int2 i);
+uint2 __ovld __cnfn rotate(uint2 v, uint2 i);
+int3 __ovld __cnfn rotate(int3 v, int3 i);
+uint3 __ovld __cnfn rotate(uint3 v, uint3 i);
+int4 __ovld __cnfn rotate(int4 v, int4 i);
+uint4 __ovld __cnfn rotate(uint4 v, uint4 i);
+int8 __ovld __cnfn rotate(int8 v, int8 i);
+uint8 __ovld __cnfn rotate(uint8 v, uint8 i);
+int16 __ovld __cnfn rotate(int16 v, int16 i);
+uint16 __ovld __cnfn rotate(uint16 v, uint16 i);
+long __ovld __cnfn rotate(long v, long i);
+ulong __ovld __cnfn rotate(ulong v, ulong i);
+long2 __ovld __cnfn rotate(long2 v, long2 i);
+ulong2 __ovld __cnfn rotate(ulong2 v, ulong2 i);
+long3 __ovld __cnfn rotate(long3 v, long3 i);
+ulong3 __ovld __cnfn rotate(ulong3 v, ulong3 i);
+long4 __ovld __cnfn rotate(long4 v, long4 i);
+ulong4 __ovld __cnfn rotate(ulong4 v, ulong4 i);
+long8 __ovld __cnfn rotate(long8 v, long8 i);
+ulong8 __ovld __cnfn rotate(ulong8 v, ulong8 i);
+long16 __ovld __cnfn rotate(long16 v, long16 i);
+ulong16 __ovld __cnfn rotate(ulong16 v, ulong16 i);
+
+/**
+ * Returns x - y and saturates the result.
+ */
+char __ovld __cnfn sub_sat(char x, char y);
+uchar __ovld __cnfn sub_sat(uchar x, uchar y);
+char2 __ovld __cnfn sub_sat(char2 x, char2 y);
+uchar2 __ovld __cnfn sub_sat(uchar2 x, uchar2 y);
+char3 __ovld __cnfn sub_sat(char3 x, char3 y);
+uchar3 __ovld __cnfn sub_sat(uchar3 x, uchar3 y);
+char4 __ovld __cnfn sub_sat(char4 x, char4 y);
+uchar4 __ovld __cnfn sub_sat(uchar4 x, uchar4 y);
+char8 __ovld __cnfn sub_sat(char8 x, char8 y);
+uchar8 __ovld __cnfn sub_sat(uchar8 x, uchar8 y);
+char16 __ovld __cnfn sub_sat(char16 x, char16 y);
+uchar16 __ovld __cnfn sub_sat(uchar16 x, uchar16 y);
+short __ovld __cnfn sub_sat(short x, short y);
+ushort __ovld __cnfn sub_sat(ushort x, ushort y);
+short2 __ovld __cnfn sub_sat(short2 x, short2 y);
+ushort2 __ovld __cnfn sub_sat(ushort2 x, ushort2 y);
+short3 __ovld __cnfn sub_sat(short3 x, short3 y);
+ushort3 __ovld __cnfn sub_sat(ushort3 x, ushort3 y);
+short4 __ovld __cnfn sub_sat(short4 x, short4 y);
+ushort4 __ovld __cnfn sub_sat(ushort4 x, ushort4 y);
+short8 __ovld __cnfn sub_sat(short8 x, short8 y);
+ushort8 __ovld __cnfn sub_sat(ushort8 x, ushort8 y);
+short16 __ovld __cnfn sub_sat(short16 x, short16 y);
+ushort16 __ovld __cnfn sub_sat(ushort16 x, ushort16 y);
+int __ovld __cnfn sub_sat(int x, int y);
+uint __ovld __cnfn sub_sat(uint x, uint y);
+int2 __ovld __cnfn sub_sat(int2 x, int2 y);
+uint2 __ovld __cnfn sub_sat(uint2 x, uint2 y);
+int3 __ovld __cnfn sub_sat(int3 x, int3 y);
+uint3 __ovld __cnfn sub_sat(uint3 x, uint3 y);
+int4 __ovld __cnfn sub_sat(int4 x, int4 y);
+uint4 __ovld __cnfn sub_sat(uint4 x, uint4 y);
+int8 __ovld __cnfn sub_sat(int8 x, int8 y);
+uint8 __ovld __cnfn sub_sat(uint8 x, uint8 y);
+int16 __ovld __cnfn sub_sat(int16 x, int16 y);
+uint16 __ovld __cnfn sub_sat(uint16 x, uint16 y);
+long __ovld __cnfn sub_sat(long x, long y);
+ulong __ovld __cnfn sub_sat(ulong x, ulong y);
+long2 __ovld __cnfn sub_sat(long2 x, long2 y);
+ulong2 __ovld __cnfn sub_sat(ulong2 x, ulong2 y);
+long3 __ovld __cnfn sub_sat(long3 x, long3 y);
+ulong3 __ovld __cnfn sub_sat(ulong3 x, ulong3 y);
+long4 __ovld __cnfn sub_sat(long4 x, long4 y);
+ulong4 __ovld __cnfn sub_sat(ulong4 x, ulong4 y);
+long8 __ovld __cnfn sub_sat(long8 x, long8 y);
+ulong8 __ovld __cnfn sub_sat(ulong8 x, ulong8 y);
+long16 __ovld __cnfn sub_sat(long16 x, long16 y);
+ulong16 __ovld __cnfn sub_sat(ulong16 x, ulong16 y);
+
+/**
+ * result[i] = ((short)hi[i] << 8) | lo[i]
+ * result[i] = ((ushort)hi[i] << 8) | lo[i]
+ */
+short __ovld __cnfn upsample(char hi, uchar lo);
+ushort __ovld __cnfn upsample(uchar hi, uchar lo);
+short2 __ovld __cnfn upsample(char2 hi, uchar2 lo);
+short3 __ovld __cnfn upsample(char3 hi, uchar3 lo);
+short4 __ovld __cnfn upsample(char4 hi, uchar4 lo);
+short8 __ovld __cnfn upsample(char8 hi, uchar8 lo);
+short16 __ovld __cnfn upsample(char16 hi, uchar16 lo);
+ushort2 __ovld __cnfn upsample(uchar2 hi, uchar2 lo);
+ushort3 __ovld __cnfn upsample(uchar3 hi, uchar3 lo);
+ushort4 __ovld __cnfn upsample(uchar4 hi, uchar4 lo);
+ushort8 __ovld __cnfn upsample(uchar8 hi, uchar8 lo);
+ushort16 __ovld __cnfn upsample(uchar16 hi, uchar16 lo);
+
+/**
+ * result[i] = ((int)hi[i] << 16) | lo[i]
+ * result[i] = ((uint)hi[i] << 16) | lo[i]
+ */
+int __ovld __cnfn upsample(short hi, ushort lo);
+uint __ovld __cnfn upsample(ushort hi, ushort lo);
+int2 __ovld __cnfn upsample(short2 hi, ushort2 lo);
+int3 __ovld __cnfn upsample(short3 hi, ushort3 lo);
+int4 __ovld __cnfn upsample(short4 hi, ushort4 lo);
+int8 __ovld __cnfn upsample(short8 hi, ushort8 lo);
+int16 __ovld __cnfn upsample(short16 hi, ushort16 lo);
+uint2 __ovld __cnfn upsample(ushort2 hi, ushort2 lo);
+uint3 __ovld __cnfn upsample(ushort3 hi, ushort3 lo);
+uint4 __ovld __cnfn upsample(ushort4 hi, ushort4 lo);
+uint8 __ovld __cnfn upsample(ushort8 hi, ushort8 lo);
+uint16 __ovld __cnfn upsample(ushort16 hi, ushort16 lo);
+/**
+ * result[i] = ((long)hi[i] << 32) | lo[i]
+ * result[i] = ((ulong)hi[i] << 32) | lo[i]
+ */
+long __ovld __cnfn upsample(int hi, uint lo);
+ulong __ovld __cnfn upsample(uint hi, uint lo);
+long2 __ovld __cnfn upsample(int2 hi, uint2 lo);
+long3 __ovld __cnfn upsample(int3 hi, uint3 lo);
+long4 __ovld __cnfn upsample(int4 hi, uint4 lo);
+long8 __ovld __cnfn upsample(int8 hi, uint8 lo);
+long16 __ovld __cnfn upsample(int16 hi, uint16 lo);
+ulong2 __ovld __cnfn upsample(uint2 hi, uint2 lo);
+ulong3 __ovld __cnfn upsample(uint3 hi, uint3 lo);
+ulong4 __ovld __cnfn upsample(uint4 hi, uint4 lo);
+ulong8 __ovld __cnfn upsample(uint8 hi, uint8 lo);
+ulong16 __ovld __cnfn upsample(uint16 hi, uint16 lo);
+
+/*
+ * popcount(x): returns the number of set bit in x
+ */
+char __ovld __cnfn popcount(char x);
+uchar __ovld __cnfn popcount(uchar x);
+char2 __ovld __cnfn popcount(char2 x);
+uchar2 __ovld __cnfn popcount(uchar2 x);
+char3 __ovld __cnfn popcount(char3 x);
+uchar3 __ovld __cnfn popcount(uchar3 x);
+char4 __ovld __cnfn popcount(char4 x);
+uchar4 __ovld __cnfn popcount(uchar4 x);
+char8 __ovld __cnfn popcount(char8 x);
+uchar8 __ovld __cnfn popcount(uchar8 x);
+char16 __ovld __cnfn popcount(char16 x);
+uchar16 __ovld __cnfn popcount(uchar16 x);
+short __ovld __cnfn popcount(short x);
+ushort __ovld __cnfn popcount(ushort x);
+short2 __ovld __cnfn popcount(short2 x);
+ushort2 __ovld __cnfn popcount(ushort2 x);
+short3 __ovld __cnfn popcount(short3 x);
+ushort3 __ovld __cnfn popcount(ushort3 x);
+short4 __ovld __cnfn popcount(short4 x);
+ushort4 __ovld __cnfn popcount(ushort4 x);
+short8 __ovld __cnfn popcount(short8 x);
+ushort8 __ovld __cnfn popcount(ushort8 x);
+short16 __ovld __cnfn popcount(short16 x);
+ushort16 __ovld __cnfn popcount(ushort16 x);
+int __ovld __cnfn popcount(int x);
+uint __ovld __cnfn popcount(uint x);
+int2 __ovld __cnfn popcount(int2 x);
+uint2 __ovld __cnfn popcount(uint2 x);
+int3 __ovld __cnfn popcount(int3 x);
+uint3 __ovld __cnfn popcount(uint3 x);
+int4 __ovld __cnfn popcount(int4 x);
+uint4 __ovld __cnfn popcount(uint4 x);
+int8 __ovld __cnfn popcount(int8 x);
+uint8 __ovld __cnfn popcount(uint8 x);
+int16 __ovld __cnfn popcount(int16 x);
+uint16 __ovld __cnfn popcount(uint16 x);
+long __ovld __cnfn popcount(long x);
+ulong __ovld __cnfn popcount(ulong x);
+long2 __ovld __cnfn popcount(long2 x);
+ulong2 __ovld __cnfn popcount(ulong2 x);
+long3 __ovld __cnfn popcount(long3 x);
+ulong3 __ovld __cnfn popcount(ulong3 x);
+long4 __ovld __cnfn popcount(long4 x);
+ulong4 __ovld __cnfn popcount(ulong4 x);
+long8 __ovld __cnfn popcount(long8 x);
+ulong8 __ovld __cnfn popcount(ulong8 x);
+long16 __ovld __cnfn popcount(long16 x);
+ulong16 __ovld __cnfn popcount(ulong16 x);
+
+/**
+ * Multiply two 24-bit integer values x and y and add
+ * the 32-bit integer result to the 32-bit integer z.
+ * Refer to definition of mul24 to see how the 24-bit
+ * integer multiplication is performed.
+ */
+int __ovld __cnfn mad24(int x, int y, int z);
+uint __ovld __cnfn mad24(uint x, uint y, uint z);
+int2 __ovld __cnfn mad24(int2 x, int2 y, int2 z);
+uint2 __ovld __cnfn mad24(uint2 x, uint2 y, uint2 z);
+int3 __ovld __cnfn mad24(int3 x, int3 y, int3 z);
+uint3 __ovld __cnfn mad24(uint3 x, uint3 y, uint3 z);
+int4 __ovld __cnfn mad24(int4 x, int4 y, int4 z);
+uint4 __ovld __cnfn mad24(uint4 x, uint4 y, uint4 z);
+int8 __ovld __cnfn mad24(int8 x, int8 y, int8 z);
+uint8 __ovld __cnfn mad24(uint8 x, uint8 y, uint8 z);
+int16 __ovld __cnfn mad24(int16 x, int16 y, int16 z);
+uint16 __ovld __cnfn mad24(uint16 x, uint16 y, uint16 z);
+
+/**
+ * Multiply two 24-bit integer values x and y. x and y
+ * are 32-bit integers but only the low 24-bits are used
+ * to perform the multiplication. mul24 should only
+ * be used when values in x and y are in the range [-
+ * 2^23, 2^23-1] if x and y are signed integers and in the
+ * range [0, 2^24-1] if x and y are unsigned integers. If
+ * x and y are not in this range, the multiplication
+ * result is implementation-defined.
+ */
+int __ovld __cnfn mul24(int x, int y);
+uint __ovld __cnfn mul24(uint x, uint y);
+int2 __ovld __cnfn mul24(int2 x, int2 y);
+uint2 __ovld __cnfn mul24(uint2 x, uint2 y);
+int3 __ovld __cnfn mul24(int3 x, int3 y);
+uint3 __ovld __cnfn mul24(uint3 x, uint3 y);
+int4 __ovld __cnfn mul24(int4 x, int4 y);
+uint4 __ovld __cnfn mul24(uint4 x, uint4 y);
+int8 __ovld __cnfn mul24(int8 x, int8 y);
+uint8 __ovld __cnfn mul24(uint8 x, uint8 y);
+int16 __ovld __cnfn mul24(int16 x, int16 y);
+uint16 __ovld __cnfn mul24(uint16 x, uint16 y);
+
+// OpenCL v1.1 s6.11.4, v1.2 s6.12.4, v2.0 s6.13.4 - Common Functions
+
+/**
+ * Returns fmin(fmax(x, minval), maxval).
+ * Results are undefined if minval > maxval.
+ */
+float __ovld __cnfn clamp(float x, float minval, float maxval);
+float2 __ovld __cnfn clamp(float2 x, float2 minval, float2 maxval);
+float3 __ovld __cnfn clamp(float3 x, float3 minval, float3 maxval);
+float4 __ovld __cnfn clamp(float4 x, float4 minval, float4 maxval);
+float8 __ovld __cnfn clamp(float8 x, float8 minval, float8 maxval);
+float16 __ovld __cnfn clamp(float16 x, float16 minval, float16 maxval);
+float2 __ovld __cnfn clamp(float2 x, float minval, float maxval);
+float3 __ovld __cnfn clamp(float3 x, float minval, float maxval);
+float4 __ovld __cnfn clamp(float4 x, float minval, float maxval);
+float8 __ovld __cnfn clamp(float8 x, float minval, float maxval);
+float16 __ovld __cnfn clamp(float16 x, float minval, float maxval);
+#ifdef cl_khr_fp64
+double __ovld __cnfn clamp(double x, double minval, double maxval);
+double2 __ovld __cnfn clamp(double2 x, double2 minval, double2 maxval);
+double3 __ovld __cnfn clamp(double3 x, double3 minval, double3 maxval);
+double4 __ovld __cnfn clamp(double4 x, double4 minval, double4 maxval);
+double8 __ovld __cnfn clamp(double8 x, double8 minval, double8 maxval);
+double16 __ovld __cnfn clamp(double16 x, double16 minval, double16 maxval);
+double2 __ovld __cnfn clamp(double2 x, double minval, double maxval);
+double3 __ovld __cnfn clamp(double3 x, double minval, double maxval);
+double4 __ovld __cnfn clamp(double4 x, double minval, double maxval);
+double8 __ovld __cnfn clamp(double8 x, double minval, double maxval);
+double16 __ovld __cnfn clamp(double16 x, double minval, double maxval);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn clamp(half x, half minval, half maxval);
+half2 __ovld __cnfn clamp(half2 x, half2 minval, half2 maxval);
+half3 __ovld __cnfn clamp(half3 x, half3 minval, half3 maxval);
+half4 __ovld __cnfn clamp(half4 x, half4 minval, half4 maxval);
+half8 __ovld __cnfn clamp(half8 x, half8 minval, half8 maxval);
+half16 __ovld __cnfn clamp(half16 x, half16 minval, half16 maxval);
+half2 __ovld __cnfn clamp(half2 x, half minval, half maxval);
+half3 __ovld __cnfn clamp(half3 x, half minval, half maxval);
+half4 __ovld __cnfn clamp(half4 x, half minval, half maxval);
+half8 __ovld __cnfn clamp(half8 x, half minval, half maxval);
+half16 __ovld __cnfn clamp(half16 x, half minval, half maxval);
+#endif //cl_khr_fp16
+
+/**
+ * Converts radians to degrees, i.e. (180 / PI) *
+ * radians.
+ */
+float __ovld __cnfn degrees(float radians);
+float2 __ovld __cnfn degrees(float2 radians);
+float3 __ovld __cnfn degrees(float3 radians);
+float4 __ovld __cnfn degrees(float4 radians);
+float8 __ovld __cnfn degrees(float8 radians);
+float16 __ovld __cnfn degrees(float16 radians);
+#ifdef cl_khr_fp64
+double __ovld __cnfn degrees(double radians);
+double2 __ovld __cnfn degrees(double2 radians);
+double3 __ovld __cnfn degrees(double3 radians);
+double4 __ovld __cnfn degrees(double4 radians);
+double8 __ovld __cnfn degrees(double8 radians);
+double16 __ovld __cnfn degrees(double16 radians);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn degrees(half radians);
+half2 __ovld __cnfn degrees(half2 radians);
+half3 __ovld __cnfn degrees(half3 radians);
+half4 __ovld __cnfn degrees(half4 radians);
+half8 __ovld __cnfn degrees(half8 radians);
+half16 __ovld __cnfn degrees(half16 radians);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if x < y, otherwise it returns x. If x and y
+ * are infinite or NaN, the return values are undefined.
+ */
+float __ovld __cnfn max(float x, float y);
+float2 __ovld __cnfn max(float2 x, float2 y);
+float3 __ovld __cnfn max(float3 x, float3 y);
+float4 __ovld __cnfn max(float4 x, float4 y);
+float8 __ovld __cnfn max(float8 x, float8 y);
+float16 __ovld __cnfn max(float16 x, float16 y);
+float2 __ovld __cnfn max(float2 x, float y);
+float3 __ovld __cnfn max(float3 x, float y);
+float4 __ovld __cnfn max(float4 x, float y);
+float8 __ovld __cnfn max(float8 x, float y);
+float16 __ovld __cnfn max(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn max(double x, double y);
+double2 __ovld __cnfn max(double2 x, double2 y);
+double3 __ovld __cnfn max(double3 x, double3 y);
+double4 __ovld __cnfn max(double4 x, double4 y);
+double8 __ovld __cnfn max(double8 x, double8 y);
+double16 __ovld __cnfn max(double16 x, double16 y);
+double2 __ovld __cnfn max(double2 x, double y);
+double3 __ovld __cnfn max(double3 x, double y);
+double4 __ovld __cnfn max(double4 x, double y);
+double8 __ovld __cnfn max(double8 x, double y);
+double16 __ovld __cnfn max(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn max(half x, half y);
+half2 __ovld __cnfn max(half2 x, half2 y);
+half3 __ovld __cnfn max(half3 x, half3 y);
+half4 __ovld __cnfn max(half4 x, half4 y);
+half8 __ovld __cnfn max(half8 x, half8 y);
+half16 __ovld __cnfn max(half16 x, half16 y);
+half2 __ovld __cnfn max(half2 x, half y);
+half3 __ovld __cnfn max(half3 x, half y);
+half4 __ovld __cnfn max(half4 x, half y);
+half8 __ovld __cnfn max(half8 x, half y);
+half16 __ovld __cnfn max(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns y if y < x, otherwise it returns x. If x and y
+ * are infinite or NaN, the return values are undefined.
+ */
+float __ovld __cnfn min(float x, float y);
+float2 __ovld __cnfn min(float2 x, float2 y);
+float3 __ovld __cnfn min(float3 x, float3 y);
+float4 __ovld __cnfn min(float4 x, float4 y);
+float8 __ovld __cnfn min(float8 x, float8 y);
+float16 __ovld __cnfn min(float16 x, float16 y);
+float2 __ovld __cnfn min(float2 x, float y);
+float3 __ovld __cnfn min(float3 x, float y);
+float4 __ovld __cnfn min(float4 x, float y);
+float8 __ovld __cnfn min(float8 x, float y);
+float16 __ovld __cnfn min(float16 x, float y);
+#ifdef cl_khr_fp64
+double __ovld __cnfn min(double x, double y);
+double2 __ovld __cnfn min(double2 x, double2 y);
+double3 __ovld __cnfn min(double3 x, double3 y);
+double4 __ovld __cnfn min(double4 x, double4 y);
+double8 __ovld __cnfn min(double8 x, double8 y);
+double16 __ovld __cnfn min(double16 x, double16 y);
+double2 __ovld __cnfn min(double2 x, double y);
+double3 __ovld __cnfn min(double3 x, double y);
+double4 __ovld __cnfn min(double4 x, double y);
+double8 __ovld __cnfn min(double8 x, double y);
+double16 __ovld __cnfn min(double16 x, double y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn min(half x, half y);
+half2 __ovld __cnfn min(half2 x, half2 y);
+half3 __ovld __cnfn min(half3 x, half3 y);
+half4 __ovld __cnfn min(half4 x, half4 y);
+half8 __ovld __cnfn min(half8 x, half8 y);
+half16 __ovld __cnfn min(half16 x, half16 y);
+half2 __ovld __cnfn min(half2 x, half y);
+half3 __ovld __cnfn min(half3 x, half y);
+half4 __ovld __cnfn min(half4 x, half y);
+half8 __ovld __cnfn min(half8 x, half y);
+half16 __ovld __cnfn min(half16 x, half y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the linear blend of x & y implemented as:
+ * x + (y - x) * a
+ * a must be a value in the range 0.0 ... 1.0. If a is not
+ * in the range 0.0 ... 1.0, the return values are
+ * undefined.
+ */
+float __ovld __cnfn mix(float x, float y, float a);
+float2 __ovld __cnfn mix(float2 x, float2 y, float2 a);
+float3 __ovld __cnfn mix(float3 x, float3 y, float3 a);
+float4 __ovld __cnfn mix(float4 x, float4 y, float4 a);
+float8 __ovld __cnfn mix(float8 x, float8 y, float8 a);
+float16 __ovld __cnfn mix(float16 x, float16 y, float16 a);
+float2 __ovld __cnfn mix(float2 x, float2 y, float a);
+float3 __ovld __cnfn mix(float3 x, float3 y, float a);
+float4 __ovld __cnfn mix(float4 x, float4 y, float a);
+float8 __ovld __cnfn mix(float8 x, float8 y, float a);
+float16 __ovld __cnfn mix(float16 x, float16 y, float a);
+#ifdef cl_khr_fp64
+double __ovld __cnfn mix(double x, double y, double a);
+double2 __ovld __cnfn mix(double2 x, double2 y, double2 a);
+double3 __ovld __cnfn mix(double3 x, double3 y, double3 a);
+double4 __ovld __cnfn mix(double4 x, double4 y, double4 a);
+double8 __ovld __cnfn mix(double8 x, double8 y, double8 a);
+double16 __ovld __cnfn mix(double16 x, double16 y, double16 a);
+double2 __ovld __cnfn mix(double2 x, double2 y, double a);
+double3 __ovld __cnfn mix(double3 x, double3 y, double a);
+double4 __ovld __cnfn mix(double4 x, double4 y, double a);
+double8 __ovld __cnfn mix(double8 x, double8 y, double a);
+double16 __ovld __cnfn mix(double16 x, double16 y, double a);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn mix(half x, half y, half a);
+half2 __ovld __cnfn mix(half2 x, half2 y, half2 a);
+half3 __ovld __cnfn mix(half3 x, half3 y, half3 a);
+half4 __ovld __cnfn mix(half4 x, half4 y, half4 a);
+half8 __ovld __cnfn mix(half8 x, half8 y, half8 a);
+half16 __ovld __cnfn mix(half16 x, half16 y, half16 a);
+half2 __ovld __cnfn mix(half2 x, half2 y, half a);
+half3 __ovld __cnfn mix(half3 x, half3 y, half a);
+half4 __ovld __cnfn mix(half4 x, half4 y, half a);
+half8 __ovld __cnfn mix(half8 x, half8 y, half a);
+half16 __ovld __cnfn mix(half16 x, half16 y, half a);
+#endif //cl_khr_fp16
+
+/**
+ * Converts degrees to radians, i.e. (PI / 180) *
+ * degrees.
+ */
+float __ovld __cnfn radians(float degrees);
+float2 __ovld __cnfn radians(float2 degrees);
+float3 __ovld __cnfn radians(float3 degrees);
+float4 __ovld __cnfn radians(float4 degrees);
+float8 __ovld __cnfn radians(float8 degrees);
+float16 __ovld __cnfn radians(float16 degrees);
+#ifdef cl_khr_fp64
+double __ovld __cnfn radians(double degrees);
+double2 __ovld __cnfn radians(double2 degrees);
+double3 __ovld __cnfn radians(double3 degrees);
+double4 __ovld __cnfn radians(double4 degrees);
+double8 __ovld __cnfn radians(double8 degrees);
+double16 __ovld __cnfn radians(double16 degrees);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn radians(half degrees);
+half2 __ovld __cnfn radians(half2 degrees);
+half3 __ovld __cnfn radians(half3 degrees);
+half4 __ovld __cnfn radians(half4 degrees);
+half8 __ovld __cnfn radians(half8 degrees);
+half16 __ovld __cnfn radians(half16 degrees);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 0.0 if x < edge, otherwise it returns 1.0.
+ */
+float __ovld __cnfn step(float edge, float x);
+float2 __ovld __cnfn step(float2 edge, float2 x);
+float3 __ovld __cnfn step(float3 edge, float3 x);
+float4 __ovld __cnfn step(float4 edge, float4 x);
+float8 __ovld __cnfn step(float8 edge, float8 x);
+float16 __ovld __cnfn step(float16 edge, float16 x);
+float2 __ovld __cnfn step(float edge, float2 x);
+float3 __ovld __cnfn step(float edge, float3 x);
+float4 __ovld __cnfn step(float edge, float4 x);
+float8 __ovld __cnfn step(float edge, float8 x);
+float16 __ovld __cnfn step(float edge, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn step(double edge, double x);
+double2 __ovld __cnfn step(double2 edge, double2 x);
+double3 __ovld __cnfn step(double3 edge, double3 x);
+double4 __ovld __cnfn step(double4 edge, double4 x);
+double8 __ovld __cnfn step(double8 edge, double8 x);
+double16 __ovld __cnfn step(double16 edge, double16 x);
+double2 __ovld __cnfn step(double edge, double2 x);
+double3 __ovld __cnfn step(double edge, double3 x);
+double4 __ovld __cnfn step(double edge, double4 x);
+double8 __ovld __cnfn step(double edge, double8 x);
+double16 __ovld __cnfn step(double edge, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn step(half edge, half x);
+half2 __ovld __cnfn step(half2 edge, half2 x);
+half3 __ovld __cnfn step(half3 edge, half3 x);
+half4 __ovld __cnfn step(half4 edge, half4 x);
+half8 __ovld __cnfn step(half8 edge, half8 x);
+half16 __ovld __cnfn step(half16 edge, half16 x);
+half __ovld __cnfn step(half edge, half x);
+half2 __ovld __cnfn step(half edge, half2 x);
+half3 __ovld __cnfn step(half edge, half3 x);
+half4 __ovld __cnfn step(half edge, half4 x);
+half8 __ovld __cnfn step(half edge, half8 x);
+half16 __ovld __cnfn step(half edge, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 0.0 if x <= edge0 and 1.0 if x >= edge1 and
+ * performs smooth Hermite interpolation between 0
+ * and 1when edge0 < x < edge1. This is useful in
+ * cases where you would want a threshold function
+ * with a smooth transition.
+ * This is equivalent to:
+ * gentype t;
+ * t = clamp ((x - edge0) / (edge1 - edge0), 0, 1);
+ * return t * t * (3 - 2 * t);
+ * Results are undefined if edge0 >= edge1 or if x,
+ * edge0 or edge1 is a NaN.
+ */
+float __ovld __cnfn smoothstep(float edge0, float edge1, float x);
+float2 __ovld __cnfn smoothstep(float2 edge0, float2 edge1, float2 x);
+float3 __ovld __cnfn smoothstep(float3 edge0, float3 edge1, float3 x);
+float4 __ovld __cnfn smoothstep(float4 edge0, float4 edge1, float4 x);
+float8 __ovld __cnfn smoothstep(float8 edge0, float8 edge1, float8 x);
+float16 __ovld __cnfn smoothstep(float16 edge0, float16 edge1, float16 x);
+float2 __ovld __cnfn smoothstep(float edge0, float edge1, float2 x);
+float3 __ovld __cnfn smoothstep(float edge0, float edge1, float3 x);
+float4 __ovld __cnfn smoothstep(float edge0, float edge1, float4 x);
+float8 __ovld __cnfn smoothstep(float edge0, float edge1, float8 x);
+float16 __ovld __cnfn smoothstep(float edge0, float edge1, float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn smoothstep(double edge0, double edge1, double x);
+double2 __ovld __cnfn smoothstep(double2 edge0, double2 edge1, double2 x);
+double3 __ovld __cnfn smoothstep(double3 edge0, double3 edge1, double3 x);
+double4 __ovld __cnfn smoothstep(double4 edge0, double4 edge1, double4 x);
+double8 __ovld __cnfn smoothstep(double8 edge0, double8 edge1, double8 x);
+double16 __ovld __cnfn smoothstep(double16 edge0, double16 edge1, double16 x);
+double2 __ovld __cnfn smoothstep(double edge0, double edge1, double2 x);
+double3 __ovld __cnfn smoothstep(double edge0, double edge1, double3 x);
+double4 __ovld __cnfn smoothstep(double edge0, double edge1, double4 x);
+double8 __ovld __cnfn smoothstep(double edge0, double edge1, double8 x);
+double16 __ovld __cnfn smoothstep(double edge0, double edge1, double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn smoothstep(half edge0, half edge1, half x);
+half2 __ovld __cnfn smoothstep(half2 edge0, half2 edge1, half2 x);
+half3 __ovld __cnfn smoothstep(half3 edge0, half3 edge1, half3 x);
+half4 __ovld __cnfn smoothstep(half4 edge0, half4 edge1, half4 x);
+half8 __ovld __cnfn smoothstep(half8 edge0, half8 edge1, half8 x);
+half16 __ovld __cnfn smoothstep(half16 edge0, half16 edge1, half16 x);
+half __ovld __cnfn smoothstep(half edge0, half edge1, half x);
+half2 __ovld __cnfn smoothstep(half edge0, half edge1, half2 x);
+half3 __ovld __cnfn smoothstep(half edge0, half edge1, half3 x);
+half4 __ovld __cnfn smoothstep(half edge0, half edge1, half4 x);
+half8 __ovld __cnfn smoothstep(half edge0, half edge1, half8 x);
+half16 __ovld __cnfn smoothstep(half edge0, half edge1, half16 x);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 1.0 if x > 0, -0.0 if x = -0.0, +0.0 if x =
+ * +0.0, or -1.0 if x < 0. Returns 0.0 if x is a NaN.
+ */
+float __ovld __cnfn sign(float x);
+float2 __ovld __cnfn sign(float2 x);
+float3 __ovld __cnfn sign(float3 x);
+float4 __ovld __cnfn sign(float4 x);
+float8 __ovld __cnfn sign(float8 x);
+float16 __ovld __cnfn sign(float16 x);
+#ifdef cl_khr_fp64
+double __ovld __cnfn sign(double x);
+double2 __ovld __cnfn sign(double2 x);
+double3 __ovld __cnfn sign(double3 x);
+double4 __ovld __cnfn sign(double4 x);
+double8 __ovld __cnfn sign(double8 x);
+double16 __ovld __cnfn sign(double16 x);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sign(half x);
+half2 __ovld __cnfn sign(half2 x);
+half3 __ovld __cnfn sign(half3 x);
+half4 __ovld __cnfn sign(half4 x);
+half8 __ovld __cnfn sign(half8 x);
+half16 __ovld __cnfn sign(half16 x);
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.11.5, v1.2 s6.12.5, v2.0 s6.13.5 - Geometric Functions
+
+/**
+ * Returns the cross product of p0.xyz and p1.xyz. The
+ * w component of float4 result returned will be 0.0.
+ */
+float4 __ovld __cnfn cross(float4 p0, float4 p1);
+float3 __ovld __cnfn cross(float3 p0, float3 p1);
+#ifdef cl_khr_fp64
+double4 __ovld __cnfn cross(double4 p0, double4 p1);
+double3 __ovld __cnfn cross(double3 p0, double3 p1);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half4 __ovld __cnfn cross(half4 p0, half4 p1);
+half3 __ovld __cnfn cross(half3 p0, half3 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Compute dot product.
+ */
+float __ovld __cnfn dot(float p0, float p1);
+float __ovld __cnfn dot(float2 p0, float2 p1);
+float __ovld __cnfn dot(float3 p0, float3 p1);
+float __ovld __cnfn dot(float4 p0, float4 p1);
+#ifdef cl_khr_fp64
+double __ovld __cnfn dot(double p0, double p1);
+double __ovld __cnfn dot(double2 p0, double2 p1);
+double __ovld __cnfn dot(double3 p0, double3 p1);
+double __ovld __cnfn dot(double4 p0, double4 p1);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn dot(half p0, half p1);
+half __ovld __cnfn dot(half2 p0, half2 p1);
+half __ovld __cnfn dot(half3 p0, half3 p1);
+half __ovld __cnfn dot(half4 p0, half4 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the distance between p0 and p1. This is
+ * calculated as length(p0 - p1).
+ */
+float __ovld __cnfn distance(float p0, float p1);
+float __ovld __cnfn distance(float2 p0, float2 p1);
+float __ovld __cnfn distance(float3 p0, float3 p1);
+float __ovld __cnfn distance(float4 p0, float4 p1);
+#ifdef cl_khr_fp64
+double __ovld __cnfn distance(double p0, double p1);
+double __ovld __cnfn distance(double2 p0, double2 p1);
+double __ovld __cnfn distance(double3 p0, double3 p1);
+double __ovld __cnfn distance(double4 p0, double4 p1);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn distance(half p0, half p1);
+half __ovld __cnfn distance(half2 p0, half2 p1);
+half __ovld __cnfn distance(half3 p0, half3 p1);
+half __ovld __cnfn distance(half4 p0, half4 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Return the length of vector p, i.e.,
+ * sqrt(p.x2 + p.y 2 + ...)
+ */
+float __ovld __cnfn length(float p);
+float __ovld __cnfn length(float2 p);
+float __ovld __cnfn length(float3 p);
+float __ovld __cnfn length(float4 p);
+#ifdef cl_khr_fp64
+double __ovld __cnfn length(double p);
+double __ovld __cnfn length(double2 p);
+double __ovld __cnfn length(double3 p);
+double __ovld __cnfn length(double4 p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn length(half p);
+half __ovld __cnfn length(half2 p);
+half __ovld __cnfn length(half3 p);
+half __ovld __cnfn length(half4 p);
+#endif //cl_khr_fp16
+
+/**
+ * Returns a vector in the same direction as p but with a
+ * length of 1.
+ */
+float __ovld __cnfn normalize(float p);
+float2 __ovld __cnfn normalize(float2 p);
+float3 __ovld __cnfn normalize(float3 p);
+float4 __ovld __cnfn normalize(float4 p);
+#ifdef cl_khr_fp64
+double __ovld __cnfn normalize(double p);
+double2 __ovld __cnfn normalize(double2 p);
+double3 __ovld __cnfn normalize(double3 p);
+double4 __ovld __cnfn normalize(double4 p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn normalize(half p);
+half2 __ovld __cnfn normalize(half2 p);
+half3 __ovld __cnfn normalize(half3 p);
+half4 __ovld __cnfn normalize(half4 p);
+#endif //cl_khr_fp16
+
+/**
+ * Returns fast_length(p0 - p1).
+ */
+float __ovld __cnfn fast_distance(float p0, float p1);
+float __ovld __cnfn fast_distance(float2 p0, float2 p1);
+float __ovld __cnfn fast_distance(float3 p0, float3 p1);
+float __ovld __cnfn fast_distance(float4 p0, float4 p1);
+#ifdef cl_khr_fp16
+half __ovld __cnfn fast_distance(half p0, half p1);
+half __ovld __cnfn fast_distance(half2 p0, half2 p1);
+half __ovld __cnfn fast_distance(half3 p0, half3 p1);
+half __ovld __cnfn fast_distance(half4 p0, half4 p1);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the length of vector p computed as:
+ * half_sqrt(p.x2 + p.y2 + ...)
+ */
+float __ovld __cnfn fast_length(float p);
+float __ovld __cnfn fast_length(float2 p);
+float __ovld __cnfn fast_length(float3 p);
+float __ovld __cnfn fast_length(float4 p);
+#ifdef cl_khr_fp16
+half __ovld __cnfn fast_length(half p);
+half __ovld __cnfn fast_length(half2 p);
+half __ovld __cnfn fast_length(half3 p);
+half __ovld __cnfn fast_length(half4 p);
+#endif //cl_khr_fp16
+
+/**
+ * Returns a vector in the same direction as p but with a
+ * length of 1. fast_normalize is computed as:
+ * p * half_rsqrt (p.x^2 + p.y^2 + ... )
+ * The result shall be within 8192 ulps error from the
+ * infinitely precise result of
+ * if (all(p == 0.0f))
+ * result = p;
+ * else
+ * result = p / sqrt (p.x^2 + p.y^2 + ...);
+ * with the following exceptions:
+ * 1) If the sum of squares is greater than FLT_MAX
+ * then the value of the floating-point values in the
+ * result vector are undefined.
+ * 2) If the sum of squares is less than FLT_MIN then
+ * the implementation may return back p.
+ * 3) If the device is in "denorms are flushed to zero"
+ * mode, individual operand elements with magnitude
+ * less than sqrt(FLT_MIN) may be flushed to zero
+ * before proceeding with the calculation.
+ */
+float __ovld __cnfn fast_normalize(float p);
+float2 __ovld __cnfn fast_normalize(float2 p);
+float3 __ovld __cnfn fast_normalize(float3 p);
+float4 __ovld __cnfn fast_normalize(float4 p);
+#ifdef cl_khr_fp16
+half __ovld __cnfn fast_normalize(half p);
+half2 __ovld __cnfn fast_normalize(half2 p);
+half3 __ovld __cnfn fast_normalize(half3 p);
+half4 __ovld __cnfn fast_normalize(half4 p);
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.11.6, v1.2 s6.12.6, v2.0 s6.13.6 - Relational Functions
+
+/**
+ * intn isequal (floatn x, floatn y)
+ * Returns the component-wise compare of x == y.
+ */
+int __ovld __cnfn isequal(float x, float y);
+int2 __ovld __cnfn isequal(float2 x, float2 y);
+int3 __ovld __cnfn isequal(float3 x, float3 y);
+int4 __ovld __cnfn isequal(float4 x, float4 y);
+int8 __ovld __cnfn isequal(float8 x, float8 y);
+int16 __ovld __cnfn isequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isequal(double x, double y);
+long2 __ovld __cnfn isequal(double2 x, double2 y);
+long3 __ovld __cnfn isequal(double3 x, double3 y);
+long4 __ovld __cnfn isequal(double4 x, double4 y);
+long8 __ovld __cnfn isequal(double8 x, double8 y);
+long16 __ovld __cnfn isequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isequal(half x, half y);
+short2 __ovld __cnfn isequal(half2 x, half2 y);
+short3 __ovld __cnfn isequal(half3 x, half3 y);
+short4 __ovld __cnfn isequal(half4 x, half4 y);
+short8 __ovld __cnfn isequal(half8 x, half8 y);
+short16 __ovld __cnfn isequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x != y.
+ */
+int __ovld __cnfn isnotequal(float x, float y);
+int2 __ovld __cnfn isnotequal(float2 x, float2 y);
+int3 __ovld __cnfn isnotequal(float3 x, float3 y);
+int4 __ovld __cnfn isnotequal(float4 x, float4 y);
+int8 __ovld __cnfn isnotequal(float8 x, float8 y);
+int16 __ovld __cnfn isnotequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isnotequal(double x, double y);
+long2 __ovld __cnfn isnotequal(double2 x, double2 y);
+long3 __ovld __cnfn isnotequal(double3 x, double3 y);
+long4 __ovld __cnfn isnotequal(double4 x, double4 y);
+long8 __ovld __cnfn isnotequal(double8 x, double8 y);
+long16 __ovld __cnfn isnotequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isnotequal(half x, half y);
+short2 __ovld __cnfn isnotequal(half2 x, half2 y);
+short3 __ovld __cnfn isnotequal(half3 x, half3 y);
+short4 __ovld __cnfn isnotequal(half4 x, half4 y);
+short8 __ovld __cnfn isnotequal(half8 x, half8 y);
+short16 __ovld __cnfn isnotequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x > y.
+ */
+int __ovld __cnfn isgreater(float x, float y);
+int2 __ovld __cnfn isgreater(float2 x, float2 y);
+int3 __ovld __cnfn isgreater(float3 x, float3 y);
+int4 __ovld __cnfn isgreater(float4 x, float4 y);
+int8 __ovld __cnfn isgreater(float8 x, float8 y);
+int16 __ovld __cnfn isgreater(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isgreater(double x, double y);
+long2 __ovld __cnfn isgreater(double2 x, double2 y);
+long3 __ovld __cnfn isgreater(double3 x, double3 y);
+long4 __ovld __cnfn isgreater(double4 x, double4 y);
+long8 __ovld __cnfn isgreater(double8 x, double8 y);
+long16 __ovld __cnfn isgreater(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isgreater(half x, half y);
+short2 __ovld __cnfn isgreater(half2 x, half2 y);
+short3 __ovld __cnfn isgreater(half3 x, half3 y);
+short4 __ovld __cnfn isgreater(half4 x, half4 y);
+short8 __ovld __cnfn isgreater(half8 x, half8 y);
+short16 __ovld __cnfn isgreater(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x >= y.
+ */
+int __ovld __cnfn isgreaterequal(float x, float y);
+int2 __ovld __cnfn isgreaterequal(float2 x, float2 y);
+int3 __ovld __cnfn isgreaterequal(float3 x, float3 y);
+int4 __ovld __cnfn isgreaterequal(float4 x, float4 y);
+int8 __ovld __cnfn isgreaterequal(float8 x, float8 y);
+int16 __ovld __cnfn isgreaterequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isgreaterequal(double x, double y);
+long2 __ovld __cnfn isgreaterequal(double2 x, double2 y);
+long3 __ovld __cnfn isgreaterequal(double3 x, double3 y);
+long4 __ovld __cnfn isgreaterequal(double4 x, double4 y);
+long8 __ovld __cnfn isgreaterequal(double8 x, double8 y);
+long16 __ovld __cnfn isgreaterequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isgreaterequal(half x, half y);
+short2 __ovld __cnfn isgreaterequal(half2 x, half2 y);
+short3 __ovld __cnfn isgreaterequal(half3 x, half3 y);
+short4 __ovld __cnfn isgreaterequal(half4 x, half4 y);
+short8 __ovld __cnfn isgreaterequal(half8 x, half8 y);
+short16 __ovld __cnfn isgreaterequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x < y.
+ */
+int __ovld __cnfn isless(float x, float y);
+int2 __ovld __cnfn isless(float2 x, float2 y);
+int3 __ovld __cnfn isless(float3 x, float3 y);
+int4 __ovld __cnfn isless(float4 x, float4 y);
+int8 __ovld __cnfn isless(float8 x, float8 y);
+int16 __ovld __cnfn isless(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isless(double x, double y);
+long2 __ovld __cnfn isless(double2 x, double2 y);
+long3 __ovld __cnfn isless(double3 x, double3 y);
+long4 __ovld __cnfn isless(double4 x, double4 y);
+long8 __ovld __cnfn isless(double8 x, double8 y);
+long16 __ovld __cnfn isless(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isless(half x, half y);
+short2 __ovld __cnfn isless(half2 x, half2 y);
+short3 __ovld __cnfn isless(half3 x, half3 y);
+short4 __ovld __cnfn isless(half4 x, half4 y);
+short8 __ovld __cnfn isless(half8 x, half8 y);
+short16 __ovld __cnfn isless(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of x <= y.
+ */
+int __ovld __cnfn islessequal(float x, float y);
+int2 __ovld __cnfn islessequal(float2 x, float2 y);
+int3 __ovld __cnfn islessequal(float3 x, float3 y);
+int4 __ovld __cnfn islessequal(float4 x, float4 y);
+int8 __ovld __cnfn islessequal(float8 x, float8 y);
+int16 __ovld __cnfn islessequal(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn islessequal(double x, double y);
+long2 __ovld __cnfn islessequal(double2 x, double2 y);
+long3 __ovld __cnfn islessequal(double3 x, double3 y);
+long4 __ovld __cnfn islessequal(double4 x, double4 y);
+long8 __ovld __cnfn islessequal(double8 x, double8 y);
+long16 __ovld __cnfn islessequal(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn islessequal(half x, half y);
+short2 __ovld __cnfn islessequal(half2 x, half2 y);
+short3 __ovld __cnfn islessequal(half3 x, half3 y);
+short4 __ovld __cnfn islessequal(half4 x, half4 y);
+short8 __ovld __cnfn islessequal(half8 x, half8 y);
+short16 __ovld __cnfn islessequal(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Returns the component-wise compare of
+ * (x < y) || (x > y) .
+ */
+int __ovld __cnfn islessgreater(float x, float y);
+int2 __ovld __cnfn islessgreater(float2 x, float2 y);
+int3 __ovld __cnfn islessgreater(float3 x, float3 y);
+int4 __ovld __cnfn islessgreater(float4 x, float4 y);
+int8 __ovld __cnfn islessgreater(float8 x, float8 y);
+int16 __ovld __cnfn islessgreater(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn islessgreater(double x, double y);
+long2 __ovld __cnfn islessgreater(double2 x, double2 y);
+long3 __ovld __cnfn islessgreater(double3 x, double3 y);
+long4 __ovld __cnfn islessgreater(double4 x, double4 y);
+long8 __ovld __cnfn islessgreater(double8 x, double8 y);
+long16 __ovld __cnfn islessgreater(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn islessgreater(half x, half y);
+short2 __ovld __cnfn islessgreater(half2 x, half2 y);
+short3 __ovld __cnfn islessgreater(half3 x, half3 y);
+short4 __ovld __cnfn islessgreater(half4 x, half4 y);
+short8 __ovld __cnfn islessgreater(half8 x, half8 y);
+short16 __ovld __cnfn islessgreater(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Test for finite value.
+ */
+int __ovld __cnfn isfinite(float);
+int2 __ovld __cnfn isfinite(float2);
+int3 __ovld __cnfn isfinite(float3);
+int4 __ovld __cnfn isfinite(float4);
+int8 __ovld __cnfn isfinite(float8);
+int16 __ovld __cnfn isfinite(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isfinite(double);
+long2 __ovld __cnfn isfinite(double2);
+long3 __ovld __cnfn isfinite(double3);
+long4 __ovld __cnfn isfinite(double4);
+long8 __ovld __cnfn isfinite(double8);
+long16 __ovld __cnfn isfinite(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isfinite(half);
+short2 __ovld __cnfn isfinite(half2);
+short3 __ovld __cnfn isfinite(half3);
+short4 __ovld __cnfn isfinite(half4);
+short8 __ovld __cnfn isfinite(half8);
+short16 __ovld __cnfn isfinite(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test for infinity value (+ve or -ve) .
+ */
+int __ovld __cnfn isinf(float);
+int2 __ovld __cnfn isinf(float2);
+int3 __ovld __cnfn isinf(float3);
+int4 __ovld __cnfn isinf(float4);
+int8 __ovld __cnfn isinf(float8);
+int16 __ovld __cnfn isinf(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isinf(double);
+long2 __ovld __cnfn isinf(double2);
+long3 __ovld __cnfn isinf(double3);
+long4 __ovld __cnfn isinf(double4);
+long8 __ovld __cnfn isinf(double8);
+long16 __ovld __cnfn isinf(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isinf(half);
+short2 __ovld __cnfn isinf(half2);
+short3 __ovld __cnfn isinf(half3);
+short4 __ovld __cnfn isinf(half4);
+short8 __ovld __cnfn isinf(half8);
+short16 __ovld __cnfn isinf(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test for a NaN.
+ */
+int __ovld __cnfn isnan(float);
+int2 __ovld __cnfn isnan(float2);
+int3 __ovld __cnfn isnan(float3);
+int4 __ovld __cnfn isnan(float4);
+int8 __ovld __cnfn isnan(float8);
+int16 __ovld __cnfn isnan(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isnan(double);
+long2 __ovld __cnfn isnan(double2);
+long3 __ovld __cnfn isnan(double3);
+long4 __ovld __cnfn isnan(double4);
+long8 __ovld __cnfn isnan(double8);
+long16 __ovld __cnfn isnan(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isnan(half);
+short2 __ovld __cnfn isnan(half2);
+short3 __ovld __cnfn isnan(half3);
+short4 __ovld __cnfn isnan(half4);
+short8 __ovld __cnfn isnan(half8);
+short16 __ovld __cnfn isnan(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test for a normal value.
+ */
+int __ovld __cnfn isnormal(float);
+int2 __ovld __cnfn isnormal(float2);
+int3 __ovld __cnfn isnormal(float3);
+int4 __ovld __cnfn isnormal(float4);
+int8 __ovld __cnfn isnormal(float8);
+int16 __ovld __cnfn isnormal(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isnormal(double);
+long2 __ovld __cnfn isnormal(double2);
+long3 __ovld __cnfn isnormal(double3);
+long4 __ovld __cnfn isnormal(double4);
+long8 __ovld __cnfn isnormal(double8);
+long16 __ovld __cnfn isnormal(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isnormal(half);
+short2 __ovld __cnfn isnormal(half2);
+short3 __ovld __cnfn isnormal(half3);
+short4 __ovld __cnfn isnormal(half4);
+short8 __ovld __cnfn isnormal(half8);
+short16 __ovld __cnfn isnormal(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Test if arguments are ordered. isordered() takes
+ * arguments x and y, and returns the result
+ * isequal(x, x) && isequal(y, y).
+ */
+int __ovld __cnfn isordered(float x, float y);
+int2 __ovld __cnfn isordered(float2 x, float2 y);
+int3 __ovld __cnfn isordered(float3 x, float3 y);
+int4 __ovld __cnfn isordered(float4 x, float4 y);
+int8 __ovld __cnfn isordered(float8 x, float8 y);
+int16 __ovld __cnfn isordered(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isordered(double x, double y);
+long2 __ovld __cnfn isordered(double2 x, double2 y);
+long3 __ovld __cnfn isordered(double3 x, double3 y);
+long4 __ovld __cnfn isordered(double4 x, double4 y);
+long8 __ovld __cnfn isordered(double8 x, double8 y);
+long16 __ovld __cnfn isordered(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isordered(half x, half y);
+short2 __ovld __cnfn isordered(half2 x, half2 y);
+short3 __ovld __cnfn isordered(half3 x, half3 y);
+short4 __ovld __cnfn isordered(half4 x, half4 y);
+short8 __ovld __cnfn isordered(half8 x, half8 y);
+short16 __ovld __cnfn isordered(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Test if arguments are unordered. isunordered()
+ * takes arguments x and y, returning non-zero if x or y
+ * is NaN, and zero otherwise.
+ */
+int __ovld __cnfn isunordered(float x, float y);
+int2 __ovld __cnfn isunordered(float2 x, float2 y);
+int3 __ovld __cnfn isunordered(float3 x, float3 y);
+int4 __ovld __cnfn isunordered(float4 x, float4 y);
+int8 __ovld __cnfn isunordered(float8 x, float8 y);
+int16 __ovld __cnfn isunordered(float16 x, float16 y);
+#ifdef cl_khr_fp64
+int __ovld __cnfn isunordered(double x, double y);
+long2 __ovld __cnfn isunordered(double2 x, double2 y);
+long3 __ovld __cnfn isunordered(double3 x, double3 y);
+long4 __ovld __cnfn isunordered(double4 x, double4 y);
+long8 __ovld __cnfn isunordered(double8 x, double8 y);
+long16 __ovld __cnfn isunordered(double16 x, double16 y);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn isunordered(half x, half y);
+short2 __ovld __cnfn isunordered(half2 x, half2 y);
+short3 __ovld __cnfn isunordered(half3 x, half3 y);
+short4 __ovld __cnfn isunordered(half4 x, half4 y);
+short8 __ovld __cnfn isunordered(half8 x, half8 y);
+short16 __ovld __cnfn isunordered(half16 x, half16 y);
+#endif //cl_khr_fp16
+
+/**
+ * Test for sign bit. The scalar version of the function
+ * returns a 1 if the sign bit in the float is set else returns
+ * 0. The vector version of the function returns the
+ * following for each component in floatn: a -1 if the
+ * sign bit in the float is set else returns 0.
+ */
+int __ovld __cnfn signbit(float);
+int2 __ovld __cnfn signbit(float2);
+int3 __ovld __cnfn signbit(float3);
+int4 __ovld __cnfn signbit(float4);
+int8 __ovld __cnfn signbit(float8);
+int16 __ovld __cnfn signbit(float16);
+#ifdef cl_khr_fp64
+int __ovld __cnfn signbit(double);
+long2 __ovld __cnfn signbit(double2);
+long3 __ovld __cnfn signbit(double3);
+long4 __ovld __cnfn signbit(double4);
+long8 __ovld __cnfn signbit(double8);
+long16 __ovld __cnfn signbit(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+int __ovld __cnfn signbit(half);
+short2 __ovld __cnfn signbit(half2);
+short3 __ovld __cnfn signbit(half3);
+short4 __ovld __cnfn signbit(half4);
+short8 __ovld __cnfn signbit(half8);
+short16 __ovld __cnfn signbit(half16);
+#endif //cl_khr_fp16
+
+/**
+ * Returns 1 if the most significant bit in any component
+ * of x is set; otherwise returns 0.
+ */
+int __ovld __cnfn any(char x);
+int __ovld __cnfn any(char2 x);
+int __ovld __cnfn any(char3 x);
+int __ovld __cnfn any(char4 x);
+int __ovld __cnfn any(char8 x);
+int __ovld __cnfn any(char16 x);
+int __ovld __cnfn any(short x);
+int __ovld __cnfn any(short2 x);
+int __ovld __cnfn any(short3 x);
+int __ovld __cnfn any(short4 x);
+int __ovld __cnfn any(short8 x);
+int __ovld __cnfn any(short16 x);
+int __ovld __cnfn any(int x);
+int __ovld __cnfn any(int2 x);
+int __ovld __cnfn any(int3 x);
+int __ovld __cnfn any(int4 x);
+int __ovld __cnfn any(int8 x);
+int __ovld __cnfn any(int16 x);
+int __ovld __cnfn any(long x);
+int __ovld __cnfn any(long2 x);
+int __ovld __cnfn any(long3 x);
+int __ovld __cnfn any(long4 x);
+int __ovld __cnfn any(long8 x);
+int __ovld __cnfn any(long16 x);
+
+/**
+ * Returns 1 if the most significant bit in all components
+ * of x is set; otherwise returns 0.
+ */
+int __ovld __cnfn all(char x);
+int __ovld __cnfn all(char2 x);
+int __ovld __cnfn all(char3 x);
+int __ovld __cnfn all(char4 x);
+int __ovld __cnfn all(char8 x);
+int __ovld __cnfn all(char16 x);
+int __ovld __cnfn all(short x);
+int __ovld __cnfn all(short2 x);
+int __ovld __cnfn all(short3 x);
+int __ovld __cnfn all(short4 x);
+int __ovld __cnfn all(short8 x);
+int __ovld __cnfn all(short16 x);
+int __ovld __cnfn all(int x);
+int __ovld __cnfn all(int2 x);
+int __ovld __cnfn all(int3 x);
+int __ovld __cnfn all(int4 x);
+int __ovld __cnfn all(int8 x);
+int __ovld __cnfn all(int16 x);
+int __ovld __cnfn all(long x);
+int __ovld __cnfn all(long2 x);
+int __ovld __cnfn all(long3 x);
+int __ovld __cnfn all(long4 x);
+int __ovld __cnfn all(long8 x);
+int __ovld __cnfn all(long16 x);
+
+/**
+ * Each bit of the result is the corresponding bit of a if
+ * the corresponding bit of c is 0. Otherwise it is the
+ * corresponding bit of b.
+ */
+char __ovld __cnfn bitselect(char a, char b, char c);
+uchar __ovld __cnfn bitselect(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn bitselect(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn bitselect(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn bitselect(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn bitselect(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn bitselect(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn bitselect(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn bitselect(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn bitselect(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn bitselect(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn bitselect(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn bitselect(short a, short b, short c);
+ushort __ovld __cnfn bitselect(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn bitselect(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn bitselect(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn bitselect(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn bitselect(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn bitselect(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn bitselect(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn bitselect(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn bitselect(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn bitselect(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn bitselect(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn bitselect(int a, int b, int c);
+uint __ovld __cnfn bitselect(uint a, uint b, uint c);
+int2 __ovld __cnfn bitselect(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn bitselect(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn bitselect(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn bitselect(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn bitselect(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn bitselect(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn bitselect(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn bitselect(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn bitselect(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn bitselect(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn bitselect(long a, long b, long c);
+ulong __ovld __cnfn bitselect(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn bitselect(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn bitselect(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn bitselect(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn bitselect(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn bitselect(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn bitselect(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn bitselect(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn bitselect(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn bitselect(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn bitselect(ulong16 a, ulong16 b, ulong16 c);
+float __ovld __cnfn bitselect(float a, float b, float c);
+float2 __ovld __cnfn bitselect(float2 a, float2 b, float2 c);
+float3 __ovld __cnfn bitselect(float3 a, float3 b, float3 c);
+float4 __ovld __cnfn bitselect(float4 a, float4 b, float4 c);
+float8 __ovld __cnfn bitselect(float8 a, float8 b, float8 c);
+float16 __ovld __cnfn bitselect(float16 a, float16 b, float16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn bitselect(double a, double b, double c);
+double2 __ovld __cnfn bitselect(double2 a, double2 b, double2 c);
+double3 __ovld __cnfn bitselect(double3 a, double3 b, double3 c);
+double4 __ovld __cnfn bitselect(double4 a, double4 b, double4 c);
+double8 __ovld __cnfn bitselect(double8 a, double8 b, double8 c);
+double16 __ovld __cnfn bitselect(double16 a, double16 b, double16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn bitselect(half a, half b, half c);
+half2 __ovld __cnfn bitselect(half2 a, half2 b, half2 c);
+half3 __ovld __cnfn bitselect(half3 a, half3 b, half3 c);
+half4 __ovld __cnfn bitselect(half4 a, half4 b, half4 c);
+half8 __ovld __cnfn bitselect(half8 a, half8 b, half8 c);
+half16 __ovld __cnfn bitselect(half16 a, half16 b, half16 c);
+#endif //cl_khr_fp16
+
+/**
+ * For each component of a vector type,
+ * result[i] = if MSB of c[i] is set ? b[i] : a[i].
+ * For a scalar type, result = c ? b : a.
+ */
+char __ovld __cnfn select(char a, char b, char c);
+uchar __ovld __cnfn select(uchar a, uchar b, char c);
+char2 __ovld __cnfn select(char2 a, char2 b, char2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, char2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, char3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, char3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, char4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, char4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, char8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, char8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, char16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, char16 c);
+short __ovld __cnfn select(short a, short b, char c);
+ushort __ovld __cnfn select(ushort a, ushort b, char c);
+short2 __ovld __cnfn select(short2 a, short2 b, char2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, char2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, char3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, char3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, char4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, char4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, char8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, char8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, char16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, char16 c);
+int __ovld __cnfn select(int a, int b, char c);
+uint __ovld __cnfn select(uint a, uint b, char c);
+int2 __ovld __cnfn select(int2 a, int2 b, char2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, char2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, char3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, char3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, char4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, char4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, char8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, char8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, char16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, char16 c);
+long __ovld __cnfn select(long a, long b, char c);
+ulong __ovld __cnfn select(ulong a, ulong b, char c);
+long2 __ovld __cnfn select(long2 a, long2 b, char2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, char2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, char3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, char3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, char4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, char4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, char8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, char8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, char16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, char16 c);
+float __ovld __cnfn select(float a, float b, char c);
+float2 __ovld __cnfn select(float2 a, float2 b, char2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, char3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, char4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, char8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, char16 c);
+char __ovld __cnfn select(char a, char b, short c);
+uchar __ovld __cnfn select(uchar a, uchar b, short c);
+char2 __ovld __cnfn select(char2 a, char2 b, short2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, short2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, short3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, short3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, short4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, short4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, short8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, short8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, short16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, short16 c);
+short __ovld __cnfn select(short a, short b, short c);
+ushort __ovld __cnfn select(ushort a, ushort b, short c);
+short2 __ovld __cnfn select(short2 a, short2 b, short2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, short2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, short3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, short3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, short4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, short4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, short8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, short8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, short16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, short16 c);
+int __ovld __cnfn select(int a, int b, short c);
+uint __ovld __cnfn select(uint a, uint b, short c);
+int2 __ovld __cnfn select(int2 a, int2 b, short2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, short2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, short3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, short3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, short4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, short4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, short8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, short8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, short16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, short16 c);
+long __ovld __cnfn select(long a, long b, short c);
+ulong __ovld __cnfn select(ulong a, ulong b, short c);
+long2 __ovld __cnfn select(long2 a, long2 b, short2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, short2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, short3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, short3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, short4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, short4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, short8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, short8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, short16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, short16 c);
+float __ovld __cnfn select(float a, float b, short c);
+float2 __ovld __cnfn select(float2 a, float2 b, short2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, short3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, short4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, short8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, short16 c);
+char __ovld __cnfn select(char a, char b, int c);
+uchar __ovld __cnfn select(uchar a, uchar b, int c);
+char2 __ovld __cnfn select(char2 a, char2 b, int2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, int2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, int3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, int3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, int4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, int4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, int8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, int8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, int16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, int16 c);
+short __ovld __cnfn select(short a, short b, int c);
+ushort __ovld __cnfn select(ushort a, ushort b, int c);
+short2 __ovld __cnfn select(short2 a, short2 b, int2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, int2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, int3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, int3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, int4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, int4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, int8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, int8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, int16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, int16 c);
+int __ovld __cnfn select(int a, int b, int c);
+uint __ovld __cnfn select(uint a, uint b, int c);
+int2 __ovld __cnfn select(int2 a, int2 b, int2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, int2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, int3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, int3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, int4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, int4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, int8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, int8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, int16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, int16 c);
+long __ovld __cnfn select(long a, long b, int c);
+ulong __ovld __cnfn select(ulong a, ulong b, int c);
+long2 __ovld __cnfn select(long2 a, long2 b, int2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, int2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, int3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, int3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, int4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, int4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, int8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, int8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, int16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, int16 c);
+float __ovld __cnfn select(float a, float b, int c);
+float2 __ovld __cnfn select(float2 a, float2 b, int2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, int3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, int4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, int8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, int16 c);
+char __ovld __cnfn select(char a, char b, long c);
+uchar __ovld __cnfn select(uchar a, uchar b, long c);
+char2 __ovld __cnfn select(char2 a, char2 b, long2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, long2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, long3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, long3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, long4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, long4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, long8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, long8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, long16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, long16 c);
+short __ovld __cnfn select(short a, short b, long c);
+ushort __ovld __cnfn select(ushort a, ushort b, long c);
+short2 __ovld __cnfn select(short2 a, short2 b, long2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, long2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, long3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, long3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, long4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, long4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, long8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, long8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, long16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, long16 c);
+int __ovld __cnfn select(int a, int b, long c);
+uint __ovld __cnfn select(uint a, uint b, long c);
+int2 __ovld __cnfn select(int2 a, int2 b, long2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, long2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, long3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, long3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, long4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, long4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, long8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, long8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, long16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, long16 c);
+long __ovld __cnfn select(long a, long b, long c);
+ulong __ovld __cnfn select(ulong a, ulong b, long c);
+long2 __ovld __cnfn select(long2 a, long2 b, long2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, long2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, long3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, long3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, long4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, long4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, long8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, long8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, long16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, long16 c);
+float __ovld __cnfn select(float a, float b, long c);
+float2 __ovld __cnfn select(float2 a, float2 b, long2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, long3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, long4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, long8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, long16 c);
+char __ovld __cnfn select(char a, char b, uchar c);
+uchar __ovld __cnfn select(uchar a, uchar b, uchar c);
+char2 __ovld __cnfn select(char2 a, char2 b, uchar2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uchar2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, uchar3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uchar3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, uchar4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uchar4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, uchar8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uchar8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, uchar16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uchar16 c);
+short __ovld __cnfn select(short a, short b, uchar c);
+ushort __ovld __cnfn select(ushort a, ushort b, uchar c);
+short2 __ovld __cnfn select(short2 a, short2 b, uchar2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uchar2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, uchar3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uchar3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, uchar4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uchar4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, uchar8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uchar8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, uchar16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uchar16 c);
+int __ovld __cnfn select(int a, int b, uchar c);
+uint __ovld __cnfn select(uint a, uint b, uchar c);
+int2 __ovld __cnfn select(int2 a, int2 b, uchar2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, uchar2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, uchar3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, uchar3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, uchar4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, uchar4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, uchar8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, uchar8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, uchar16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, uchar16 c);
+long __ovld __cnfn select(long a, long b, uchar c);
+ulong __ovld __cnfn select(ulong a, ulong b, uchar c);
+long2 __ovld __cnfn select(long2 a, long2 b, uchar2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uchar2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, uchar3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uchar3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, uchar4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uchar4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, uchar8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uchar8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, uchar16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uchar16 c);
+float __ovld __cnfn select(float a, float b, uchar c);
+float2 __ovld __cnfn select(float2 a, float2 b, uchar2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, uchar3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, uchar4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, uchar8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, uchar16 c);
+char __ovld __cnfn select(char a, char b, ushort c);
+uchar __ovld __cnfn select(uchar a, uchar b, ushort c);
+char2 __ovld __cnfn select(char2 a, char2 b, ushort2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ushort2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, ushort3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ushort3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, ushort4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ushort4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, ushort8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ushort8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, ushort16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ushort16 c);
+short __ovld __cnfn select(short a, short b, ushort c);
+ushort __ovld __cnfn select(ushort a, ushort b, ushort c);
+short2 __ovld __cnfn select(short2 a, short2 b, ushort2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ushort2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, ushort3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ushort3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, ushort4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ushort4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, ushort8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ushort8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, ushort16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ushort16 c);
+int __ovld __cnfn select(int a, int b, ushort c);
+uint __ovld __cnfn select(uint a, uint b, ushort c);
+int2 __ovld __cnfn select(int2 a, int2 b, ushort2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, ushort2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, ushort3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, ushort3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, ushort4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, ushort4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, ushort8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, ushort8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, ushort16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, ushort16 c);
+long __ovld __cnfn select(long a, long b, ushort c);
+ulong __ovld __cnfn select(ulong a, ulong b, ushort c);
+long2 __ovld __cnfn select(long2 a, long2 b, ushort2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ushort2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, ushort3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ushort3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, ushort4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ushort4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, ushort8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ushort8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, ushort16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ushort16 c);
+float __ovld __cnfn select(float a, float b, ushort c);
+float2 __ovld __cnfn select(float2 a, float2 b, ushort2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, ushort3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, ushort4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, ushort8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, ushort16 c);
+char __ovld __cnfn select(char a, char b, uint c);
+uchar __ovld __cnfn select(uchar a, uchar b, uint c);
+char2 __ovld __cnfn select(char2 a, char2 b, uint2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uint2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, uint3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uint3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, uint4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uint4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, uint8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uint8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, uint16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uint16 c);
+short __ovld __cnfn select(short a, short b, uint c);
+ushort __ovld __cnfn select(ushort a, ushort b, uint c);
+short2 __ovld __cnfn select(short2 a, short2 b, uint2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uint2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, uint3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uint3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, uint4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uint4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, uint8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uint8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, uint16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uint16 c);
+int __ovld __cnfn select(int a, int b, uint c);
+uint __ovld __cnfn select(uint a, uint b, uint c);
+int2 __ovld __cnfn select(int2 a, int2 b, uint2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, uint2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, uint3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, uint3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, uint4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, uint4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, uint8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, uint8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, uint16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, uint16 c);
+long __ovld __cnfn select(long a, long b, uint c);
+ulong __ovld __cnfn select(ulong a, ulong b, uint c);
+long2 __ovld __cnfn select(long2 a, long2 b, uint2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uint2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, uint3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uint3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, uint4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uint4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, uint8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uint8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, uint16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uint16 c);
+float __ovld __cnfn select(float a, float b, uint c);
+float2 __ovld __cnfn select(float2 a, float2 b, uint2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, uint3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, uint4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, uint8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, uint16 c);
+char __ovld __cnfn select(char a, char b, ulong c);
+uchar __ovld __cnfn select(uchar a, uchar b, ulong c);
+char2 __ovld __cnfn select(char2 a, char2 b, ulong2 c);
+uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ulong2 c);
+char3 __ovld __cnfn select(char3 a, char3 b, ulong3 c);
+uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ulong3 c);
+char4 __ovld __cnfn select(char4 a, char4 b, ulong4 c);
+uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ulong4 c);
+char8 __ovld __cnfn select(char8 a, char8 b, ulong8 c);
+uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ulong8 c);
+char16 __ovld __cnfn select(char16 a, char16 b, ulong16 c);
+uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ulong16 c);
+short __ovld __cnfn select(short a, short b, ulong c);
+ushort __ovld __cnfn select(ushort a, ushort b, ulong c);
+short2 __ovld __cnfn select(short2 a, short2 b, ulong2 c);
+ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ulong2 c);
+short3 __ovld __cnfn select(short3 a, short3 b, ulong3 c);
+ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ulong3 c);
+short4 __ovld __cnfn select(short4 a, short4 b, ulong4 c);
+ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ulong4 c);
+short8 __ovld __cnfn select(short8 a, short8 b, ulong8 c);
+ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ulong8 c);
+short16 __ovld __cnfn select(short16 a, short16 b, ulong16 c);
+ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ulong16 c);
+int __ovld __cnfn select(int a, int b, ulong c);
+uint __ovld __cnfn select(uint a, uint b, ulong c);
+int2 __ovld __cnfn select(int2 a, int2 b, ulong2 c);
+uint2 __ovld __cnfn select(uint2 a, uint2 b, ulong2 c);
+int3 __ovld __cnfn select(int3 a, int3 b, ulong3 c);
+uint3 __ovld __cnfn select(uint3 a, uint3 b, ulong3 c);
+int4 __ovld __cnfn select(int4 a, int4 b, ulong4 c);
+uint4 __ovld __cnfn select(uint4 a, uint4 b, ulong4 c);
+int8 __ovld __cnfn select(int8 a, int8 b, ulong8 c);
+uint8 __ovld __cnfn select(uint8 a, uint8 b, ulong8 c);
+int16 __ovld __cnfn select(int16 a, int16 b, ulong16 c);
+uint16 __ovld __cnfn select(uint16 a, uint16 b, ulong16 c);
+long __ovld __cnfn select(long a, long b, ulong c);
+ulong __ovld __cnfn select(ulong a, ulong b, ulong c);
+long2 __ovld __cnfn select(long2 a, long2 b, ulong2 c);
+ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ulong2 c);
+long3 __ovld __cnfn select(long3 a, long3 b, ulong3 c);
+ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ulong3 c);
+long4 __ovld __cnfn select(long4 a, long4 b, ulong4 c);
+ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ulong4 c);
+long8 __ovld __cnfn select(long8 a, long8 b, ulong8 c);
+ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c);
+long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c);
+ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c);
+float __ovld __cnfn select(float a, float b, ulong c);
+float2 __ovld __cnfn select(float2 a, float2 b, ulong2 c);
+float3 __ovld __cnfn select(float3 a, float3 b, ulong3 c);
+float4 __ovld __cnfn select(float4 a, float4 b, ulong4 c);
+float8 __ovld __cnfn select(float8 a, float8 b, ulong8 c);
+float16 __ovld __cnfn select(float16 a, float16 b, ulong16 c);
+#ifdef cl_khr_fp64
+double __ovld __cnfn select(double a, double b, long c);
+double2 __ovld __cnfn select(double2 a, double2 b, long2 c);
+double3 __ovld __cnfn select(double3 a, double3 b, long3 c);
+double4 __ovld __cnfn select(double4 a, double4 b, long4 c);
+double8 __ovld __cnfn select(double8 a, double8 b, long8 c);
+double16 __ovld __cnfn select(double16 a, double16 b, long16 c);
+double __ovld __cnfn select(double a, double b, ulong c);
+double2 __ovld __cnfn select(double2 a, double2 b, ulong2 c);
+double3 __ovld __cnfn select(double3 a, double3 b, ulong3 c);
+double4 __ovld __cnfn select(double4 a, double4 b, ulong4 c);
+double8 __ovld __cnfn select(double8 a, double8 b, ulong8 c);
+double16 __ovld __cnfn select(double16 a, double16 b, ulong16 c);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn select(half a, half b, short c);
+half2 __ovld __cnfn select(half2 a, half2 b, short2 c);
+half3 __ovld __cnfn select(half3 a, half3 b, short3 c);
+half4 __ovld __cnfn select(half4 a, half4 b, short4 c);
+half8 __ovld __cnfn select(half8 a, half8 b, short8 c);
+half16 __ovld __cnfn select(half16 a, half16 b, short16 c);
+half __ovld __cnfn select(half a, half b, ushort c);
+half2 __ovld __cnfn select(half2 a, half2 b, ushort2 c);
+half3 __ovld __cnfn select(half3 a, half3 b, ushort3 c);
+half4 __ovld __cnfn select(half4 a, half4 b, ushort4 c);
+half8 __ovld __cnfn select(half8 a, half8 b, ushort8 c);
+half16 __ovld __cnfn select(half16 a, half16 b, ushort16 c);
+#endif //cl_khr_fp16
+
+// OpenCL v1.1 s6.11.7, v1.2 s6.12.7, v2.0 s6.13.7 - Vector Data Load and Store Functions
+// OpenCL extensions v1.1 s9.6.6, v1.2 s9.5.6, v2.0 s9.4.6 - Vector Data Load and Store Functions for Half Type
+/**
+ * Use generic type gentype to indicate the built-in data types
+ * char, uchar, short, ushort, int, uint, long, ulong, float,
+ * double or half.
+ *
+ * vloadn return sizeof (gentypen) bytes of data read from address (p + (offset * n)).
+ *
+ * vstoren write sizeof (gentypen) bytes given by data to address (p + (offset * n)).
+ *
+ * The address computed as (p + (offset * n)) must be 
+ * 8-bit aligned if gentype is char, uchar;
+ * 16-bit aligned if gentype is short, ushort, half;
+ * 32-bit aligned if gentype is int, uint, float;
+ * 64-bit aligned if gentype is long, ulong, double.
+ */
+
+char2 __ovld vload2(size_t offset, const __constant char *p);
+uchar2 __ovld vload2(size_t offset, const __constant uchar *p);
+short2 __ovld vload2(size_t offset, const __constant short *p);
+ushort2 __ovld vload2(size_t offset, const __constant ushort *p);
+int2 __ovld vload2(size_t offset, const __constant int *p);
+uint2 __ovld vload2(size_t offset, const __constant uint *p);
+long2 __ovld vload2(size_t offset, const __constant long *p);
+ulong2 __ovld vload2(size_t offset, const __constant ulong *p);
+float2 __ovld vload2(size_t offset, const __constant float *p);
+char3 __ovld vload3(size_t offset, const __constant char *p);
+uchar3 __ovld vload3(size_t offset, const __constant uchar *p);
+short3 __ovld vload3(size_t offset, const __constant short *p);
+ushort3 __ovld vload3(size_t offset, const __constant ushort *p);
+int3 __ovld vload3(size_t offset, const __constant int *p);
+uint3 __ovld vload3(size_t offset, const __constant uint *p);
+long3 __ovld vload3(size_t offset, const __constant long *p);
+ulong3 __ovld vload3(size_t offset, const __constant ulong *p);
+float3 __ovld vload3(size_t offset, const __constant float *p);
+char4 __ovld vload4(size_t offset, const __constant char *p);
+uchar4 __ovld vload4(size_t offset, const __constant uchar *p);
+short4 __ovld vload4(size_t offset, const __constant short *p);
+ushort4 __ovld vload4(size_t offset, const __constant ushort *p);
+int4 __ovld vload4(size_t offset, const __constant int *p);
+uint4 __ovld vload4(size_t offset, const __constant uint *p);
+long4 __ovld vload4(size_t offset, const __constant long *p);
+ulong4 __ovld vload4(size_t offset, const __constant ulong *p);
+float4 __ovld vload4(size_t offset, const __constant float *p);
+char8 __ovld vload8(size_t offset, const __constant char *p);
+uchar8 __ovld vload8(size_t offset, const __constant uchar *p);
+short8 __ovld vload8(size_t offset, const __constant short *p);
+ushort8 __ovld vload8(size_t offset, const __constant ushort *p);
+int8 __ovld vload8(size_t offset, const __constant int *p);
+uint8 __ovld vload8(size_t offset, const __constant uint *p);
+long8 __ovld vload8(size_t offset, const __constant long *p);
+ulong8 __ovld vload8(size_t offset, const __constant ulong *p);
+float8 __ovld vload8(size_t offset, const __constant float *p);
+char16 __ovld vload16(size_t offset, const __constant char *p);
+uchar16 __ovld vload16(size_t offset, const __constant uchar *p);
+short16 __ovld vload16(size_t offset, const __constant short *p);
+ushort16 __ovld vload16(size_t offset, const __constant ushort *p);
+int16 __ovld vload16(size_t offset, const __constant int *p);
+uint16 __ovld vload16(size_t offset, const __constant uint *p);
+long16 __ovld vload16(size_t offset, const __constant long *p);
+ulong16 __ovld vload16(size_t offset, const __constant ulong *p);
+float16 __ovld vload16(size_t offset, const __constant float *p);
+#ifdef cl_khr_fp64
+double2 __ovld vload2(size_t offset, const __constant double *p);
+double3 __ovld vload3(size_t offset, const __constant double *p);
+double4 __ovld vload4(size_t offset, const __constant double *p);
+double8 __ovld vload8(size_t offset, const __constant double *p);
+double16 __ovld vload16(size_t offset, const __constant double *p);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half __ovld vload(size_t offset, const __constant half *p);
+half2 __ovld vload2(size_t offset, const __constant half *p);
+half3 __ovld vload3(size_t offset, const __constant half *p);
+half4 __ovld vload4(size_t offset, const __constant half *p);
+half8 __ovld vload8(size_t offset, const __constant half *p);
+half16 __ovld vload16(size_t offset, const __constant half *p);
+#endif //cl_khr_fp16
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+char2 __ovld vload2(size_t offset, const char *p);
+uchar2 __ovld vload2(size_t offset, const uchar *p);
+short2 __ovld vload2(size_t offset, const short *p);
+ushort2 __ovld vload2(size_t offset, const ushort *p);
+int2 __ovld vload2(size_t offset, const int *p);
+uint2 __ovld vload2(size_t offset, const uint *p);
+long2 __ovld vload2(size_t offset, const long *p);
+ulong2 __ovld vload2(size_t offset, const ulong *p);
+float2 __ovld vload2(size_t offset, const float *p);
+char3 __ovld vload3(size_t offset, const char *p);
+uchar3 __ovld vload3(size_t offset, const uchar *p);
+short3 __ovld vload3(size_t offset, const short *p);
+ushort3 __ovld vload3(size_t offset, const ushort *p);
+int3 __ovld vload3(size_t offset, const int *p);
+uint3 __ovld vload3(size_t offset, const uint *p);
+long3 __ovld vload3(size_t offset, const long *p);
+ulong3 __ovld vload3(size_t offset, const ulong *p);
+float3 __ovld vload3(size_t offset, const float *p);
+char4 __ovld vload4(size_t offset, const char *p);
+uchar4 __ovld vload4(size_t offset, const uchar *p);
+short4 __ovld vload4(size_t offset, const short *p);
+ushort4 __ovld vload4(size_t offset, const ushort *p);
+int4 __ovld vload4(size_t offset, const int *p);
+uint4 __ovld vload4(size_t offset, const uint *p);
+long4 __ovld vload4(size_t offset, const long *p);
+ulong4 __ovld vload4(size_t offset, const ulong *p);
+float4 __ovld vload4(size_t offset, const float *p);
+char8 __ovld vload8(size_t offset, const char *p);
+uchar8 __ovld vload8(size_t offset, const uchar *p);
+short8 __ovld vload8(size_t offset, const short *p);
+ushort8 __ovld vload8(size_t offset, const ushort *p);
+int8 __ovld vload8(size_t offset, const int *p);
+uint8 __ovld vload8(size_t offset, const uint *p);
+long8 __ovld vload8(size_t offset, const long *p);
+ulong8 __ovld vload8(size_t offset, const ulong *p);
+float8 __ovld vload8(size_t offset, const float *p);
+char16 __ovld vload16(size_t offset, const char *p);
+uchar16 __ovld vload16(size_t offset, const uchar *p);
+short16 __ovld vload16(size_t offset, const short *p);
+ushort16 __ovld vload16(size_t offset, const ushort *p);
+int16 __ovld vload16(size_t offset, const int *p);
+uint16 __ovld vload16(size_t offset, const uint *p);
+long16 __ovld vload16(size_t offset, const long *p);
+ulong16 __ovld vload16(size_t offset, const ulong *p);
+float16 __ovld vload16(size_t offset, const float *p);
+
+#ifdef cl_khr_fp64
+double2 __ovld vload2(size_t offset, const double *p);
+double3 __ovld vload3(size_t offset, const double *p);
+double4 __ovld vload4(size_t offset, const double *p);
+double8 __ovld vload8(size_t offset, const double *p);
+double16 __ovld vload16(size_t offset, const double *p);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half __ovld vload(size_t offset, const half *p);
+half2 __ovld vload2(size_t offset, const half *p);
+half3 __ovld vload3(size_t offset, const half *p);
+half4 __ovld vload4(size_t offset, const half *p);
+half8 __ovld vload8(size_t offset, const half *p);
+half16 __ovld vload16(size_t offset, const half *p);
+#endif //cl_khr_fp16
+#else
+char2 __ovld vload2(size_t offset, const __global char *p);
+uchar2 __ovld vload2(size_t offset, const __global uchar *p);
+short2 __ovld vload2(size_t offset, const __global short *p);
+ushort2 __ovld vload2(size_t offset, const __global ushort *p);
+int2 __ovld vload2(size_t offset, const __global int *p);
+uint2 __ovld vload2(size_t offset, const __global uint *p);
+long2 __ovld vload2(size_t offset, const __global long *p);
+ulong2 __ovld vload2(size_t offset, const __global ulong *p);
+float2 __ovld vload2(size_t offset, const __global float *p);
+char3 __ovld vload3(size_t offset, const __global char *p);
+uchar3 __ovld vload3(size_t offset, const __global uchar *p);
+short3 __ovld vload3(size_t offset, const __global short *p);
+ushort3 __ovld vload3(size_t offset, const __global ushort *p);
+int3 __ovld vload3(size_t offset, const __global int *p);
+uint3 __ovld vload3(size_t offset, const __global uint *p);
+long3 __ovld vload3(size_t offset, const __global long *p);
+ulong3 __ovld vload3(size_t offset, const __global ulong *p);
+float3 __ovld vload3(size_t offset, const __global float *p);
+char4 __ovld vload4(size_t offset, const __global char *p);
+uchar4 __ovld vload4(size_t offset, const __global uchar *p);
+short4 __ovld vload4(size_t offset, const __global short *p);
+ushort4 __ovld vload4(size_t offset, const __global ushort *p);
+int4 __ovld vload4(size_t offset, const __global int *p);
+uint4 __ovld vload4(size_t offset, const __global uint *p);
+long4 __ovld vload4(size_t offset, const __global long *p);
+ulong4 __ovld vload4(size_t offset, const __global ulong *p);
+float4 __ovld vload4(size_t offset, const __global float *p);
+char8 __ovld vload8(size_t offset, const __global char *p);
+uchar8 __ovld vload8(size_t offset, const __global uchar *p);
+short8 __ovld vload8(size_t offset, const __global short *p);
+ushort8 __ovld vload8(size_t offset, const __global ushort *p);
+int8 __ovld vload8(size_t offset, const __global int *p);
+uint8 __ovld vload8(size_t offset, const __global uint *p);
+long8 __ovld vload8(size_t offset, const __global long *p);
+ulong8 __ovld vload8(size_t offset, const __global ulong *p);
+float8 __ovld vload8(size_t offset, const __global float *p);
+char16 __ovld vload16(size_t offset, const __global char *p);
+uchar16 __ovld vload16(size_t offset, const __global uchar *p);
+short16 __ovld vload16(size_t offset, const __global short *p);
+ushort16 __ovld vload16(size_t offset, const __global ushort *p);
+int16 __ovld vload16(size_t offset, const __global int *p);
+uint16 __ovld vload16(size_t offset, const __global uint *p);
+long16 __ovld vload16(size_t offset, const __global long *p);
+ulong16 __ovld vload16(size_t offset, const __global ulong *p);
+float16 __ovld vload16(size_t offset, const __global float *p);
+char2 __ovld vload2(size_t offset, const __local char *p);
+uchar2 __ovld vload2(size_t offset, const __local uchar *p);
+short2 __ovld vload2(size_t offset, const __local short *p);
+ushort2 __ovld vload2(size_t offset, const __local ushort *p);
+int2 __ovld vload2(size_t offset, const __local int *p);
+uint2 __ovld vload2(size_t offset, const __local uint *p);
+long2 __ovld vload2(size_t offset, const __local long *p);
+ulong2 __ovld vload2(size_t offset, const __local ulong *p);
+float2 __ovld vload2(size_t offset, const __local float *p);
+char3 __ovld vload3(size_t offset, const __local char *p);
+uchar3 __ovld vload3(size_t offset, const __local uchar *p);
+short3 __ovld vload3(size_t offset, const __local short *p);
+ushort3 __ovld vload3(size_t offset, const __local ushort *p);
+int3 __ovld vload3(size_t offset, const __local int *p);
+uint3 __ovld vload3(size_t offset, const __local uint *p);
+long3 __ovld vload3(size_t offset, const __local long *p);
+ulong3 __ovld vload3(size_t offset, const __local ulong *p);
+float3 __ovld vload3(size_t offset, const __local float *p);
+char4 __ovld vload4(size_t offset, const __local char *p);
+uchar4 __ovld vload4(size_t offset, const __local uchar *p);
+short4 __ovld vload4(size_t offset, const __local short *p);
+ushort4 __ovld vload4(size_t offset, const __local ushort *p);
+int4 __ovld vload4(size_t offset, const __local int *p);
+uint4 __ovld vload4(size_t offset, const __local uint *p);
+long4 __ovld vload4(size_t offset, const __local long *p);
+ulong4 __ovld vload4(size_t offset, const __local ulong *p);
+float4 __ovld vload4(size_t offset, const __local float *p);
+char8 __ovld vload8(size_t offset, const __local char *p);
+uchar8 __ovld vload8(size_t offset, const __local uchar *p);
+short8 __ovld vload8(size_t offset, const __local short *p);
+ushort8 __ovld vload8(size_t offset, const __local ushort *p);
+int8 __ovld vload8(size_t offset, const __local int *p);
+uint8 __ovld vload8(size_t offset, const __local uint *p);
+long8 __ovld vload8(size_t offset, const __local long *p);
+ulong8 __ovld vload8(size_t offset, const __local ulong *p);
+float8 __ovld vload8(size_t offset, const __local float *p);
+char16 __ovld vload16(size_t offset, const __local char *p);
+uchar16 __ovld vload16(size_t offset, const __local uchar *p);
+short16 __ovld vload16(size_t offset, const __local short *p);
+ushort16 __ovld vload16(size_t offset, const __local ushort *p);
+int16 __ovld vload16(size_t offset, const __local int *p);
+uint16 __ovld vload16(size_t offset, const __local uint *p);
+long16 __ovld vload16(size_t offset, const __local long *p);
+ulong16 __ovld vload16(size_t offset, const __local ulong *p);
+float16 __ovld vload16(size_t offset, const __local float *p);
+char2 __ovld vload2(size_t offset, const __private char *p);
+uchar2 __ovld vload2(size_t offset, const __private uchar *p);
+short2 __ovld vload2(size_t offset, const __private short *p);
+ushort2 __ovld vload2(size_t offset, const __private ushort *p);
+int2 __ovld vload2(size_t offset, const __private int *p);
+uint2 __ovld vload2(size_t offset, const __private uint *p);
+long2 __ovld vload2(size_t offset, const __private long *p);
+ulong2 __ovld vload2(size_t offset, const __private ulong *p);
+float2 __ovld vload2(size_t offset, const __private float *p);
+char3 __ovld vload3(size_t offset, const __private char *p);
+uchar3 __ovld vload3(size_t offset, const __private uchar *p);
+short3 __ovld vload3(size_t offset, const __private short *p);
+ushort3 __ovld vload3(size_t offset, const __private ushort *p);
+int3 __ovld vload3(size_t offset, const __private int *p);
+uint3 __ovld vload3(size_t offset, const __private uint *p);
+long3 __ovld vload3(size_t offset, const __private long *p);
+ulong3 __ovld vload3(size_t offset, const __private ulong *p);
+float3 __ovld vload3(size_t offset, const __private float *p);
+char4 __ovld vload4(size_t offset, const __private char *p);
+uchar4 __ovld vload4(size_t offset, const __private uchar *p);
+short4 __ovld vload4(size_t offset, const __private short *p);
+ushort4 __ovld vload4(size_t offset, const __private ushort *p);
+int4 __ovld vload4(size_t offset, const __private int *p);
+uint4 __ovld vload4(size_t offset, const __private uint *p);
+long4 __ovld vload4(size_t offset, const __private long *p);
+ulong4 __ovld vload4(size_t offset, const __private ulong *p);
+float4 __ovld vload4(size_t offset, const __private float *p);
+char8 __ovld vload8(size_t offset, const __private char *p);
+uchar8 __ovld vload8(size_t offset, const __private uchar *p);
+short8 __ovld vload8(size_t offset, const __private short *p);
+ushort8 __ovld vload8(size_t offset, const __private ushort *p);
+int8 __ovld vload8(size_t offset, const __private int *p);
+uint8 __ovld vload8(size_t offset, const __private uint *p);
+long8 __ovld vload8(size_t offset, const __private long *p);
+ulong8 __ovld vload8(size_t offset, const __private ulong *p);
+float8 __ovld vload8(size_t offset, const __private float *p);
+char16 __ovld vload16(size_t offset, const __private char *p);
+uchar16 __ovld vload16(size_t offset, const __private uchar *p);
+short16 __ovld vload16(size_t offset, const __private short *p);
+ushort16 __ovld vload16(size_t offset, const __private ushort *p);
+int16 __ovld vload16(size_t offset, const __private int *p);
+uint16 __ovld vload16(size_t offset, const __private uint *p);
+long16 __ovld vload16(size_t offset, const __private long *p);
+ulong16 __ovld vload16(size_t offset, const __private ulong *p);
+float16 __ovld vload16(size_t offset, const __private float *p);
+
+#ifdef cl_khr_fp64
+double2 __ovld vload2(size_t offset, const __global double *p);
+double3 __ovld vload3(size_t offset, const __global double *p);
+double4 __ovld vload4(size_t offset, const __global double *p);
+double8 __ovld vload8(size_t offset, const __global double *p);
+double16 __ovld vload16(size_t offset, const __global double *p);
+double2 __ovld vload2(size_t offset, const __local double *p);
+double3 __ovld vload3(size_t offset, const __local double *p);
+double4 __ovld vload4(size_t offset, const __local double *p);
+double8 __ovld vload8(size_t offset, const __local double *p);
+double16 __ovld vload16(size_t offset, const __local double *p);
+double2 __ovld vload2(size_t offset, const __private double *p);
+double3 __ovld vload3(size_t offset, const __private double *p);
+double4 __ovld vload4(size_t offset, const __private double *p);
+double8 __ovld vload8(size_t offset, const __private double *p);
+double16 __ovld vload16(size_t offset, const __private double *p);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half __ovld vload(size_t offset, const __global half *p);
+half2 __ovld vload2(size_t offset, const __global half *p);
+half3 __ovld vload3(size_t offset, const __global half *p);
+half4 __ovld vload4(size_t offset, const __global half *p);
+half8 __ovld vload8(size_t offset, const __global half *p);
+half16 __ovld vload16(size_t offset, const __global half *p);
+half __ovld vload(size_t offset, const __local half *p);
+half2 __ovld vload2(size_t offset, const __local half *p);
+half3 __ovld vload3(size_t offset, const __local half *p);
+half4 __ovld vload4(size_t offset, const __local half *p);
+half8 __ovld vload8(size_t offset, const __local half *p);
+half16 __ovld vload16(size_t offset, const __local half *p);
+half __ovld vload(size_t offset, const __private half *p);
+half2 __ovld vload2(size_t offset, const __private half *p);
+half3 __ovld vload3(size_t offset, const __private half *p);
+half4 __ovld vload4(size_t offset, const __private half *p);
+half8 __ovld vload8(size_t offset, const __private half *p);
+half16 __ovld vload16(size_t offset, const __private half *p);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstore2(char2 data, size_t offset, char *p);
+void __ovld vstore2(uchar2 data, size_t offset, uchar *p);
+void __ovld vstore2(short2 data, size_t offset, short *p);
+void __ovld vstore2(ushort2 data, size_t offset, ushort *p);
+void __ovld vstore2(int2 data, size_t offset, int *p);
+void __ovld vstore2(uint2 data, size_t offset, uint *p);
+void __ovld vstore2(long2 data, size_t offset, long *p);
+void __ovld vstore2(ulong2 data, size_t offset, ulong *p);
+void __ovld vstore2(float2 data, size_t offset, float *p);
+void __ovld vstore3(char3 data, size_t offset, char *p);
+void __ovld vstore3(uchar3 data, size_t offset, uchar *p);
+void __ovld vstore3(short3 data, size_t offset, short *p);
+void __ovld vstore3(ushort3 data, size_t offset, ushort *p);
+void __ovld vstore3(int3 data, size_t offset, int *p);
+void __ovld vstore3(uint3 data, size_t offset, uint *p);
+void __ovld vstore3(long3 data, size_t offset, long *p);
+void __ovld vstore3(ulong3 data, size_t offset, ulong *p);
+void __ovld vstore3(float3 data, size_t offset, float *p);
+void __ovld vstore4(char4 data, size_t offset, char *p);
+void __ovld vstore4(uchar4 data, size_t offset, uchar *p);
+void __ovld vstore4(short4 data, size_t offset, short *p);
+void __ovld vstore4(ushort4 data, size_t offset, ushort *p);
+void __ovld vstore4(int4 data, size_t offset, int *p);
+void __ovld vstore4(uint4 data, size_t offset, uint *p);
+void __ovld vstore4(long4 data, size_t offset, long *p);
+void __ovld vstore4(ulong4 data, size_t offset, ulong *p);
+void __ovld vstore4(float4 data, size_t offset, float *p);
+void __ovld vstore8(char8 data, size_t offset, char *p);
+void __ovld vstore8(uchar8 data, size_t offset, uchar *p);
+void __ovld vstore8(short8 data, size_t offset, short *p);
+void __ovld vstore8(ushort8 data, size_t offset, ushort *p);
+void __ovld vstore8(int8 data, size_t offset, int *p);
+void __ovld vstore8(uint8 data, size_t offset, uint *p);
+void __ovld vstore8(long8 data, size_t offset, long *p);
+void __ovld vstore8(ulong8 data, size_t offset, ulong *p);
+void __ovld vstore8(float8 data, size_t offset, float *p);
+void __ovld vstore16(char16 data, size_t offset, char *p);
+void __ovld vstore16(uchar16 data, size_t offset, uchar *p);
+void __ovld vstore16(short16 data, size_t offset, short *p);
+void __ovld vstore16(ushort16 data, size_t offset, ushort *p);
+void __ovld vstore16(int16 data, size_t offset, int *p);
+void __ovld vstore16(uint16 data, size_t offset, uint *p);
+void __ovld vstore16(long16 data, size_t offset, long *p);
+void __ovld vstore16(ulong16 data, size_t offset, ulong *p);
+void __ovld vstore16(float16 data, size_t offset, float *p);
+#ifdef cl_khr_fp64
+void __ovld vstore2(double2 data, size_t offset, double *p);
+void __ovld vstore3(double3 data, size_t offset, double *p);
+void __ovld vstore4(double4 data, size_t offset, double *p);
+void __ovld vstore8(double8 data, size_t offset, double *p);
+void __ovld vstore16(double16 data, size_t offset, double *p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+void __ovld vstore(half data, size_t offset, half *p);
+void __ovld vstore2(half2 data, size_t offset, half *p);
+void __ovld vstore3(half3 data, size_t offset, half *p);
+void __ovld vstore4(half4 data, size_t offset, half *p);
+void __ovld vstore8(half8 data, size_t offset, half *p);
+void __ovld vstore16(half16 data, size_t offset, half *p);
+#endif //cl_khr_fp16
+#else
+void __ovld vstore2(char2 data, size_t offset, __global char *p);
+void __ovld vstore2(uchar2 data, size_t offset, __global uchar *p);
+void __ovld vstore2(short2 data, size_t offset, __global short *p);
+void __ovld vstore2(ushort2 data, size_t offset, __global ushort *p);
+void __ovld vstore2(int2 data, size_t offset, __global int *p);
+void __ovld vstore2(uint2 data, size_t offset, __global uint *p);
+void __ovld vstore2(long2 data, size_t offset, __global long *p);
+void __ovld vstore2(ulong2 data, size_t offset, __global ulong *p);
+void __ovld vstore2(float2 data, size_t offset, __global float *p);
+void __ovld vstore3(char3 data, size_t offset, __global char *p);
+void __ovld vstore3(uchar3 data, size_t offset, __global uchar *p);
+void __ovld vstore3(short3 data, size_t offset, __global short *p);
+void __ovld vstore3(ushort3 data, size_t offset, __global ushort *p);
+void __ovld vstore3(int3 data, size_t offset, __global int *p);
+void __ovld vstore3(uint3 data, size_t offset, __global uint *p);
+void __ovld vstore3(long3 data, size_t offset, __global long *p);
+void __ovld vstore3(ulong3 data, size_t offset, __global ulong *p);
+void __ovld vstore3(float3 data, size_t offset, __global float *p);
+void __ovld vstore4(char4 data, size_t offset, __global char *p);
+void __ovld vstore4(uchar4 data, size_t offset, __global uchar *p);
+void __ovld vstore4(short4 data, size_t offset, __global short *p);
+void __ovld vstore4(ushort4 data, size_t offset, __global ushort *p);
+void __ovld vstore4(int4 data, size_t offset, __global int *p);
+void __ovld vstore4(uint4 data, size_t offset, __global uint *p);
+void __ovld vstore4(long4 data, size_t offset, __global long *p);
+void __ovld vstore4(ulong4 data, size_t offset, __global ulong *p);
+void __ovld vstore4(float4 data, size_t offset, __global float *p);
+void __ovld vstore8(char8 data, size_t offset, __global char *p);
+void __ovld vstore8(uchar8 data, size_t offset, __global uchar *p);
+void __ovld vstore8(short8 data, size_t offset, __global short *p);
+void __ovld vstore8(ushort8 data, size_t offset, __global ushort *p);
+void __ovld vstore8(int8 data, size_t offset, __global int *p);
+void __ovld vstore8(uint8 data, size_t offset, __global uint *p);
+void __ovld vstore8(long8 data, size_t offset, __global long *p);
+void __ovld vstore8(ulong8 data, size_t offset, __global ulong *p);
+void __ovld vstore8(float8 data, size_t offset, __global float *p);
+void __ovld vstore16(char16 data, size_t offset, __global char *p);
+void __ovld vstore16(uchar16 data, size_t offset, __global uchar *p);
+void __ovld vstore16(short16 data, size_t offset, __global short *p);
+void __ovld vstore16(ushort16 data, size_t offset, __global ushort *p);
+void __ovld vstore16(int16 data, size_t offset, __global int *p);
+void __ovld vstore16(uint16 data, size_t offset, __global uint *p);
+void __ovld vstore16(long16 data, size_t offset, __global long *p);
+void __ovld vstore16(ulong16 data, size_t offset, __global ulong *p);
+void __ovld vstore16(float16 data, size_t offset, __global float *p);
+void __ovld vstore2(char2 data, size_t offset, __local char *p);
+void __ovld vstore2(uchar2 data, size_t offset, __local uchar *p);
+void __ovld vstore2(short2 data, size_t offset, __local short *p);
+void __ovld vstore2(ushort2 data, size_t offset, __local ushort *p);
+void __ovld vstore2(int2 data, size_t offset, __local int *p);
+void __ovld vstore2(uint2 data, size_t offset, __local uint *p);
+void __ovld vstore2(long2 data, size_t offset, __local long *p);
+void __ovld vstore2(ulong2 data, size_t offset, __local ulong *p);
+void __ovld vstore2(float2 data, size_t offset, __local float *p);
+void __ovld vstore3(char3 data, size_t offset, __local char *p);
+void __ovld vstore3(uchar3 data, size_t offset, __local uchar *p);
+void __ovld vstore3(short3 data, size_t offset, __local short *p);
+void __ovld vstore3(ushort3 data, size_t offset, __local ushort *p);
+void __ovld vstore3(int3 data, size_t offset, __local int *p);
+void __ovld vstore3(uint3 data, size_t offset, __local uint *p);
+void __ovld vstore3(long3 data, size_t offset, __local long *p);
+void __ovld vstore3(ulong3 data, size_t offset, __local ulong *p);
+void __ovld vstore3(float3 data, size_t offset, __local float *p);
+void __ovld vstore4(char4 data, size_t offset, __local char *p);
+void __ovld vstore4(uchar4 data, size_t offset, __local uchar *p);
+void __ovld vstore4(short4 data, size_t offset, __local short *p);
+void __ovld vstore4(ushort4 data, size_t offset, __local ushort *p);
+void __ovld vstore4(int4 data, size_t offset, __local int *p);
+void __ovld vstore4(uint4 data, size_t offset, __local uint *p);
+void __ovld vstore4(long4 data, size_t offset, __local long *p);
+void __ovld vstore4(ulong4 data, size_t offset, __local ulong *p);
+void __ovld vstore4(float4 data, size_t offset, __local float *p);
+void __ovld vstore8(char8 data, size_t offset, __local char *p);
+void __ovld vstore8(uchar8 data, size_t offset, __local uchar *p);
+void __ovld vstore8(short8 data, size_t offset, __local short *p);
+void __ovld vstore8(ushort8 data, size_t offset, __local ushort *p);
+void __ovld vstore8(int8 data, size_t offset, __local int *p);
+void __ovld vstore8(uint8 data, size_t offset, __local uint *p);
+void __ovld vstore8(long8 data, size_t offset, __local long *p);
+void __ovld vstore8(ulong8 data, size_t offset, __local ulong *p);
+void __ovld vstore8(float8 data, size_t offset, __local float *p);
+void __ovld vstore16(char16 data, size_t offset, __local char *p);
+void __ovld vstore16(uchar16 data, size_t offset, __local uchar *p);
+void __ovld vstore16(short16 data, size_t offset, __local short *p);
+void __ovld vstore16(ushort16 data, size_t offset, __local ushort *p);
+void __ovld vstore16(int16 data, size_t offset, __local int *p);
+void __ovld vstore16(uint16 data, size_t offset, __local uint *p);
+void __ovld vstore16(long16 data, size_t offset, __local long *p);
+void __ovld vstore16(ulong16 data, size_t offset, __local ulong *p);
+void __ovld vstore16(float16 data, size_t offset, __local float *p);
+void __ovld vstore2(char2 data, size_t offset, __private char *p);
+void __ovld vstore2(uchar2 data, size_t offset, __private uchar *p);
+void __ovld vstore2(short2 data, size_t offset, __private short *p);
+void __ovld vstore2(ushort2 data, size_t offset, __private ushort *p);
+void __ovld vstore2(int2 data, size_t offset, __private int *p);
+void __ovld vstore2(uint2 data, size_t offset, __private uint *p);
+void __ovld vstore2(long2 data, size_t offset, __private long *p);
+void __ovld vstore2(ulong2 data, size_t offset, __private ulong *p);
+void __ovld vstore2(float2 data, size_t offset, __private float *p);
+void __ovld vstore3(char3 data, size_t offset, __private char *p);
+void __ovld vstore3(uchar3 data, size_t offset, __private uchar *p);
+void __ovld vstore3(short3 data, size_t offset, __private short *p);
+void __ovld vstore3(ushort3 data, size_t offset, __private ushort *p);
+void __ovld vstore3(int3 data, size_t offset, __private int *p);
+void __ovld vstore3(uint3 data, size_t offset, __private uint *p);
+void __ovld vstore3(long3 data, size_t offset, __private long *p);
+void __ovld vstore3(ulong3 data, size_t offset, __private ulong *p);
+void __ovld vstore3(float3 data, size_t offset, __private float *p);
+void __ovld vstore4(char4 data, size_t offset, __private char *p);
+void __ovld vstore4(uchar4 data, size_t offset, __private uchar *p);
+void __ovld vstore4(short4 data, size_t offset, __private short *p);
+void __ovld vstore4(ushort4 data, size_t offset, __private ushort *p);
+void __ovld vstore4(int4 data, size_t offset, __private int *p);
+void __ovld vstore4(uint4 data, size_t offset, __private uint *p);
+void __ovld vstore4(long4 data, size_t offset, __private long *p);
+void __ovld vstore4(ulong4 data, size_t offset, __private ulong *p);
+void __ovld vstore4(float4 data, size_t offset, __private float *p);
+void __ovld vstore8(char8 data, size_t offset, __private char *p);
+void __ovld vstore8(uchar8 data, size_t offset, __private uchar *p);
+void __ovld vstore8(short8 data, size_t offset, __private short *p);
+void __ovld vstore8(ushort8 data, size_t offset, __private ushort *p);
+void __ovld vstore8(int8 data, size_t offset, __private int *p);
+void __ovld vstore8(uint8 data, size_t offset, __private uint *p);
+void __ovld vstore8(long8 data, size_t offset, __private long *p);
+void __ovld vstore8(ulong8 data, size_t offset, __private ulong *p);
+void __ovld vstore8(float8 data, size_t offset, __private float *p);
+void __ovld vstore16(char16 data, size_t offset, __private char *p);
+void __ovld vstore16(uchar16 data, size_t offset, __private uchar *p);
+void __ovld vstore16(short16 data, size_t offset, __private short *p);
+void __ovld vstore16(ushort16 data, size_t offset, __private ushort *p);
+void __ovld vstore16(int16 data, size_t offset, __private int *p);
+void __ovld vstore16(uint16 data, size_t offset, __private uint *p);
+void __ovld vstore16(long16 data, size_t offset, __private long *p);
+void __ovld vstore16(ulong16 data, size_t offset, __private ulong *p);
+void __ovld vstore16(float16 data, size_t offset, __private float *p);
+#ifdef cl_khr_fp64
+void __ovld vstore2(double2 data, size_t offset, __global double *p);
+void __ovld vstore3(double3 data, size_t offset, __global double *p);
+void __ovld vstore4(double4 data, size_t offset, __global double *p);
+void __ovld vstore8(double8 data, size_t offset, __global double *p);
+void __ovld vstore16(double16 data, size_t offset, __global double *p);
+void __ovld vstore2(double2 data, size_t offset, __local double *p);
+void __ovld vstore3(double3 data, size_t offset, __local double *p);
+void __ovld vstore4(double4 data, size_t offset, __local double *p);
+void __ovld vstore8(double8 data, size_t offset, __local double *p);
+void __ovld vstore16(double16 data, size_t offset, __local double *p);
+void __ovld vstore2(double2 data, size_t offset, __private double *p);
+void __ovld vstore3(double3 data, size_t offset, __private double *p);
+void __ovld vstore4(double4 data, size_t offset, __private double *p);
+void __ovld vstore8(double8 data, size_t offset, __private double *p);
+void __ovld vstore16(double16 data, size_t offset, __private double *p);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+void __ovld vstore(half data, size_t offset, __global half *p);
+void __ovld vstore2(half2 data, size_t offset, __global half *p);
+void __ovld vstore3(half3 data, size_t offset, __global half *p);
+void __ovld vstore4(half4 data, size_t offset, __global half *p);
+void __ovld vstore8(half8 data, size_t offset, __global half *p);
+void __ovld vstore16(half16 data, size_t offset, __global half *p);
+void __ovld vstore(half data, size_t offset, __local half *p);
+void __ovld vstore2(half2 data, size_t offset, __local half *p);
+void __ovld vstore3(half3 data, size_t offset, __local half *p);
+void __ovld vstore4(half4 data, size_t offset, __local half *p);
+void __ovld vstore8(half8 data, size_t offset, __local half *p);
+void __ovld vstore16(half16 data, size_t offset, __local half *p);
+void __ovld vstore(half data, size_t offset, __private half *p);
+void __ovld vstore2(half2 data, size_t offset, __private half *p);
+void __ovld vstore3(half3 data, size_t offset, __private half *p);
+void __ovld vstore4(half4 data, size_t offset, __private half *p);
+void __ovld vstore8(half8 data, size_t offset, __private half *p);
+void __ovld vstore16(half16 data, size_t offset, __private half *p);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Read sizeof (half) bytes of data from address
+ * (p + offset). The data read is interpreted as a
+ * half value. The half value is converted to a
+ * float value and the float value is returned.
+ * The read address computed as (p + offset)
+ * must be 16-bit aligned.
+ */
+float __ovld vload_half(size_t offset, const __constant half *p);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld vload_half(size_t offset, const half *p);
+#else
+float __ovld vload_half(size_t offset, const __global half *p);
+float __ovld vload_half(size_t offset, const __local half *p);
+float __ovld vload_half(size_t offset, const __private half *p);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Read sizeof (halfn) bytes of data from address
+ * (p + (offset * n)). The data read is interpreted
+ * as a halfn value. The halfn value read is
+ * converted to a floatn value and the floatn
+ * value is returned. The read address computed
+ * as (p + (offset * n)) must be 16-bit aligned.
+ */
+float2 __ovld vload_half2(size_t offset, const __constant half *p);
+float3 __ovld vload_half3(size_t offset, const __constant half *p);
+float4 __ovld vload_half4(size_t offset, const __constant half *p);
+float8 __ovld vload_half8(size_t offset, const __constant half *p);
+float16 __ovld vload_half16(size_t offset, const __constant half *p);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float2 __ovld vload_half2(size_t offset, const half *p);
+float3 __ovld vload_half3(size_t offset, const half *p);
+float4 __ovld vload_half4(size_t offset, const half *p);
+float8 __ovld vload_half8(size_t offset, const half *p);
+float16 __ovld vload_half16(size_t offset, const half *p);
+#else
+float2 __ovld vload_half2(size_t offset, const __global half *p);
+float3 __ovld vload_half3(size_t offset, const __global half *p);
+float4 __ovld vload_half4(size_t offset, const __global half *p);
+float8 __ovld vload_half8(size_t offset, const __global half *p);
+float16 __ovld vload_half16(size_t offset, const __global half *p);
+float2 __ovld vload_half2(size_t offset, const __local half *p);
+float3 __ovld vload_half3(size_t offset, const __local half *p);
+float4 __ovld vload_half4(size_t offset, const __local half *p);
+float8 __ovld vload_half8(size_t offset, const __local half *p);
+float16 __ovld vload_half16(size_t offset, const __local half *p);
+float2 __ovld vload_half2(size_t offset, const __private half *p);
+float3 __ovld vload_half3(size_t offset, const __private half *p);
+float4 __ovld vload_half4(size_t offset, const __private half *p);
+float8 __ovld vload_half8(size_t offset, const __private half *p);
+float16 __ovld vload_half16(size_t offset, const __private half *p);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * The float value given by data is first
+ * converted to a half value using the appropriate
+ * rounding mode. The half value is then written
+ * to address computed as (p + offset). The
+ * address computed as (p + offset) must be 16-
+ * bit aligned.
+ * vstore_half use the current rounding mode.
+ * The default current rounding mode is round to
+ * nearest even.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstore_half(float data, size_t offset, half *p);
+void __ovld vstore_half_rte(float data, size_t offset, half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half(double data, size_t offset, half *p);
+void __ovld vstore_half_rte(double data, size_t offset, half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, half *p);
+#endif //cl_khr_fp64
+#else
+void __ovld vstore_half(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rte(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, __global half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, __global half *p);
+void __ovld vstore_half(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rte(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, __local half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, __local half *p);
+void __ovld vstore_half(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rte(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rtz(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rtp(float data, size_t offset, __private half *p);
+void __ovld vstore_half_rtn(float data, size_t offset, __private half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rte(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, __global half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, __global half *p);
+void __ovld vstore_half(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rte(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, __local half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, __local half *p);
+void __ovld vstore_half(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rte(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rtz(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rtp(double data, size_t offset, __private half *p);
+void __ovld vstore_half_rtn(double data, size_t offset, __private half *p);
+#endif //cl_khr_fp64
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * The floatn value given by data is converted to
+ * a halfn value using the appropriate rounding
+ * mode. The halfn value is then written to
+ * address computed as (p + (offset * n)). The
+ * address computed as (p + (offset * n)) must be
+ * 16-bit aligned.
+ * vstore_halfn uses the current rounding mode.
+ * The default current rounding mode is round to
+ * nearest even.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstore_half2(float2 data, size_t offset, half *p);
+void __ovld vstore_half3(float3 data, size_t offset, half *p);
+void __ovld vstore_half4(float4 data, size_t offset, half *p);
+void __ovld vstore_half8(float8 data, size_t offset, half *p);
+void __ovld vstore_half16(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half2(double2 data, size_t offset, half *p);
+void __ovld vstore_half3(double3 data, size_t offset, half *p);
+void __ovld vstore_half4(double4 data, size_t offset, half *p);
+void __ovld vstore_half8(double8 data, size_t offset, half *p);
+void __ovld vstore_half16(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, half *p);
+#endif //cl_khr_fp64
+#else
+void __ovld vstore_half2(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, __global half *p);
+void __ovld vstore_half2(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, __local half *p);
+void __ovld vstore_half2(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rte(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rte(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rte(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rte(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rte(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtz(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtz(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtz(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtz(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtz(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtp(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtp(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtp(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtp(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtp(float16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtn(float2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtn(float3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtn(float4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtn(float8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtn(float16 data, size_t offset, __private half *p);
+#ifdef cl_khr_fp64
+void __ovld vstore_half2(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, __global half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, __global half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, __global half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, __global half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, __global half *p);
+void __ovld vstore_half2(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, __local half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, __local half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, __local half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, __local half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, __local half *p);
+void __ovld vstore_half2(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rte(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rte(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rte(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rte(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rte(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtz(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtz(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtz(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtz(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtz(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtp(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtp(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtp(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtp(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtp(double16 data, size_t offset, __private half *p);
+void __ovld vstore_half2_rtn(double2 data, size_t offset, __private half *p);
+void __ovld vstore_half3_rtn(double3 data, size_t offset, __private half *p);
+void __ovld vstore_half4_rtn(double4 data, size_t offset, __private half *p);
+void __ovld vstore_half8_rtn(double8 data, size_t offset, __private half *p);
+void __ovld vstore_half16_rtn(double16 data, size_t offset, __private half *p);
+#endif //cl_khr_fp64
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * For n = 1, 2, 4, 8 and 16 read sizeof (halfn)
+ * bytes of data from address (p + (offset * n)).
+ * The data read is interpreted as a halfn value.
+ * The halfn value read is converted to a floatn
+ * value and the floatn value is returned.
+ * The address computed as (p + (offset * n))
+ * must be aligned to sizeof (halfn) bytes.
+ * For n = 3, vloada_half3 reads a half3 from
+ * address (p + (offset * 4)) and returns a float3.
+ * The address computed as (p + (offset * 4))
+ * must be aligned to sizeof (half) * 4 bytes.
+ */
+float __ovld vloada_half(size_t offset, const __constant half *p);
+float2 __ovld vloada_half2(size_t offset, const __constant half *p);
+float3 __ovld vloada_half3(size_t offset, const __constant half *p);
+float4 __ovld vloada_half4(size_t offset, const __constant half *p);
+float8 __ovld vloada_half8(size_t offset, const __constant half *p);
+float16 __ovld vloada_half16(size_t offset, const __constant half *p);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float __ovld vloada_half(size_t offset, const half *p);
+float2 __ovld vloada_half2(size_t offset, const half *p);
+float3 __ovld vloada_half3(size_t offset, const half *p);
+float4 __ovld vloada_half4(size_t offset, const half *p);
+float8 __ovld vloada_half8(size_t offset, const half *p);
+float16 __ovld vloada_half16(size_t offset, const half *p);
+#else
+float __ovld vloada_half(size_t offset, const __global half *p);
+float2 __ovld vloada_half2(size_t offset, const __global half *p);
+float3 __ovld vloada_half3(size_t offset, const __global half *p);
+float4 __ovld vloada_half4(size_t offset, const __global half *p);
+float8 __ovld vloada_half8(size_t offset, const __global half *p);
+float16 __ovld vloada_half16(size_t offset, const __global half *p);
+float __ovld vloada_half(size_t offset, const __local half *p);
+float2 __ovld vloada_half2(size_t offset, const __local half *p);
+float3 __ovld vloada_half3(size_t offset, const __local half *p);
+float4 __ovld vloada_half4(size_t offset, const __local half *p);
+float8 __ovld vloada_half8(size_t offset, const __local half *p);
+float16 __ovld vloada_half16(size_t offset, const __local half *p);
+float __ovld vloada_half(size_t offset, const __private half *p);
+float2 __ovld vloada_half2(size_t offset, const __private half *p);
+float3 __ovld vloada_half3(size_t offset, const __private half *p);
+float4 __ovld vloada_half4(size_t offset, const __private half *p);
+float8 __ovld vloada_half8(size_t offset, const __private half *p);
+float16 __ovld vloada_half16(size_t offset, const __private half *p);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * The floatn value given by data is converted to
+ * a halfn value using the appropriate rounding
+ * mode.
+ * For n = 1, 2, 4, 8 and 16, the halfn value is
+ * written to the address computed as (p + (offset
+ * * n)). The address computed as (p + (offset *
+ * n)) must be aligned to sizeof (halfn) bytes.
+ * For n = 3, the half3 value is written to the
+ * address computed as (p + (offset * 4)). The
+ * address computed as (p + (offset * 4)) must be
+ * aligned to sizeof (half) * 4 bytes.
+ * vstorea_halfn uses the current rounding
+ * mode. The default current rounding mode is
+ * round to nearest even.
+ */
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld vstorea_half(float data, size_t offset, half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, half *p);
+
+#ifdef cl_khr_fp64
+void __ovld vstorea_half(double data, size_t offset, half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, half *p);
+#endif //cl_khr_fp64
+
+#else
+void __ovld vstorea_half(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rte(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rte(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rte(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rte(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rte(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rte(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtz(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtz(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtz(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtz(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtz(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtz(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtp(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtp(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtp(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtp(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtp(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtp(float16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtn(float data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtn(float2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtn(float3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtn(float4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtn(float8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtn(float16 data, size_t offset, __private half *p);
+
+#ifdef cl_khr_fp64
+void __ovld vstorea_half(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, __global half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, __global half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, __global half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, __global half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, __global half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, __global half *p);
+
+void __ovld vstorea_half(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, __local half *p);
+void __ovld vstorea_half2_rtn(double2 data, size_t offset, __local half *p);
+void __ovld vstorea_half3_rtn(double3 data, size_t offset, __local half *p);
+void __ovld vstorea_half4_rtn(double4 data, size_t offset, __local half *p);
+void __ovld vstorea_half8_rtn(double8 data, size_t offset, __local half *p);
+void __ovld vstorea_half16_rtn(double16 data, size_t offset, __local half *p);
+
+void __ovld vstorea_half(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rte(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rte(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rte(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rte(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rte(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rte(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtz(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtz(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtz(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtz(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtz(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtz(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtp(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtp(double2 data, size_t offset, __private half *p);
+void __ovld vstorea_half3_rtp(double3 data, size_t offset, __private half *p);
+void __ovld vstorea_half4_rtp(double4 data, size_t offset, __private half *p);
+void __ovld vstorea_half8_rtp(double8 data, size_t offset, __private half *p);
+void __ovld vstorea_half16_rtp(double16 data, size_t offset, __private half *p);
+
+void __ovld vstorea_half_rtn(double data, size_t offset, __private half *p);
+void __ovld vstorea_half2_rtn(double2 data,size_t offset, __private half *p);
+void __ovld vstorea_half3_rtn(double3 data,size_t offset, __private half *p);
+void __ovld vstorea_half4_rtn(double4 data,size_t offset, __private half *p);
+void __ovld vstorea_half8_rtn(double8 data,size_t offset, __private half *p);
+void __ovld vstorea_half16_rtn(double16 data,size_t offset, __private half *p);
+#endif //cl_khr_fp64
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions
+
+// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence
+typedef uint cl_mem_fence_flags;
+
+/**
+ * Queue a memory fence to ensure correct
+ * ordering of memory operations to local memory
+ */
+#define CLK_LOCAL_MEM_FENCE    0x01
+
+/**
+ * Queue a memory fence to ensure correct
+ * ordering of memory operations to global memory
+ */
+#define CLK_GLOBAL_MEM_FENCE   0x02
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+/**
+ * Queue a memory fence to ensure correct ordering of memory
+ * operations between work-items of a work-group to
+ * image memory.
+ */
+#define CLK_IMAGE_MEM_FENCE  0x04
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * All work-items in a work-group executing the kernel
+ * on a processor must execute this function before any
+ * are allowed to continue execution beyond the barrier.
+ * This function must be encountered by all work-items in
+ * a work-group executing the kernel.
+ * If barrier is inside a conditional statement, then all
+ * work-items must enter the conditional if any work-item
+ * enters the conditional statement and executes the
+ * barrier.
+ * If barrer is inside a loop, all work-items must execute
+ * the barrier for each iteration of the loop before any are
+ * allowed to continue execution beyond the barrier.
+ * The barrier function also queues a memory fence
+ * (reads and writes) to ensure correct ordering of
+ * memory operations to local or global memory.
+ * The flags argument specifies the memory address space
+ * and can be set to a combination of the following literal
+ * values.
+ * CLK_LOCAL_MEM_FENCE - The barrier function
+ * will either flush any variables stored in local memory
+ * or queue a memory fence to ensure correct ordering of
+ * memory operations to local memory.
+ * CLK_GLOBAL_MEM_FENCE - The barrier function
+ * will queue a memory fence to ensure correct ordering
+ * of memory operations to global memory. This can be
+ * useful when work-items, for example, write to buffer or
+ * image objects and then want to read the updated data.
+ */
+
+void __ovld __conv barrier(cl_mem_fence_flags flags);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+typedef enum memory_scope
+{
+  memory_scope_work_item,
+  memory_scope_work_group,
+  memory_scope_device,
+  memory_scope_all_svm_devices,
+  memory_scope_sub_group
+} memory_scope;
+
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
+
+/**
+ * Orders loads and stores of a work-item
+ * executing a kernel. This means that loads
+ * and stores preceding the mem_fence will
+ * be committed to memory before any loads
+ * and stores following the mem_fence.
+ * The flags argument specifies the memory
+ * address space and can be set to a
+ * combination of the following literal
+ * values:
+ * CLK_LOCAL_MEM_FENCE
+ * CLK_GLOBAL_MEM_FENCE.
+ */
+void __ovld mem_fence(cl_mem_fence_flags flags);
+
+/**
+ * Read memory barrier that orders only
+ * loads.
+ * The flags argument specifies the memory
+ * address space and can be set to to a
+ * combination of the following literal
+ * values:
+ * CLK_LOCAL_MEM_FENCE
+ * CLK_GLOBAL_MEM_FENCE.
+ */
+void __ovld read_mem_fence(cl_mem_fence_flags flags);
+
+/**
+ * Write memory barrier that orders only
+ * stores.
+ * The flags argument specifies the memory
+ * address space and can be set to to a
+ * combination of the following literal
+ * values:
+ * CLK_LOCAL_MEM_FENCE
+ * CLK_GLOBAL_MEM_FENCE.
+ */
+void __ovld write_mem_fence(cl_mem_fence_flags flags);
+
+// OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+cl_mem_fence_flags __ovld get_fence(const void *ptr);
+cl_mem_fence_flags __ovld get_fence(void *ptr);
+
+/** 
+ * Builtin functions to_global, to_local, and to_private need to be declared as Clang builtin functions
+ * and checked in Sema since they should be declared as
+ *   addr gentype* to_addr (gentype*);
+ * where gentype is builtin type or user defined type.
+ */
+
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.10, v1.2 s6.12.10, v2.0 s6.13.10 - Async Copies from Global to Local Memory, Local to Global Memory, and Prefetch
+
+/**
+ * event_t async_work_group_copy (
+ * __global gentype *dst,
+ * const __local gentype *src,
+ * size_t num_elements,
+ * event_t event)
+ * Perform an async copy of num_elements
+ * gentype elements from src to dst. The async
+ * copy is performed by all work-items in a workgroup
+ * and this built-in function must therefore
+ * be encountered by all work-items in a workgroup
+ * executing the kernel with the same
+ * argument values; otherwise the results are
+ * undefined.
+ * Returns an event object that can be used by
+ * wait_group_events to wait for the async copy
+ * to finish. The event argument can also be used
+ * to associate the async_work_group_copy with
+ * a previous async copy allowing an event to be
+ * shared by multiple async copies; otherwise event
+ * should be zero.
+ * If event argument is non-zero, the event object
+ * supplied in event argument will be returned.
+ * This function does not perform any implicit
+ * synchronization of source data such as using a
+ * barrier before performing the copy.
+ */
+event_t __ovld async_work_group_copy(__local char *dst, const __global char *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar *dst, const __global uchar *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short *dst, const __global short *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort *dst, const __global ushort *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int *dst, const __global int *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint *dst, const __global uint *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long *dst, const __global long *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong *dst, const __global ulong *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float *dst, const __global float *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char2 *dst, const __global char2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar2 *dst, const __global uchar2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short2 *dst, const __global short2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort2 *dst, const __global ushort2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int2 *dst, const __global int2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint2 *dst, const __global uint2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long2 *dst, const __global long2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong2 *dst, const __global ulong2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float2 *dst, const __global float2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char3 *dst, const __global char3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar3 *dst, const __global uchar3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short3 *dst, const __global short3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort3 *dst, const __global ushort3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int3 *dst, const __global int3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint3 *dst, const __global uint3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long3 *dst, const __global long3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong3 *dst, const __global ulong3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float3 *dst, const __global float3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char4 *dst, const __global char4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar4 *dst, const __global uchar4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short4 *dst, const __global short4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort4 *dst, const __global ushort4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int4 *dst, const __global int4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint4 *dst, const __global uint4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long4 *dst, const __global long4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong4 *dst, const __global ulong4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float4 *dst, const __global float4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char8 *dst, const __global char8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar8 *dst, const __global uchar8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short8 *dst, const __global short8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort8 *dst, const __global ushort8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int8 *dst, const __global int8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint8 *dst, const __global uint8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long8 *dst, const __global long8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong8 *dst, const __global ulong8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float8 *dst, const __global float8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local char16 *dst, const __global char16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uchar16 *dst, const __global uchar16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local short16 *dst, const __global short16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ushort16 *dst, const __global ushort16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local int16 *dst, const __global int16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local uint16 *dst, const __global uint16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local long16 *dst, const __global long16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local ulong16 *dst, const __global ulong16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local float16 *dst, const __global float16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char *dst, const __local char *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar *dst, const __local uchar *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short *dst, const __local short *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort *dst, const __local ushort *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int *dst, const __local int *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint *dst, const __local uint *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long *dst, const __local long *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong *dst, const __local ulong *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float *dst, const __local float *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char2 *dst, const __local char2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar2 *dst, const __local uchar2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short2 *dst, const __local short2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort2 *dst, const __local ushort2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int2 *dst, const __local int2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint2 *dst, const __local uint2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long2 *dst, const __local long2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong2 *dst, const __local ulong2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float2 *dst, const __local float2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char3 *dst, const __local char3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar3 *dst, const __local uchar3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short3 *dst, const __local short3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort3 *dst, const __local ushort3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int3 *dst, const __local int3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint3 *dst, const __local uint3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long3 *dst, const __local long3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong3 *dst, const __local ulong3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float3 *dst, const __local float3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char4 *dst, const __local char4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar4 *dst, const __local uchar4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short4 *dst, const __local short4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort4 *dst, const __local ushort4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int4 *dst, const __local int4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint4 *dst, const __local uint4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long4 *dst, const __local long4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong4 *dst, const __local ulong4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float4 *dst, const __local float4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char8 *dst, const __local char8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar8 *dst, const __local uchar8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short8 *dst, const __local short8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort8 *dst, const __local ushort8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int8 *dst, const __local int8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint8 *dst, const __local uint8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long8 *dst, const __local long8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong8 *dst, const __local ulong8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float8 *dst, const __local float8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global char16 *dst, const __local char16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uchar16 *dst, const __local uchar16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global short16 *dst, const __local short16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ushort16 *dst, const __local ushort16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global int16 *dst, const __local int16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global uint16 *dst, const __local uint16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, event_t event);
+#ifdef cl_khr_fp64
+event_t __ovld async_work_group_copy(__local double *dst, const __global double *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double4 *dst, const __global double4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double8 *dst, const __global double8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local double16 *dst, const __global double16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double *dst, const __local double *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double2 *dst, const __local double2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double3 *dst, const __local double3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, event_t event);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+event_t __ovld async_work_group_copy(__local half *dst, const __global half *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half3 *dst, const __global half3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half4 *dst, const __global half4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half8 *dst, const __global half8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__local half16 *dst, const __global half16 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half *dst, const __local half *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half2 *dst, const __local half2 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half3 *dst, const __local half3 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half4 *dst, const __local half4 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half8 *dst, const __local half8 *src, size_t num_elements, event_t event);
+event_t __ovld async_work_group_copy(__global half16 *dst, const __local half16 *src, size_t num_elements, event_t event);
+#endif //cl_khr_fp16
+
+/**
+ * Perform an async gather of num_elements
+ * gentype elements from src to dst. The
+ * src_stride is the stride in elements for each
+ * gentype element read from src. The dst_stride
+ * is the stride in elements for each gentype
+ * element written to dst. The async gather is
+ * performed by all work-items in a work-group.
+ * This built-in function must therefore be
+ * encountered by all work-items in a work-group
+ * executing the kernel with the same argument
+ * values; otherwise the results are undefined.
+ * Returns an event object that can be used by
+ * wait_group_events to wait for the async copy
+ * to finish. The event argument can also be used
+ * to associate the
+ * async_work_group_strided_copy with a
+ * previous async copy allowing an event to be
+ * shared by multiple async copies; otherwise event
+ * should be zero.
+ * If event argument is non-zero, the event object
+ * supplied in event argument will be returned.
+ * This function does not perform any implicit
+ * synchronization of source data such as using a
+ * barrier before performing the copy.
+ */
+event_t __ovld async_work_group_strided_copy(__local char *dst, const __global char *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar *dst, const __global uchar *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short *dst, const __global short *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort *dst, const __global ushort *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int *dst, const __global int *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint *dst, const __global uint *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long *dst, const __global long *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong *dst, const __global ulong *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float *dst, const __global float *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char2 *dst, const __global char2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar2 *dst, const __global uchar2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short2 *dst, const __global short2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort2 *dst, const __global ushort2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int2 *dst, const __global int2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint2 *dst, const __global uint2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long2 *dst, const __global long2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong2 *dst, const __global ulong2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float2 *dst, const __global float2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char3 *dst, const __global char3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar3 *dst, const __global uchar3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short3 *dst, const __global short3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort3 *dst, const __global ushort3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int3 *dst, const __global int3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint3 *dst, const __global uint3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long3 *dst, const __global long3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong3 *dst, const __global ulong3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float3 *dst, const __global float3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char4 *dst, const __global char4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar4 *dst, const __global uchar4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short4 *dst, const __global short4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort4 *dst, const __global ushort4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int4 *dst, const __global int4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint4 *dst, const __global uint4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long4 *dst, const __global long4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong4 *dst, const __global ulong4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float4 *dst, const __global float4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char8 *dst, const __global char8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar8 *dst, const __global uchar8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short8 *dst, const __global short8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort8 *dst, const __global ushort8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int8 *dst, const __global int8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint8 *dst, const __global uint8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long8 *dst, const __global long8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong8 *dst, const __global ulong8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float8 *dst, const __global float8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local char16 *dst, const __global char16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uchar16 *dst, const __global uchar16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local short16 *dst, const __global short16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ushort16 *dst, const __global ushort16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local int16 *dst, const __global int16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local uint16 *dst, const __global uint16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local long16 *dst, const __global long16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local ulong16 *dst, const __global ulong16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local float16 *dst, const __global float16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char *dst, const __local char *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar *dst, const __local uchar *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short *dst, const __local short *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort *dst, const __local ushort *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int *dst, const __local int *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint *dst, const __local uint *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long *dst, const __local long *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong *dst, const __local ulong *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float *dst, const __local float *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char2 *dst, const __local char2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar2 *dst, const __local uchar2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short2 *dst, const __local short2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort2 *dst, const __local ushort2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int2 *dst, const __local int2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint2 *dst, const __local uint2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long2 *dst, const __local long2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong2 *dst, const __local ulong2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float2 *dst, const __local float2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char3 *dst, const __local char3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar3 *dst, const __local uchar3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short3 *dst, const __local short3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort3 *dst, const __local ushort3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int3 *dst, const __local int3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint3 *dst, const __local uint3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long3 *dst, const __local long3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong3 *dst, const __local ulong3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float3 *dst, const __local float3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char4 *dst, const __local char4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar4 *dst, const __local uchar4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short4 *dst, const __local short4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort4 *dst, const __local ushort4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int4 *dst, const __local int4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint4 *dst, const __local uint4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long4 *dst, const __local long4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong4 *dst, const __local ulong4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float4 *dst, const __local float4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char8 *dst, const __local char8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar8 *dst, const __local uchar8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short8 *dst, const __local short8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort8 *dst, const __local ushort8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int8 *dst, const __local int8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint8 *dst, const __local uint8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long8 *dst, const __local long8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong8 *dst, const __local ulong8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float8 *dst, const __local float8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global char16 *dst, const __local char16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uchar16 *dst, const __local uchar16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global short16 *dst, const __local short16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ushort16 *dst, const __local ushort16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global int16 *dst, const __local int16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global uint16 *dst, const __local uint16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, size_t dst_stride, event_t event);
+#ifdef cl_khr_fp64
+event_t __ovld async_work_group_strided_copy(__local double *dst, const __global double *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double4 *dst, const __global double4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double8 *dst, const __global double8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local double16 *dst, const __global double16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double *dst, const __local double *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double2 *dst, const __local double2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double3 *dst, const __local double3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, size_t dst_stride, event_t event);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+event_t __ovld async_work_group_strided_copy(__local half *dst, const __global half *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half3 *dst, const __global half3 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half4 *dst, const __global half4 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half8 *dst, const __global half8 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__local half16 *dst, const __global half16 *src, size_t num_elements, size_t src_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half *dst, const __local half *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half2 *dst, const __local half2 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half3 *dst, const __local half3 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half4 *dst, const __local half4 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half8 *dst, const __local half8 *src, size_t num_elements, size_t dst_stride, event_t event);
+event_t __ovld async_work_group_strided_copy(__global half16 *dst, const __local half16 *src, size_t num_elements, size_t dst_stride, event_t event);
+#endif //cl_khr_fp16
+
+/**
+ * Wait for events that identify the
+ * async_work_group_copy operations to
+ * complete. The event objects specified in
+ * event_list will be released after the wait is
+ * performed.
+ * This function must be encountered by all workitems
+ * in a work-group executing the kernel with
+ * the same num_events and event objects specified
+ * in event_list; otherwise the results are undefined.
+ */
+void __ovld wait_group_events(int num_events, event_t *event_list);
+
+/**
+ * Prefetch num_elements * sizeof(gentype)
+ * bytes into the global cache. The prefetch
+ * instruction is applied to a work-item in a workgroup
+ * and does not affect the functional
+ * behavior of the kernel.
+ */
+void __ovld prefetch(const __global char *p, size_t num_elements);
+void __ovld prefetch(const __global uchar *p, size_t num_elements);
+void __ovld prefetch(const __global short *p, size_t num_elements);
+void __ovld prefetch(const __global ushort *p, size_t num_elements);
+void __ovld prefetch(const __global int *p, size_t num_elements);
+void __ovld prefetch(const __global uint *p, size_t num_elements);
+void __ovld prefetch(const __global long *p, size_t num_elements);
+void __ovld prefetch(const __global ulong *p, size_t num_elements);
+void __ovld prefetch(const __global float *p, size_t num_elements);
+void __ovld prefetch(const __global char2 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar2 *p, size_t num_elements);
+void __ovld prefetch(const __global short2 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort2 *p, size_t num_elements);
+void __ovld prefetch(const __global int2 *p, size_t num_elements);
+void __ovld prefetch(const __global uint2 *p, size_t num_elements);
+void __ovld prefetch(const __global long2 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong2 *p, size_t num_elements);
+void __ovld prefetch(const __global float2 *p, size_t num_elements);
+void __ovld prefetch(const __global char3 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar3 *p, size_t num_elements);
+void __ovld prefetch(const __global short3 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort3 *p, size_t num_elements);
+void __ovld prefetch(const __global int3 *p, size_t num_elements);
+void __ovld prefetch(const __global uint3 *p, size_t num_elements);
+void __ovld prefetch(const __global long3 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong3 *p, size_t num_elements);
+void __ovld prefetch(const __global float3 *p, size_t num_elements);
+void __ovld prefetch(const __global char4 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar4 *p, size_t num_elements);
+void __ovld prefetch(const __global short4 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort4 *p, size_t num_elements);
+void __ovld prefetch(const __global int4 *p, size_t num_elements);
+void __ovld prefetch(const __global uint4 *p, size_t num_elements);
+void __ovld prefetch(const __global long4 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong4 *p, size_t num_elements);
+void __ovld prefetch(const __global float4 *p, size_t num_elements);
+void __ovld prefetch(const __global char8 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar8 *p, size_t num_elements);
+void __ovld prefetch(const __global short8 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort8 *p, size_t num_elements);
+void __ovld prefetch(const __global int8 *p, size_t num_elements);
+void __ovld prefetch(const __global uint8 *p, size_t num_elements);
+void __ovld prefetch(const __global long8 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong8 *p, size_t num_elements);
+void __ovld prefetch(const __global float8 *p, size_t num_elements);
+void __ovld prefetch(const __global char16 *p, size_t num_elements);
+void __ovld prefetch(const __global uchar16 *p, size_t num_elements);
+void __ovld prefetch(const __global short16 *p, size_t num_elements);
+void __ovld prefetch(const __global ushort16 *p, size_t num_elements);
+void __ovld prefetch(const __global int16 *p, size_t num_elements);
+void __ovld prefetch(const __global uint16 *p, size_t num_elements);
+void __ovld prefetch(const __global long16 *p, size_t num_elements);
+void __ovld prefetch(const __global ulong16 *p, size_t num_elements);
+void __ovld prefetch(const __global float16 *p, size_t num_elements);
+#ifdef cl_khr_fp64
+void __ovld prefetch(const __global double *p, size_t num_elements);
+void __ovld prefetch(const __global double2 *p, size_t num_elements);
+void __ovld prefetch(const __global double3 *p, size_t num_elements);
+void __ovld prefetch(const __global double4 *p, size_t num_elements);
+void __ovld prefetch(const __global double8 *p, size_t num_elements);
+void __ovld prefetch(const __global double16 *p, size_t num_elements);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+void __ovld prefetch(const __global half *p, size_t num_elements);
+void __ovld prefetch(const __global half2 *p, size_t num_elements);
+void __ovld prefetch(const __global half3 *p, size_t num_elements);
+void __ovld prefetch(const __global half4 *p, size_t num_elements);
+void __ovld prefetch(const __global half8 *p, size_t num_elements);
+void __ovld prefetch(const __global half16 *p, size_t num_elements);
+#endif // cl_khr_fp16
+
+// OpenCL v1.1 s6.11.1, v1.2 s6.12.11 - Atomic Functions
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+#endif
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old + val) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_add(volatile __global int *p, int val);
+unsigned int __ovld atomic_add(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_add(volatile __local int *p, int val);
+unsigned int __ovld atomic_add(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_add(volatile __global int *p, int val);
+unsigned int __ovld atom_add(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_add(volatile __local int *p, int val);
+unsigned int __ovld atom_add(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_add(volatile __global long *p, long val);
+unsigned long __ovld atom_add(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_add(volatile __local long *p, long val);
+unsigned long __ovld atom_add(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old) stored at location pointed by p.
+ * Compute (old - val) and store result at location pointed by p. The function
+ * returns old.
+ */
+int __ovld atomic_sub(volatile __global int *p, int val);
+unsigned int __ovld atomic_sub(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_sub(volatile __local int *p, int val);
+unsigned int __ovld atomic_sub(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_sub(volatile __global int *p, int val);
+unsigned int __ovld atom_sub(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_sub(volatile __local int *p, int val);
+unsigned int __ovld atom_sub(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_sub(volatile __global long *p, long val);
+unsigned long __ovld atom_sub(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_sub(volatile __local long *p, long val);
+unsigned long __ovld atom_sub(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Swaps the old value stored at location p
+ * with new value given by val. Returns old
+ * value.
+ */
+int __ovld atomic_xchg(volatile __global int *p, int val);
+unsigned int __ovld atomic_xchg(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_xchg(volatile __local int *p, int val);
+unsigned int __ovld atomic_xchg(volatile __local unsigned int *p, unsigned int val);
+float __ovld atomic_xchg(volatile __global float *p, float val);
+float __ovld atomic_xchg(volatile __local float *p, float val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_xchg(volatile __global int *p, int val);
+int __ovld atom_xchg(volatile __local int *p, int val);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
+unsigned int __ovld atom_xchg(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_xchg(volatile __global long *p, long val);
+long __ovld atom_xchg(volatile __local long *p, long val);
+unsigned long __ovld atom_xchg(volatile __global unsigned long *p, unsigned long val);
+unsigned long __ovld atom_xchg(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old + 1) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_inc(volatile __global int *p);
+unsigned int __ovld atomic_inc(volatile __global unsigned int *p);
+int __ovld atomic_inc(volatile __local int *p);
+unsigned int __ovld atomic_inc(volatile __local unsigned int *p);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_inc(volatile __global int *p);
+unsigned int __ovld atom_inc(volatile __global unsigned int *p);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_inc(volatile __local int *p);
+unsigned int __ovld atom_inc(volatile __local unsigned int *p);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_inc(volatile __global long *p);
+unsigned long __ovld atom_inc(volatile __global unsigned long *p);
+long __ovld atom_inc(volatile __local long *p);
+unsigned long __ovld atom_inc(volatile __local unsigned long *p);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old - 1) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_dec(volatile __global int *p);
+unsigned int __ovld atomic_dec(volatile __global unsigned int *p);
+int __ovld atomic_dec(volatile __local int *p);
+unsigned int __ovld atomic_dec(volatile __local unsigned int *p);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_dec(volatile __global int *p);
+unsigned int __ovld atom_dec(volatile __global unsigned int *p);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_dec(volatile __local int *p);
+unsigned int __ovld atom_dec(volatile __local unsigned int *p);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_dec(volatile __global long *p);
+unsigned long __ovld atom_dec(volatile __global unsigned long *p);
+long __ovld atom_dec(volatile __local long *p);
+unsigned long __ovld atom_dec(volatile __local unsigned long *p);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old == cmp) ? val : old and store result at
+ * location pointed by p. The function
+ * returns old.
+ */
+int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val);
+unsigned int __ovld atomic_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val);
+int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val);
+unsigned int __ovld atomic_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val);
+
+#if defined(cl_khr_global_int32_base_atomics)
+int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val);
+unsigned int __ovld atom_cmpxchg(volatile __global unsigned int *p, unsigned int cmp, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_base_atomics)
+int __ovld atom_cmpxchg(volatile __local int *p, int cmp, int val);
+unsigned int __ovld atom_cmpxchg(volatile __local unsigned int *p, unsigned int cmp, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics)
+long __ovld atom_cmpxchg(volatile __global long *p, long cmp, long val);
+unsigned long __ovld atom_cmpxchg(volatile __global unsigned long *p, unsigned long cmp, unsigned long val);
+long __ovld atom_cmpxchg(volatile __local long *p, long cmp, long val);
+unsigned long __ovld atom_cmpxchg(volatile __local unsigned long *p, unsigned long cmp, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * min(old, val) and store minimum value at
+ * location pointed by p. The function
+ * returns old.
+ */
+int __ovld atomic_min(volatile __global int *p, int val);
+unsigned int __ovld atomic_min(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_min(volatile __local int *p, int val);
+unsigned int __ovld atomic_min(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_min(volatile __global int *p, int val);
+unsigned int __ovld atom_min(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_min(volatile __local int *p, int val);
+unsigned int __ovld atom_min(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_min(volatile __global long *p, long val);
+unsigned long __ovld atom_min(volatile __global unsigned long *p, unsigned long val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+long __ovld atom_min(volatile __local long *p, long val);
+unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * max(old, val) and store maximum value at
+ * location pointed by p. The function
+ * returns old.
+ */
+int __ovld atomic_max(volatile __global int *p, int val);
+unsigned int __ovld atomic_max(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_max(volatile __local int *p, int val);
+unsigned int __ovld atomic_max(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_max(volatile __global int *p, int val);
+unsigned int __ovld atom_max(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_max(volatile __local int *p, int val);
+unsigned int __ovld atom_max(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_max(volatile __global long *p, long val);
+unsigned long __ovld atom_max(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_max(volatile __local long *p, long val);
+unsigned long __ovld atom_max(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old & val) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_and(volatile __global int *p, int val);
+unsigned int __ovld atomic_and(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_and(volatile __local int *p, int val);
+unsigned int __ovld atomic_and(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_and(volatile __global int *p, int val);
+unsigned int __ovld atom_and(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_and(volatile __local int *p, int val);
+unsigned int __ovld atom_and(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_and(volatile __global long *p, long val);
+unsigned long __ovld atom_and(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_and(volatile __local long *p, long val);
+unsigned long __ovld atom_and(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old | val) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_or(volatile __global int *p, int val);
+unsigned int __ovld atomic_or(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_or(volatile __local int *p, int val);
+unsigned int __ovld atomic_or(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_or(volatile __global int *p, int val);
+unsigned int __ovld atom_or(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_or(volatile __local int *p, int val);
+unsigned int __ovld atom_or(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_or(volatile __global long *p, long val);
+unsigned long __ovld atom_or(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_or(volatile __local long *p, long val);
+unsigned long __ovld atom_or(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+/**
+ * Read the 32-bit value (referred to as old)
+ * stored at location pointed by p. Compute
+ * (old ^ val) and store result at location
+ * pointed by p. The function returns old.
+ */
+int __ovld atomic_xor(volatile __global int *p, int val);
+unsigned int __ovld atomic_xor(volatile __global unsigned int *p, unsigned int val);
+int __ovld atomic_xor(volatile __local int *p, int val);
+unsigned int __ovld atomic_xor(volatile __local unsigned int *p, unsigned int val);
+
+#if defined(cl_khr_global_int32_extended_atomics)
+int __ovld atom_xor(volatile __global int *p, int val);
+unsigned int __ovld atom_xor(volatile __global unsigned int *p, unsigned int val);
+#endif
+#if defined(cl_khr_local_int32_extended_atomics)
+int __ovld atom_xor(volatile __local int *p, int val);
+unsigned int __ovld atom_xor(volatile __local unsigned int *p, unsigned int val);
+#endif
+
+#if defined(cl_khr_int64_extended_atomics)
+long __ovld atom_xor(volatile __global long *p, long val);
+unsigned long __ovld atom_xor(volatile __global unsigned long *p, unsigned long val);
+long __ovld atom_xor(volatile __local long *p, long val);
+unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long val);
+#endif
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
+#endif
+
+// OpenCL v2.0 s6.13.11 - Atomics Functions
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#ifndef ATOMIC_VAR_INIT
+#define ATOMIC_VAR_INIT(x) (x)
+#endif //ATOMIC_VAR_INIT
+#define ATOMIC_FLAG_INIT 0
+
+// enum values aligned with what clang uses in EmitAtomicExpr()
+typedef enum memory_order
+{
+  memory_order_relaxed,
+  memory_order_acquire,
+  memory_order_release,
+  memory_order_acq_rel,
+  memory_order_seq_cst
+} memory_order;
+
+// double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+#endif
+
+// atomic_init()
+void __ovld atomic_init(volatile atomic_int *object, int value);
+void __ovld atomic_init(volatile atomic_uint *object, uint value);
+void __ovld atomic_init(volatile atomic_float *object, float value);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+void __ovld atomic_init(volatile atomic_long *object, long value);
+void __ovld atomic_init(volatile atomic_ulong *object, ulong value);
+#ifdef cl_khr_fp64
+void __ovld atomic_init(volatile atomic_double *object, double value);
+#endif //cl_khr_fp64
+#endif
+
+// atomic_work_item_fence()
+void __ovld atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope);
+
+// atomic_fetch()
+
+int __ovld atomic_fetch_add(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_add_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_add(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_add_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_sub(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_sub_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_sub(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_sub_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_or(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_or_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_or(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_or_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_xor(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_xor_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_xor(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_xor_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_and(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_and_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_and(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_and_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_min(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_min(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_min(volatile atomic_uint *object, int operand);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order);
+uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
+int __ovld atomic_fetch_max(volatile atomic_int *object, int operand);
+int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order);
+int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_max(volatile atomic_uint *object, uint operand);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
+uint __ovld atomic_fetch_max(volatile atomic_uint *object, int operand);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order);
+uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+long __ovld atomic_fetch_add(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_add_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_add(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_add_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_sub(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_sub_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_sub(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_sub_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_or(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_or_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_or(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_or_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_xor(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_xor_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_xor(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_xor_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_and(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_and_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_and(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_and_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_min(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, long operand);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order);
+ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
+long __ovld atomic_fetch_max(volatile atomic_long *object, long operand);
+long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order);
+long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, ulong operand);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
+ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, long operand);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order);
+ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
+#endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+
+// OpenCL v2.0 s6.13.11.7.5:
+// add/sub: atomic type argument can be uintptr_t/intptr_t, value type argument can be ptrdiff_t.
+// or/xor/and/min/max: atomic type argument can be intptr_t/uintptr_t, value type argument can be intptr_t/uintptr_t.
+
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) 
+uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *object, ptrdiff_t operand);
+uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_sub(volatile atomic_uintptr_t *object, ptrdiff_t operand);
+uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_sub_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
+
+uintptr_t __ovld atomic_fetch_or(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_or_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_or_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_xor(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_and(volatile atomic_uintptr_t *object, intptr_t operand);
+uintptr_t __ovld atomic_fetch_and_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order);
+uintptr_t __ovld atomic_fetch_and_explicit(volatile atomic_uintptr_t *object, intptr_t operand, memory_order order, memory_scope scope);
+uintptr_t __ovld atomic_fetch_min(volatile atomic_uintptr_t *object, intptr_t opermax);
+uintptr_t __ovld atomic_fetch_min_explicit(volatile atomic_uintptr_t *object, intptr_t opermax, memory_order minder);
+uintptr_t __ovld atomic_fetch_min_explicit(volatile atomic_uintptr_t *object, intptr_t opermax, memory_order minder, memory_scope scope);
+uintptr_t __ovld atomic_fetch_max(volatile atomic_uintptr_t *object, intptr_t opermax);
+uintptr_t __ovld atomic_fetch_max_explicit(volatile atomic_uintptr_t *object, intptr_t opermax, memory_order minder);
+uintptr_t __ovld atomic_fetch_max_explicit(volatile atomic_uintptr_t *object, intptr_t opermax, memory_order minder, memory_scope scope);
+
+intptr_t __ovld atomic_fetch_or(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_or_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_or_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_xor(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_xor_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_and(volatile atomic_intptr_t *object, uintptr_t operand);
+intptr_t __ovld atomic_fetch_and_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order);
+intptr_t __ovld atomic_fetch_and_explicit(volatile atomic_intptr_t *object, uintptr_t operand, memory_order order, memory_scope scope);
+intptr_t __ovld atomic_fetch_min(volatile atomic_intptr_t *object, uintptr_t opermax);
+intptr_t __ovld atomic_fetch_min_explicit(volatile atomic_intptr_t *object, uintptr_t opermax, memory_order minder);
+intptr_t __ovld atomic_fetch_min_explicit(volatile atomic_intptr_t *object, uintptr_t opermax, memory_order minder, memory_scope scope);
+intptr_t __ovld atomic_fetch_max(volatile atomic_intptr_t *object, uintptr_t opermax);
+intptr_t __ovld atomic_fetch_max_explicit(volatile atomic_intptr_t *object, uintptr_t opermax, memory_order minder);
+intptr_t __ovld atomic_fetch_max_explicit(volatile atomic_intptr_t *object, uintptr_t opermax, memory_order minder, memory_scope scope);
+#endif
+
+// atomic_store()
+
+void __ovld atomic_store(volatile atomic_int *object, int desired);
+void __ovld atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope);
+void __ovld atomic_store(volatile atomic_uint *object, uint desired);
+void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope);
+void __ovld atomic_store(volatile atomic_float *object, float desired);
+void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#ifdef cl_khr_fp64
+void __ovld atomic_store(volatile atomic_double *object, double desired);
+void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope);
+#endif //cl_khr_fp64
+void __ovld atomic_store(volatile atomic_long *object, long desired);
+void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope);
+void __ovld atomic_store(volatile atomic_ulong *object, ulong desired);
+void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order);
+void __ovld atomic_store_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope);
+#endif
+
+// atomic_load()
+
+int __ovld atomic_load(volatile atomic_int *object);
+int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order);
+int __ovld atomic_load_explicit(volatile atomic_int *object, memory_order order, memory_scope scope);
+uint __ovld atomic_load(volatile atomic_uint *object);
+uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order);
+uint __ovld atomic_load_explicit(volatile atomic_uint *object, memory_order order, memory_scope scope);
+float __ovld atomic_load(volatile atomic_float *object);
+float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order);
+float __ovld atomic_load_explicit(volatile atomic_float *object, memory_order order, memory_scope scope);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#ifdef cl_khr_fp64
+double __ovld atomic_load(volatile atomic_double *object);
+double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order);
+double __ovld atomic_load_explicit(volatile atomic_double *object, memory_order order, memory_scope scope);
+#endif //cl_khr_fp64
+long __ovld atomic_load(volatile atomic_long *object);
+long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order);
+long __ovld atomic_load_explicit(volatile atomic_long *object, memory_order order, memory_scope scope);
+ulong __ovld atomic_load(volatile atomic_ulong *object);
+ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order);
+ulong __ovld atomic_load_explicit(volatile atomic_ulong *object, memory_order order, memory_scope scope);
+#endif
+
+// atomic_exchange()
+
+int __ovld atomic_exchange(volatile atomic_int *object, int desired);
+int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order);
+int __ovld atomic_exchange_explicit(volatile atomic_int *object, int desired, memory_order order, memory_scope scope);
+uint __ovld atomic_exchange(volatile atomic_uint *object, uint desired);
+uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order);
+uint __ovld atomic_exchange_explicit(volatile atomic_uint *object, uint desired, memory_order order, memory_scope scope);
+float __ovld atomic_exchange(volatile atomic_float *object, float desired);
+float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order);
+float __ovld atomic_exchange_explicit(volatile atomic_float *object, float desired, memory_order order, memory_scope scope);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#ifdef cl_khr_fp64
+double __ovld atomic_exchange(volatile atomic_double *object, double desired);
+double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order);
+double __ovld atomic_exchange_explicit(volatile atomic_double *object, double desired, memory_order order, memory_scope scope);
+#endif //cl_khr_fp64
+long __ovld atomic_exchange(volatile atomic_long *object, long desired);
+long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order);
+long __ovld atomic_exchange_explicit(volatile atomic_long *object, long desired, memory_order order, memory_scope scope);
+ulong __ovld atomic_exchange(volatile atomic_ulong *object, ulong desired);
+ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order);
+ulong __ovld atomic_exchange_explicit(volatile atomic_ulong *object, ulong desired, memory_order order, memory_scope scope);
+#endif
+
+// atomic_compare_exchange_strong() and atomic_compare_exchange_weak()
+
+bool __ovld atomic_compare_exchange_strong(volatile atomic_int *object, int *expected, int desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *object, int *expected,
+                                                                                 int desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_int *object, int *expected,
+                                                                                 int desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_strong(volatile atomic_uint *object, uint *expected, uint desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *object, uint *expected,
+                                                                                 uint desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_uint *object, uint *expected,
+                                                                                 uint desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_int *object, int *expected, int desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *object, int *expected,
+                                                                                 int desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_int *object, int *expected,
+                                                                                 int desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_uint *object, uint *expected, uint desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *object, uint *expected,
+                                                                                 uint desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_uint *object, uint *expected,
+                                                                                 uint desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_strong(volatile atomic_float *object, float *expected, float desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *object, float *expected,
+                                                                                 float desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_float *object, float *expected,
+                                                                                 float desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_float *object, float *expected, float desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *object, float *expected,
+                                                                                 float desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_float *object, float *expected,
+                                                                                 float desired, memory_order success, memory_order failure, memory_scope scope);
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
+#ifdef cl_khr_fp64
+bool __ovld atomic_compare_exchange_strong(volatile atomic_double *object, double *expected, double desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_double *object, double *expected, double desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected,
+                                                                                 double desired, memory_order success, memory_order failure, memory_scope scope);
+#endif //cl_khr_fp64
+bool __ovld atomic_compare_exchange_strong(volatile atomic_long *object, long *expected, long desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_long *object, long *expected, long desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected,
+                                                                                 long desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_strong(volatile atomic_ulong *object, ulong *expected, ulong desired);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure, memory_scope scope);
+bool __ovld atomic_compare_exchange_weak(volatile atomic_ulong *object, ulong *expected, ulong desired);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure);
+bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected,
+                                                                                 ulong desired, memory_order success, memory_order failure, memory_scope scope);
+#endif
+
+// atomic_flag_test_and_set() and atomic_flag_clear()
+
+bool __ovld atomic_flag_test_and_set(volatile atomic_flag *object);
+bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order);
+bool __ovld atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+void __ovld atomic_flag_clear(volatile atomic_flag *object);
+void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order);
+void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v1.1 s6.11.12, v1.2 s6.12.12, v2.0 s6.13.12 - Miscellaneous Vector Functions
+
+/**
+ * The shuffle and shuffle2 built-in functions construct
+ * a permutation of elements from one or two input
+ * vectors respectively that are of the same type,
+ * returning a vector with the same element type as the
+ * input and length that is the same as the shuffle mask.
+ * The size of each element in the mask must match the
+ * size of each element in the result. For shuffle, only
+ * the ilogb(2m-1) least significant bits of each mask
+ * element are considered. For shuffle2, only the
+ * ilogb(2m-1)+1 least significant bits of each mask
+ * element are considered. Other bits in the mask shall
+ * be ignored.
+ * The elements of the input vectors are numbered from
+ * left to right across one or both of the vectors. For this
+ * purpose, the number of elements in a vector is given
+ * by vec_step(gentypem). The shuffle mask operand
+ * specifies, for each element of the result vector, which
+ * element of the one or two input vectors the result
+ * element gets.
+ * Examples:
+ * uint4 mask = (uint4)(3, 2,
+ * 1, 0);
+ * float4 a;
+ * float4 r = shuffle(a, mask);
+ * // r.s0123 = a.wzyx
+ * uint8 mask = (uint8)(0, 1, 2, 3,
+ * 4, 5, 6, 7);
+ * float4 a, b;
+ * float8 r = shuffle2(a, b, mask);
+ * // r.s0123 = a.xyzw
+ * // r.s4567 = b.xyzw
+ * uint4 mask;
+ * float8 a;
+ * float4 b;
+ * b = shuffle(a, mask);
+ * Examples that are not valid are:
+ * uint8 mask;
+ * short16 a;
+ * short8 b;
+ * b = shuffle(a, mask); <- not valid
+ */
+char2 __ovld __cnfn shuffle(char2 x, uchar2 mask);
+char2 __ovld __cnfn shuffle(char4 x, uchar2 mask);
+char2 __ovld __cnfn shuffle(char8 x, uchar2 mask);
+char2 __ovld __cnfn shuffle(char16 x, uchar2 mask);
+
+uchar2 __ovld __cnfn shuffle(uchar2 x, uchar2 mask);
+uchar2 __ovld __cnfn shuffle(uchar4 x, uchar2 mask);
+uchar2 __ovld __cnfn shuffle(uchar8 x, uchar2 mask);
+uchar2 __ovld __cnfn shuffle(uchar16 x, uchar2 mask);
+
+short2 __ovld __cnfn shuffle(short2 x, ushort2 mask);
+short2 __ovld __cnfn shuffle(short4 x, ushort2 mask);
+short2 __ovld __cnfn shuffle(short8 x, ushort2 mask);
+short2 __ovld __cnfn shuffle(short16 x, ushort2 mask);
+
+ushort2 __ovld __cnfn shuffle(ushort2 x, ushort2 mask);
+ushort2 __ovld __cnfn shuffle(ushort4 x, ushort2 mask);
+ushort2 __ovld __cnfn shuffle(ushort8 x, ushort2 mask);
+ushort2 __ovld __cnfn shuffle(ushort16 x, ushort2 mask);
+
+int2 __ovld __cnfn shuffle(int2 x, uint2 mask);
+int2 __ovld __cnfn shuffle(int4 x, uint2 mask);
+int2 __ovld __cnfn shuffle(int8 x, uint2 mask);
+int2 __ovld __cnfn shuffle(int16 x, uint2 mask);
+
+uint2 __ovld __cnfn shuffle(uint2 x, uint2 mask);
+uint2 __ovld __cnfn shuffle(uint4 x, uint2 mask);
+uint2 __ovld __cnfn shuffle(uint8 x, uint2 mask);
+uint2 __ovld __cnfn shuffle(uint16 x, uint2 mask);
+
+long2 __ovld __cnfn shuffle(long2 x, ulong2 mask);
+long2 __ovld __cnfn shuffle(long4 x, ulong2 mask);
+long2 __ovld __cnfn shuffle(long8 x, ulong2 mask);
+long2 __ovld __cnfn shuffle(long16 x, ulong2 mask);
+
+ulong2 __ovld __cnfn shuffle(ulong2 x, ulong2 mask);
+ulong2 __ovld __cnfn shuffle(ulong4 x, ulong2 mask);
+ulong2 __ovld __cnfn shuffle(ulong8 x, ulong2 mask);
+ulong2 __ovld __cnfn shuffle(ulong16 x, ulong2 mask);
+
+float2 __ovld __cnfn shuffle(float2 x, uint2 mask);
+float2 __ovld __cnfn shuffle(float4 x, uint2 mask);
+float2 __ovld __cnfn shuffle(float8 x, uint2 mask);
+float2 __ovld __cnfn shuffle(float16 x, uint2 mask);
+
+char4 __ovld __cnfn shuffle(char2 x, uchar4 mask);
+char4 __ovld __cnfn shuffle(char4 x, uchar4 mask);
+char4 __ovld __cnfn shuffle(char8 x, uchar4 mask);
+char4 __ovld __cnfn shuffle(char16 x, uchar4 mask);
+
+uchar4 __ovld __cnfn shuffle(uchar2 x, uchar4 mask);
+uchar4 __ovld __cnfn shuffle(uchar4 x, uchar4 mask);
+uchar4 __ovld __cnfn shuffle(uchar8 x, uchar4 mask);
+uchar4 __ovld __cnfn shuffle(uchar16 x, uchar4 mask);
+
+short4 __ovld __cnfn shuffle(short2 x, ushort4 mask);
+short4 __ovld __cnfn shuffle(short4 x, ushort4 mask);
+short4 __ovld __cnfn shuffle(short8 x, ushort4 mask);
+short4 __ovld __cnfn shuffle(short16 x, ushort4 mask);
+
+ushort4 __ovld __cnfn shuffle(ushort2 x, ushort4 mask);
+ushort4 __ovld __cnfn shuffle(ushort4 x, ushort4 mask);
+ushort4 __ovld __cnfn shuffle(ushort8 x, ushort4 mask);
+ushort4 __ovld __cnfn shuffle(ushort16 x, ushort4 mask);
+
+int4 __ovld __cnfn shuffle(int2 x, uint4 mask);
+int4 __ovld __cnfn shuffle(int4 x, uint4 mask);
+int4 __ovld __cnfn shuffle(int8 x, uint4 mask);
+int4 __ovld __cnfn shuffle(int16 x, uint4 mask);
+
+uint4 __ovld __cnfn shuffle(uint2 x, uint4 mask);
+uint4 __ovld __cnfn shuffle(uint4 x, uint4 mask);
+uint4 __ovld __cnfn shuffle(uint8 x, uint4 mask);
+uint4 __ovld __cnfn shuffle(uint16 x, uint4 mask);
+
+long4 __ovld __cnfn shuffle(long2 x, ulong4 mask);
+long4 __ovld __cnfn shuffle(long4 x, ulong4 mask);
+long4 __ovld __cnfn shuffle(long8 x, ulong4 mask);
+long4 __ovld __cnfn shuffle(long16 x, ulong4 mask);
+
+ulong4 __ovld __cnfn shuffle(ulong2 x, ulong4 mask);
+ulong4 __ovld __cnfn shuffle(ulong4 x, ulong4 mask);
+ulong4 __ovld __cnfn shuffle(ulong8 x, ulong4 mask);
+ulong4 __ovld __cnfn shuffle(ulong16 x, ulong4 mask);
+
+float4 __ovld __cnfn shuffle(float2 x, uint4 mask);
+float4 __ovld __cnfn shuffle(float4 x, uint4 mask);
+float4 __ovld __cnfn shuffle(float8 x, uint4 mask);
+float4 __ovld __cnfn shuffle(float16 x, uint4 mask);
+
+char8 __ovld __cnfn shuffle(char2 x, uchar8 mask);
+char8 __ovld __cnfn shuffle(char4 x, uchar8 mask);
+char8 __ovld __cnfn shuffle(char8 x, uchar8 mask);
+char8 __ovld __cnfn shuffle(char16 x, uchar8 mask);
+
+uchar8 __ovld __cnfn shuffle(uchar2 x, uchar8 mask);
+uchar8 __ovld __cnfn shuffle(uchar4 x, uchar8 mask);
+uchar8 __ovld __cnfn shuffle(uchar8 x, uchar8 mask);
+uchar8 __ovld __cnfn shuffle(uchar16 x, uchar8 mask);
+
+short8 __ovld __cnfn shuffle(short2 x, ushort8 mask);
+short8 __ovld __cnfn shuffle(short4 x, ushort8 mask);
+short8 __ovld __cnfn shuffle(short8 x, ushort8 mask);
+short8 __ovld __cnfn shuffle(short16 x, ushort8 mask);
+
+ushort8 __ovld __cnfn shuffle(ushort2 x, ushort8 mask);
+ushort8 __ovld __cnfn shuffle(ushort4 x, ushort8 mask);
+ushort8 __ovld __cnfn shuffle(ushort8 x, ushort8 mask);
+ushort8 __ovld __cnfn shuffle(ushort16 x, ushort8 mask);
+
+int8 __ovld __cnfn shuffle(int2 x, uint8 mask);
+int8 __ovld __cnfn shuffle(int4 x, uint8 mask);
+int8 __ovld __cnfn shuffle(int8 x, uint8 mask);
+int8 __ovld __cnfn shuffle(int16 x, uint8 mask);
+
+uint8 __ovld __cnfn shuffle(uint2 x, uint8 mask);
+uint8 __ovld __cnfn shuffle(uint4 x, uint8 mask);
+uint8 __ovld __cnfn shuffle(uint8 x, uint8 mask);
+uint8 __ovld __cnfn shuffle(uint16 x, uint8 mask);
+
+long8 __ovld __cnfn shuffle(long2 x, ulong8 mask);
+long8 __ovld __cnfn shuffle(long4 x, ulong8 mask);
+long8 __ovld __cnfn shuffle(long8 x, ulong8 mask);
+long8 __ovld __cnfn shuffle(long16 x, ulong8 mask);
+
+ulong8 __ovld __cnfn shuffle(ulong2 x, ulong8 mask);
+ulong8 __ovld __cnfn shuffle(ulong4 x, ulong8 mask);
+ulong8 __ovld __cnfn shuffle(ulong8 x, ulong8 mask);
+ulong8 __ovld __cnfn shuffle(ulong16 x, ulong8 mask);
+
+float8 __ovld __cnfn shuffle(float2 x, uint8 mask);
+float8 __ovld __cnfn shuffle(float4 x, uint8 mask);
+float8 __ovld __cnfn shuffle(float8 x, uint8 mask);
+float8 __ovld __cnfn shuffle(float16 x, uint8 mask);
+
+char16 __ovld __cnfn shuffle(char2 x, uchar16 mask);
+char16 __ovld __cnfn shuffle(char4 x, uchar16 mask);
+char16 __ovld __cnfn shuffle(char8 x, uchar16 mask);
+char16 __ovld __cnfn shuffle(char16 x, uchar16 mask);
+
+uchar16 __ovld __cnfn shuffle(uchar2 x, uchar16 mask);
+uchar16 __ovld __cnfn shuffle(uchar4 x, uchar16 mask);
+uchar16 __ovld __cnfn shuffle(uchar8 x, uchar16 mask);
+uchar16 __ovld __cnfn shuffle(uchar16 x, uchar16 mask);
+
+short16 __ovld __cnfn shuffle(short2 x, ushort16 mask);
+short16 __ovld __cnfn shuffle(short4 x, ushort16 mask);
+short16 __ovld __cnfn shuffle(short8 x, ushort16 mask);
+short16 __ovld __cnfn shuffle(short16 x, ushort16 mask);
+
+ushort16 __ovld __cnfn shuffle(ushort2 x, ushort16 mask);
+ushort16 __ovld __cnfn shuffle(ushort4 x, ushort16 mask);
+ushort16 __ovld __cnfn shuffle(ushort8 x, ushort16 mask);
+ushort16 __ovld __cnfn shuffle(ushort16 x, ushort16 mask);
+
+int16 __ovld __cnfn shuffle(int2 x, uint16 mask);
+int16 __ovld __cnfn shuffle(int4 x, uint16 mask);
+int16 __ovld __cnfn shuffle(int8 x, uint16 mask);
+int16 __ovld __cnfn shuffle(int16 x, uint16 mask);
+
+uint16 __ovld __cnfn shuffle(uint2 x, uint16 mask);
+uint16 __ovld __cnfn shuffle(uint4 x, uint16 mask);
+uint16 __ovld __cnfn shuffle(uint8 x, uint16 mask);
+uint16 __ovld __cnfn shuffle(uint16 x, uint16 mask);
+
+long16 __ovld __cnfn shuffle(long2 x, ulong16 mask);
+long16 __ovld __cnfn shuffle(long4 x, ulong16 mask);
+long16 __ovld __cnfn shuffle(long8 x, ulong16 mask);
+long16 __ovld __cnfn shuffle(long16 x, ulong16 mask);
+
+ulong16 __ovld __cnfn shuffle(ulong2 x, ulong16 mask);
+ulong16 __ovld __cnfn shuffle(ulong4 x, ulong16 mask);
+ulong16 __ovld __cnfn shuffle(ulong8 x, ulong16 mask);
+ulong16 __ovld __cnfn shuffle(ulong16 x, ulong16 mask);
+
+float16 __ovld __cnfn shuffle(float2 x, uint16 mask);
+float16 __ovld __cnfn shuffle(float4 x, uint16 mask);
+float16 __ovld __cnfn shuffle(float8 x, uint16 mask);
+float16 __ovld __cnfn shuffle(float16 x, uint16 mask);
+
+#ifdef cl_khr_fp64
+double2 __ovld __cnfn shuffle(double2 x, ulong2 mask);
+double2 __ovld __cnfn shuffle(double4 x, ulong2 mask);
+double2 __ovld __cnfn shuffle(double8 x, ulong2 mask);
+double2 __ovld __cnfn shuffle(double16 x, ulong2 mask);
+
+double4 __ovld __cnfn shuffle(double2 x, ulong4 mask);
+double4 __ovld __cnfn shuffle(double4 x, ulong4 mask);
+double4 __ovld __cnfn shuffle(double8 x, ulong4 mask);
+double4 __ovld __cnfn shuffle(double16 x, ulong4 mask);
+
+double8 __ovld __cnfn shuffle(double2 x, ulong8 mask);
+double8 __ovld __cnfn shuffle(double4 x, ulong8 mask);
+double8 __ovld __cnfn shuffle(double8 x, ulong8 mask);
+double8 __ovld __cnfn shuffle(double16 x, ulong8 mask);
+
+double16 __ovld __cnfn shuffle(double2 x, ulong16 mask);
+double16 __ovld __cnfn shuffle(double4 x, ulong16 mask);
+double16 __ovld __cnfn shuffle(double8 x, ulong16 mask);
+double16 __ovld __cnfn shuffle(double16 x, ulong16 mask);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half2 __ovld __cnfn shuffle(half2 x, ushort2 mask);
+half2 __ovld __cnfn shuffle(half4 x, ushort2 mask);
+half2 __ovld __cnfn shuffle(half8 x, ushort2 mask);
+half2 __ovld __cnfn shuffle(half16 x, ushort2 mask);
+
+half4 __ovld __cnfn shuffle(half2 x, ushort4 mask);
+half4 __ovld __cnfn shuffle(half4 x, ushort4 mask);
+half4 __ovld __cnfn shuffle(half8 x, ushort4 mask);
+half4 __ovld __cnfn shuffle(half16 x, ushort4 mask);
+
+half8 __ovld __cnfn shuffle(half2 x, ushort8 mask);
+half8 __ovld __cnfn shuffle(half4 x, ushort8 mask);
+half8 __ovld __cnfn shuffle(half8 x, ushort8 mask);
+half8 __ovld __cnfn shuffle(half16 x, ushort8 mask);
+
+half16 __ovld __cnfn shuffle(half2 x, ushort16 mask);
+half16 __ovld __cnfn shuffle(half4 x, ushort16 mask);
+half16 __ovld __cnfn shuffle(half8 x, ushort16 mask);
+half16 __ovld __cnfn shuffle(half16 x, ushort16 mask);
+#endif //cl_khr_fp16
+
+char2 __ovld __cnfn shuffle2(char2 x, char2 y, uchar2 mask);
+char2 __ovld __cnfn shuffle2(char4 x, char4 y, uchar2 mask);
+char2 __ovld __cnfn shuffle2(char8 x, char8 y, uchar2 mask);
+char2 __ovld __cnfn shuffle2(char16 x, char16 y, uchar2 mask);
+
+uchar2 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar2 mask);
+uchar2 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar2 mask);
+uchar2 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar2 mask);
+uchar2 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar2 mask);
+
+short2 __ovld __cnfn shuffle2(short2 x, short2 y, ushort2 mask);
+short2 __ovld __cnfn shuffle2(short4 x, short4 y, ushort2 mask);
+short2 __ovld __cnfn shuffle2(short8 x, short8 y, ushort2 mask);
+short2 __ovld __cnfn shuffle2(short16 x, short16 y, ushort2 mask);
+
+ushort2 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort2 mask);
+ushort2 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort2 mask);
+ushort2 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort2 mask);
+ushort2 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort2 mask);
+
+int2 __ovld __cnfn shuffle2(int2 x, int2 y, uint2 mask);
+int2 __ovld __cnfn shuffle2(int4 x, int4 y, uint2 mask);
+int2 __ovld __cnfn shuffle2(int8 x, int8 y, uint2 mask);
+int2 __ovld __cnfn shuffle2(int16 x, int16 y, uint2 mask);
+
+uint2 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint2 mask);
+uint2 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint2 mask);
+uint2 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint2 mask);
+uint2 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint2 mask);
+
+long2 __ovld __cnfn shuffle2(long2 x, long2 y, ulong2 mask);
+long2 __ovld __cnfn shuffle2(long4 x, long4 y, ulong2 mask);
+long2 __ovld __cnfn shuffle2(long8 x, long8 y, ulong2 mask);
+long2 __ovld __cnfn shuffle2(long16 x, long16 y, ulong2 mask);
+
+ulong2 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong2 mask);
+ulong2 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong2 mask);
+ulong2 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong2 mask);
+ulong2 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong2 mask);
+
+float2 __ovld __cnfn shuffle2(float2 x, float2 y, uint2 mask);
+float2 __ovld __cnfn shuffle2(float4 x, float4 y, uint2 mask);
+float2 __ovld __cnfn shuffle2(float8 x, float8 y, uint2 mask);
+float2 __ovld __cnfn shuffle2(float16 x, float16 y, uint2 mask);
+
+char4 __ovld __cnfn shuffle2(char2 x, char2 y, uchar4 mask);
+char4 __ovld __cnfn shuffle2(char4 x, char4 y, uchar4 mask);
+char4 __ovld __cnfn shuffle2(char8 x, char8 y, uchar4 mask);
+char4 __ovld __cnfn shuffle2(char16 x, char16 y, uchar4 mask);
+
+uchar4 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar4 mask);
+uchar4 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar4 mask);
+uchar4 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar4 mask);
+uchar4 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar4 mask);
+
+short4 __ovld __cnfn shuffle2(short2 x, short2 y, ushort4 mask);
+short4 __ovld __cnfn shuffle2(short4 x, short4 y, ushort4 mask);
+short4 __ovld __cnfn shuffle2(short8 x, short8 y, ushort4 mask);
+short4 __ovld __cnfn shuffle2(short16 x, short16 y, ushort4 mask);
+
+ushort4 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort4 mask);
+ushort4 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort4 mask);
+ushort4 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort4 mask);
+ushort4 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort4 mask);
+
+int4 __ovld __cnfn shuffle2(int2 x, int2 y, uint4 mask);
+int4 __ovld __cnfn shuffle2(int4 x, int4 y, uint4 mask);
+int4 __ovld __cnfn shuffle2(int8 x, int8 y, uint4 mask);
+int4 __ovld __cnfn shuffle2(int16 x, int16 y, uint4 mask);
+
+uint4 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint4 mask);
+uint4 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint4 mask);
+uint4 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint4 mask);
+uint4 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint4 mask);
+
+long4 __ovld __cnfn shuffle2(long2 x, long2 y, ulong4 mask);
+long4 __ovld __cnfn shuffle2(long4 x, long4 y, ulong4 mask);
+long4 __ovld __cnfn shuffle2(long8 x, long8 y, ulong4 mask);
+long4 __ovld __cnfn shuffle2(long16 x, long16 y, ulong4 mask);
+
+ulong4 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong4 mask);
+ulong4 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong4 mask);
+ulong4 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong4 mask);
+ulong4 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong4 mask);
+
+float4 __ovld __cnfn shuffle2(float2 x, float2 y, uint4 mask);
+float4 __ovld __cnfn shuffle2(float4 x, float4 y, uint4 mask);
+float4 __ovld __cnfn shuffle2(float8 x, float8 y, uint4 mask);
+float4 __ovld __cnfn shuffle2(float16 x, float16 y, uint4 mask);
+
+char8 __ovld __cnfn shuffle2(char2 x, char2 y, uchar8 mask);
+char8 __ovld __cnfn shuffle2(char4 x, char4 y, uchar8 mask);
+char8 __ovld __cnfn shuffle2(char8 x, char8 y, uchar8 mask);
+char8 __ovld __cnfn shuffle2(char16 x, char16 y, uchar8 mask);
+
+uchar8 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar8 mask);
+uchar8 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar8 mask);
+uchar8 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar8 mask);
+uchar8 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar8 mask);
+
+short8 __ovld __cnfn shuffle2(short2 x, short2 y, ushort8 mask);
+short8 __ovld __cnfn shuffle2(short4 x, short4 y, ushort8 mask);
+short8 __ovld __cnfn shuffle2(short8 x, short8 y, ushort8 mask);
+short8 __ovld __cnfn shuffle2(short16 x, short16 y, ushort8 mask);
+
+ushort8 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort8 mask);
+ushort8 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort8 mask);
+ushort8 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort8 mask);
+ushort8 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort8 mask);
+
+int8 __ovld __cnfn shuffle2(int2 x, int2 y, uint8 mask);
+int8 __ovld __cnfn shuffle2(int4 x, int4 y, uint8 mask);
+int8 __ovld __cnfn shuffle2(int8 x, int8 y, uint8 mask);
+int8 __ovld __cnfn shuffle2(int16 x, int16 y, uint8 mask);
+
+uint8 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint8 mask);
+uint8 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint8 mask);
+uint8 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint8 mask);
+uint8 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint8 mask);
+
+long8 __ovld __cnfn shuffle2(long2 x, long2 y, ulong8 mask);
+long8 __ovld __cnfn shuffle2(long4 x, long4 y, ulong8 mask);
+long8 __ovld __cnfn shuffle2(long8 x, long8 y, ulong8 mask);
+long8 __ovld __cnfn shuffle2(long16 x, long16 y, ulong8 mask);
+
+ulong8 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong8 mask);
+ulong8 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong8 mask);
+ulong8 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong8 mask);
+ulong8 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong8 mask);
+
+float8 __ovld __cnfn shuffle2(float2 x, float2 y, uint8 mask);
+float8 __ovld __cnfn shuffle2(float4 x, float4 y, uint8 mask);
+float8 __ovld __cnfn shuffle2(float8 x, float8 y, uint8 mask);
+float8 __ovld __cnfn shuffle2(float16 x, float16 y, uint8 mask);
+
+char16 __ovld __cnfn shuffle2(char2 x, char2 y, uchar16 mask);
+char16 __ovld __cnfn shuffle2(char4 x, char4 y, uchar16 mask);
+char16 __ovld __cnfn shuffle2(char8 x, char8 y, uchar16 mask);
+char16 __ovld __cnfn shuffle2(char16 x, char16 y, uchar16 mask);
+
+uchar16 __ovld __cnfn shuffle2(uchar2 x, uchar2 y, uchar16 mask);
+uchar16 __ovld __cnfn shuffle2(uchar4 x, uchar4 y, uchar16 mask);
+uchar16 __ovld __cnfn shuffle2(uchar8 x, uchar8 y, uchar16 mask);
+uchar16 __ovld __cnfn shuffle2(uchar16 x, uchar16 y, uchar16 mask);
+
+short16 __ovld __cnfn shuffle2(short2 x, short2 y, ushort16 mask);
+short16 __ovld __cnfn shuffle2(short4 x, short4 y, ushort16 mask);
+short16 __ovld __cnfn shuffle2(short8 x, short8 y, ushort16 mask);
+short16 __ovld __cnfn shuffle2(short16 x, short16 y, ushort16 mask);
+
+ushort16 __ovld __cnfn shuffle2(ushort2 x, ushort2 y, ushort16 mask);
+ushort16 __ovld __cnfn shuffle2(ushort4 x, ushort4 y, ushort16 mask);
+ushort16 __ovld __cnfn shuffle2(ushort8 x, ushort8 y, ushort16 mask);
+ushort16 __ovld __cnfn shuffle2(ushort16 x, ushort16 y, ushort16 mask);
+
+int16 __ovld __cnfn shuffle2(int2 x, int2 y, uint16 mask);
+int16 __ovld __cnfn shuffle2(int4 x, int4 y, uint16 mask);
+int16 __ovld __cnfn shuffle2(int8 x, int8 y, uint16 mask);
+int16 __ovld __cnfn shuffle2(int16 x, int16 y, uint16 mask);
+
+uint16 __ovld __cnfn shuffle2(uint2 x, uint2 y, uint16 mask);
+uint16 __ovld __cnfn shuffle2(uint4 x, uint4 y, uint16 mask);
+uint16 __ovld __cnfn shuffle2(uint8 x, uint8 y, uint16 mask);
+uint16 __ovld __cnfn shuffle2(uint16 x, uint16 y, uint16 mask);
+
+long16 __ovld __cnfn shuffle2(long2 x, long2 y, ulong16 mask);
+long16 __ovld __cnfn shuffle2(long4 x, long4 y, ulong16 mask);
+long16 __ovld __cnfn shuffle2(long8 x, long8 y, ulong16 mask);
+long16 __ovld __cnfn shuffle2(long16 x, long16 y, ulong16 mask);
+
+ulong16 __ovld __cnfn shuffle2(ulong2 x, ulong2 y, ulong16 mask);
+ulong16 __ovld __cnfn shuffle2(ulong4 x, ulong4 y, ulong16 mask);
+ulong16 __ovld __cnfn shuffle2(ulong8 x, ulong8 y, ulong16 mask);
+ulong16 __ovld __cnfn shuffle2(ulong16 x, ulong16 y, ulong16 mask);
+
+float16 __ovld __cnfn shuffle2(float2 x, float2 y, uint16 mask);
+float16 __ovld __cnfn shuffle2(float4 x, float4 y, uint16 mask);
+float16 __ovld __cnfn shuffle2(float8 x, float8 y, uint16 mask);
+float16 __ovld __cnfn shuffle2(float16 x, float16 y, uint16 mask);
+
+#ifdef cl_khr_fp64
+double2 __ovld __cnfn shuffle2(double2 x, double2 y, ulong2 mask);
+double2 __ovld __cnfn shuffle2(double4 x, double4 y, ulong2 mask);
+double2 __ovld __cnfn shuffle2(double8 x, double8 y, ulong2 mask);
+double2 __ovld __cnfn shuffle2(double16 x, double16 y, ulong2 mask);
+
+double4 __ovld __cnfn shuffle2(double2 x, double2 y, ulong4 mask);
+double4 __ovld __cnfn shuffle2(double4 x, double4 y, ulong4 mask);
+double4 __ovld __cnfn shuffle2(double8 x, double8 y, ulong4 mask);
+double4 __ovld __cnfn shuffle2(double16 x, double16 y, ulong4 mask);
+
+double8 __ovld __cnfn shuffle2(double2 x, double2 y, ulong8 mask);
+double8 __ovld __cnfn shuffle2(double4 x, double4 y, ulong8 mask);
+double8 __ovld __cnfn shuffle2(double8 x, double8 y, ulong8 mask);
+double8 __ovld __cnfn shuffle2(double16 x, double16 y, ulong8 mask);
+
+double16 __ovld __cnfn shuffle2(double2 x, double2 y, ulong16 mask);
+double16 __ovld __cnfn shuffle2(double4 x, double4 y, ulong16 mask);
+double16 __ovld __cnfn shuffle2(double8 x, double8 y, ulong16 mask);
+double16 __ovld __cnfn shuffle2(double16 x, double16 y, ulong16 mask);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half2 __ovld __cnfn shuffle2(half2 x, half2 y, ushort2 mask);
+half2 __ovld __cnfn shuffle2(half4 x, half4 y, ushort2 mask);
+half2 __ovld __cnfn shuffle2(half8 x, half8 y, ushort2 mask);
+half2 __ovld __cnfn shuffle2(half16 x, half16 y, ushort2 mask);
+
+half4 __ovld __cnfn shuffle2(half2 x, half2 y, ushort4 mask);
+half4 __ovld __cnfn shuffle2(half4 x, half4 y, ushort4 mask);
+half4 __ovld __cnfn shuffle2(half8 x, half8 y, ushort4 mask);
+half4 __ovld __cnfn shuffle2(half16 x, half16 y, ushort4 mask);
+
+half8 __ovld __cnfn shuffle2(half2 x, half2 y, ushort8 mask);
+half8 __ovld __cnfn shuffle2(half4 x, half4 y, ushort8 mask);
+half8 __ovld __cnfn shuffle2(half8 x, half8 y, ushort8 mask);
+half8 __ovld __cnfn shuffle2(half16 x, half16 y, ushort8 mask);
+
+half16 __ovld __cnfn shuffle2(half2 x, half2 y, ushort16 mask);
+half16 __ovld __cnfn shuffle2(half4 x, half4 y, ushort16 mask);
+half16 __ovld __cnfn shuffle2(half8 x, half8 y, ushort16 mask);
+half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask);
+#endif //cl_khr_fp16
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
+// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
+
+int printf(__constant const char* st, ...);
+#endif
+
+// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
+
+// These values need to match the runtime equivalent
+//
+// Addressing Mode.
+//
+#define CLK_ADDRESS_NONE                0
+#define CLK_ADDRESS_CLAMP_TO_EDGE       2
+#define CLK_ADDRESS_CLAMP               4
+#define CLK_ADDRESS_REPEAT              6
+#define CLK_ADDRESS_MIRRORED_REPEAT     8
+
+//
+// Coordination Normalization
+//
+#define CLK_NORMALIZED_COORDS_FALSE     0
+#define CLK_NORMALIZED_COORDS_TRUE      1
+
+//
+// Filtering Mode.
+//
+#define CLK_FILTER_NEAREST              0x10
+#define CLK_FILTER_LINEAR               0x20
+
+#ifdef cl_khr_gl_msaa_sharing
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
+#endif //cl_khr_gl_msaa_sharing
+
+/**
+ * Use the coordinate (coord.xy) to do an element lookup in
+ * the 2D image object specified by image.
+ *
+ * Use the coordinate (coord.x, coord.y, coord.z) to do
+ * an element lookup in the 3D image object specified
+ * by image. coord.w is ignored.
+ *
+ * Use the coordinate (coord.z) to index into the
+ * 2D image array object specified by image_array
+ * and (coord.x, coord.y) to do an element lookup in
+ * the 2D image object specified by image.
+ *
+ * Use the coordinate (x) to do an element lookup in
+ * the 1D image object specified by image.
+ *
+ * Use the coordinate (coord.y) to index into the
+ * 1D image array object specified by image_array
+ * and (coord.x) to do an element lookup in
+ * the 1D image object specified by image.
+ *
+ * Use the coordinate (cood.xy) and sample to do an
+ * element lookup in the 2D multi-sample image specified
+ * by image.
+ *
+ * Use coord.xy and sample to do an element
+ * lookup in the 2D multi-sample image layer
+ * identified by index coord.z in the 2D multi-sample
+ * image array specified by image.
+ *
+ * For mipmap images, use the mip-level specified by
+ * the Level-of-Detail (lod) or use gradients for LOD
+ * computation.
+ *
+ * read_imagef returns floating-point values in the
+ * range [0.0 ... 1.0] for image objects created with
+ * image_channel_data_type set to one of the predefined
+ * packed formats or CL_UNORM_INT8, or
+ * CL_UNORM_INT16.
+ *
+ * read_imagef returns floating-point values in the
+ * range [-1.0 ... 1.0] for image objects created with
+ * image_channel_data_type set to CL_SNORM_INT8,
+ * or CL_SNORM_INT16.
+ *
+ * read_imagef returns floating-point values for image
+ * objects created with image_channel_data_type set to
+ * CL_HALF_FLOAT or CL_FLOAT.
+ *
+ * read_imagei and read_imageui return
+ * unnormalized signed integer and unsigned integer
+ * values respectively. Each channel will be stored in a
+ * 32-bit integer.
+ *
+ * read_imagei can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_SIGNED_INT8,
+ * CL_SIGNED_INT16 and
+ * CL_SIGNED_INT32.
+ * If the image_channel_data_type is not one of the
+ * above values, the values returned by read_imagei
+ * are undefined.
+ *
+ * read_imageui can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_UNSIGNED_INT8,
+ * CL_UNSIGNED_INT16 and
+ * CL_UNSIGNED_INT32.
+ * If the image_channel_data_type is not one of the
+ * above values, the values returned by read_imageui
+ * are undefined.
+ *
+ * The read_image{i|ui} calls support a nearest filter
+ * only. The filter_mode specified in sampler
+ * must be set to CLK_FILTER_NEAREST; otherwise
+ * the values returned are undefined.
+ 
+ * The read_image{f|i|ui} calls that take
+ * integer coordinates must use a sampler with
+ * normalized coordinates set to
+ * CLK_NORMALIZED_COORDS_FALSE and
+ * addressing mode set to
+ * CLK_ADDRESS_CLAMP_TO_EDGE,
+ * CLK_ADDRESS_CLAMP or CLK_ADDRESS_NONE;
+ * otherwise the values returned are undefined.
+ *
+ * Values returned by read_imagef for image objects
+ * with image_channel_data_type values not specified
+ * in the description above are undefined.
+ */
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, int2 coord);
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord);
+
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, int4 coord);
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord);
+
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
+
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, int coord);
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord);
+
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, int coord);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, int coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
+
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord);
+
+#ifdef cl_khr_depth_images
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord);
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, int2 coord);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord);
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, int4 coord);
+#endif //cl_khr_depth_images
+
+#if defined(cl_khr_gl_msaa_sharing)
+float4 __purefn __ovld read_imagef(read_only image2d_msaa_t image, int2 coord, int sample);
+int4 __purefn __ovld read_imagei(read_only image2d_msaa_t image, int2 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_only image2d_msaa_t image, int2 coord, int sample);
+
+float __purefn __ovld read_imagef(read_only image2d_msaa_depth_t image, int2 coord, int sample);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_msaa_t image, int4 coord, int sample);
+int4 __purefn __ovld read_imagei(read_only image2d_array_msaa_t image, int4 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_msaa_t image, int4 coord, int sample);
+
+float __purefn __ovld read_imagef(read_only image2d_array_msaa_depth_t image, int4 coord, int sample);
+#endif //cl_khr_gl_msaa_sharing
+
+// OpenCL Extension v2.0 s9.18 - Mipmaps
+#ifdef cl_khr_mipmap_image
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
+
+#endif //cl_khr_mipmap_image
+
+/**
+* Sampler-less Image Access
+*/
+
+float4 __purefn __ovld read_imagef(read_only image1d_t image, int coord);
+int4 __purefn __ovld read_imagei(read_only image1d_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_buffer_t image, int coord);
+int4 __purefn __ovld read_imagei(read_only image1d_buffer_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_buffer_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_only image1d_array_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image1d_array_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image1d_array_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_only image2d_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_only image2d_array_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image2d_array_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image2d_array_t image, int4 coord);
+
+#ifdef cl_khr_depth_images
+float __purefn __ovld read_imagef(read_only image2d_depth_t image, int2 coord);
+float __purefn __ovld read_imagef(read_only image2d_array_depth_t image, int4 coord);
+#endif //cl_khr_depth_images
+
+float4 __purefn __ovld read_imagef(read_only image3d_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_only image3d_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_only image3d_t image, int4 coord);
+
+// Image read functions returning half4 type
+#ifdef cl_khr_fp16
+half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, int coord);
+half4 __purefn __ovld read_imageh(read_only image1d_t image, sampler_t sampler, float coord);
+half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, float2 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, float2 coord);
+half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, float4 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, float4 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_t image, int coord);
+half4 __purefn __ovld read_imageh(read_only image2d_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image3d_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_array_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_only image2d_array_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_only image1d_buffer_t image, int coord);
+#endif //cl_khr_fp16
+
+// Image read functions for read_write images
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+float4 __purefn __ovld read_imagef(read_write image1d_t image, int coord);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_write image1d_buffer_t image, int coord);
+int4 __purefn __ovld read_imagei(read_write image1d_buffer_t image, int coord);
+uint4 __purefn __ovld read_imageui(read_write image1d_buffer_t image, int coord);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, int2 coord);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, int2 coord);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, int2 coord);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image, int4 coord);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, int4 coord);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, int4 coord);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, int4 coord);
+
+#ifdef cl_khr_depth_images
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, int2 coord);
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, int4 coord);
+#endif //cl_khr_depth_images
+
+#if cl_khr_gl_msaa_sharing
+float4 __purefn __ovld read_imagef(read_write image2d_msaa_t image, int2 coord, int sample);
+int4 __purefn __ovld read_imagei(read_write image2d_msaa_t image, int2 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_write image2d_msaa_t image, int2 coord, int sample);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_msaa_t image, int4 coord, int sample);
+int4 __purefn __ovld read_imagei(read_write image2d_array_msaa_t image, int4 coord, int sample);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_msaa_t image, int4 coord, int sample);
+
+float __purefn __ovld read_imagef(read_write image2d_msaa_depth_t image, int2 coord, int sample);
+float __purefn __ovld read_imagef(read_write image2d_array_msaa_depth_t image, int4 coord, int sample);
+#endif //cl_khr_gl_msaa_sharing
+
+#ifdef cl_khr_mipmap_image
+float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float gradientX, float gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float2 gradientX, float2 gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY);
+
+float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_t image, sampler_t sampler, float coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image1d_array_t image_array, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_t image, sampler_t sampler, float2 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_depth_t image, sampler_t sampler, float2 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image2d_array_t image_array, sampler_t sampler, float4 coord, float lod);
+
+float __purefn __ovld read_imagef(read_write image2d_array_depth_t image, sampler_t sampler, float4 coord, float lod);
+
+float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
+#endif //cl_khr_mipmap_image
+
+// Image read functions returning half4 type
+#ifdef cl_khr_fp16
+half4 __purefn __ovld read_imageh(read_write image1d_t image, int coord);
+half4 __purefn __ovld read_imageh(read_write image2d_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_write image3d_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_write image1d_array_t image, int2 coord);
+half4 __purefn __ovld read_imageh(read_write image2d_array_t image, int4 coord);
+half4 __purefn __ovld read_imageh(read_write image1d_buffer_t image, int coord);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Write color value to location specified by coordinate
+ * (coord.x, coord.y) in the 2D image object specified by image.
+ * (coord.x, coord.y) are considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1, and 0
+ * ... image height - 1.
+
+ * Write color value to location specified by coordinate
+ * (coord.x, coord.y) in the 2D image object specified by index
+ * (coord.z) of the 2D image array object image_array.
+ * (coord.x, coord.y) are considered to be unnormalized
+ * coordinates and must be in the range 0 ... image width
+ * - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord) in the 1D image (buffer) object specified by image.
+ * coord is considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord.x) in the 1D image object specified by index
+ * (coord.y) of the 1D image array object image_array.
+ * x is considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1.
+ *
+ * Write color value to location specified by coordinate
+ * (coord.x, coord.y, coord.z) in the 3D image object specified by image.
+ * coord.x & coord.y are considered to be unnormalized coordinates
+ * and must be in the range 0 ... image width - 1, and 0
+ * ... image height - 1.
+ *
+ * For mipmap images, use mip-level specified by lod.
+ *
+ * Appropriate data format conversion to the specified
+ * image format is done before writing the color value.
+ *
+ * write_imagef can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the pre-defined packed formats or set to
+ * CL_SNORM_INT8, CL_UNORM_INT8,
+ * CL_SNORM_INT16, CL_UNORM_INT16,
+ * CL_HALF_FLOAT or CL_FLOAT. Appropriate data
+ * format conversion will be done to convert channel
+ * data from a floating-point value to actual data format
+ * in which the channels are stored.
+ *
+ * write_imagei can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_SIGNED_INT8,
+ * CL_SIGNED_INT16 and
+ * CL_SIGNED_INT32.
+ *
+ * write_imageui can only be used with image objects
+ * created with image_channel_data_type set to one of
+ * the following values:
+ * CL_UNSIGNED_INT8,
+ * CL_UNSIGNED_INT16 and
+ * CL_UNSIGNED_INT32.
+ *
+ * The behavior of write_imagef, write_imagei and
+ * write_imageui for image objects created with
+ * image_channel_data_type values not specified in
+ * the description above or with (x, y) coordinate
+ * values that are not in the range (0 ... image width -1,
+ * 0 ... image height - 1), respectively, is undefined.
+ */
+void __ovld write_imagef(write_only image2d_t image, int2 coord, float4 color);
+void __ovld write_imagei(write_only image2d_t image, int2 coord, int4 color);
+void __ovld write_imageui(write_only image2d_t image, int2 coord, uint4 color);
+
+void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, float4 color);
+void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int4 color);
+void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, uint4 color);
+
+void __ovld write_imagef(write_only image1d_t image, int coord, float4 color);
+void __ovld write_imagei(write_only image1d_t image, int coord, int4 color);
+void __ovld write_imageui(write_only image1d_t image, int coord, uint4 color);
+
+void __ovld write_imagef(write_only image1d_buffer_t image, int coord, float4 color);
+void __ovld write_imagei(write_only image1d_buffer_t image, int coord, int4 color);
+void __ovld write_imageui(write_only image1d_buffer_t image, int coord, uint4 color);
+
+void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, float4 color);
+void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int4 color);
+void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, uint4 color);
+
+void __ovld write_imagef(write_only image3d_t image, int4 coord, float4 color);
+void __ovld write_imagei(write_only image3d_t image, int4 coord, int4 color);
+void __ovld write_imageui(write_only image3d_t image, int4 coord, uint4 color);
+
+#ifdef cl_khr_depth_images
+void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, float color);
+void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, float color);
+#endif //cl_khr_depth_images
+
+// OpenCL Extension v2.0 s9.18 - Mipmaps
+#ifdef cl_khr_mipmap_image
+void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color);
+void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color);
+void __ovld write_imageui(write_only image1d_t image, int coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image2d_t image, int2 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image2d_t image, int2 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image2d_t image, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, int lod, uint4 color);
+
+void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color);
+void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color);
+
+void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color);
+void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color);
+void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color);
+#endif //cl_khr_mipmap_image
+
+// Image write functions for half4 type
+#ifdef cl_khr_fp16
+void __ovld write_imageh(write_only image1d_t image, int coord, half4 color);
+void __ovld write_imageh(write_only image2d_t image, int2 coord, half4 color);
+void __ovld write_imageh(write_only image3d_t image, int4 coord, half4 color);
+void __ovld write_imageh(write_only image1d_array_t image, int2 coord, half4 color);
+void __ovld write_imageh(write_only image2d_array_t image, int4 coord, half4 color);
+void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 color);
+#endif //cl_khr_fp16
+
+// Image write functions for read_write images
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void __ovld write_imagef(read_write image2d_t image, int2 coord, float4 color);
+void __ovld write_imagei(read_write image2d_t image, int2 coord, int4 color);
+void __ovld write_imageui(read_write image2d_t image, int2 coord, uint4 color);
+
+void __ovld write_imagef(read_write image2d_array_t image_array, int4 coord, float4 color);
+void __ovld write_imagei(read_write image2d_array_t image_array, int4 coord, int4 color);
+void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, uint4 color);
+
+void __ovld write_imagef(read_write image1d_t image, int coord, float4 color);
+void __ovld write_imagei(read_write image1d_t image, int coord, int4 color);
+void __ovld write_imageui(read_write image1d_t image, int coord, uint4 color);
+
+void __ovld write_imagef(read_write image1d_buffer_t image, int coord, float4 color);
+void __ovld write_imagei(read_write image1d_buffer_t image, int coord, int4 color);
+void __ovld write_imageui(read_write image1d_buffer_t image, int coord, uint4 color);
+
+void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, float4 color);
+void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int4 color);
+void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, uint4 color);
+
+void __ovld write_imagef(read_write image3d_t image, int4 coord, float4 color);
+void __ovld write_imagei(read_write image3d_t image, int4 coord, int4 color);
+void __ovld write_imageui(read_write image3d_t image, int4 coord, uint4 color);
+
+#ifdef cl_khr_depth_images
+void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, float color);
+void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, float color);
+#endif //cl_khr_depth_images
+
+#ifdef cl_khr_mipmap_image
+void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color);
+void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color);
+void __ovld write_imageui(read_write image1d_t image, int coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image2d_t image, int2 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image2d_t image, int2 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image2d_t image, int2 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image2d_array_t image_array, int4 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image2d_array_t image_array, int4 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, int lod, uint4 color);
+
+void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, int lod, float color);
+void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int lod, float color);
+
+void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color);
+void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color);
+void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color);
+#endif //cl_khr_mipmap_image
+
+// Image write functions for half4 type
+#ifdef cl_khr_fp16
+void __ovld write_imageh(read_write image1d_t image, int coord, half4 color);
+void __ovld write_imageh(read_write image2d_t image, int2 coord, half4 color);
+void __ovld write_imageh(read_write image3d_t image, int4 coord, half4 color);
+void __ovld write_imageh(read_write image1d_array_t image, int2 coord, half4 color);
+void __ovld write_imageh(read_write image2d_array_t image, int4 coord, half4 color);
+void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 color);
+#endif //cl_khr_fp16
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// Note: In OpenCL v1.0/1.1/1.2, image argument of image query builtin functions does not have
+// access qualifier, which by default assume read_only access qualifier. Image query builtin
+// functions with write_only image argument should also be declared.
+
+/**
+ * Return the image width in pixels.
+ *
+  */
+int __ovld __cnfn get_image_width(read_only image1d_t image);
+int __ovld __cnfn get_image_width(read_only image1d_buffer_t image);
+int __ovld __cnfn get_image_width(read_only image2d_t image);
+int __ovld __cnfn get_image_width(read_only image3d_t image);
+int __ovld __cnfn get_image_width(read_only image1d_array_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_width(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_width(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_width(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_width(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_width(write_only image1d_t image);
+int __ovld __cnfn get_image_width(write_only image1d_buffer_t image);
+int __ovld __cnfn get_image_width(write_only image2d_t image);
+int __ovld __cnfn get_image_width(write_only image3d_t image);
+int __ovld __cnfn get_image_width(write_only image1d_array_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_width(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_width(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_width(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_width(read_write image1d_t image);
+int __ovld __cnfn get_image_width(read_write image1d_buffer_t image);
+int __ovld __cnfn get_image_width(read_write image2d_t image);
+int __ovld __cnfn get_image_width(read_write image3d_t image);
+int __ovld __cnfn get_image_width(read_write image1d_array_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_width(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_width(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_width(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_width(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the image height in pixels.
+ */
+int __ovld __cnfn get_image_height(read_only image2d_t image);
+int __ovld __cnfn get_image_height(read_only image3d_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_height(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_height(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_height(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_height(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_height(write_only image2d_t image);
+int __ovld __cnfn get_image_height(write_only image3d_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_height(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_height(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_height(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_height(read_write image2d_t image);
+int __ovld __cnfn get_image_height(read_write image3d_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_height(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_height(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_height(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_height(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the image depth in pixels.
+ */
+int __ovld __cnfn get_image_depth(read_only image3d_t image);
+
+int __ovld __cnfn get_image_depth(write_only image3d_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_depth(read_write image3d_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL Extension v2.0 s9.18 - Mipmaps
+#ifdef cl_khr_mipmap_image
+/**
+ * Return the image miplevels.
+ */
+
+int __ovld get_image_num_mip_levels(read_only image1d_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_t image);
+int __ovld get_image_num_mip_levels(read_only image3d_t image);
+
+int __ovld get_image_num_mip_levels(write_only image1d_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_t image);
+int __ovld get_image_num_mip_levels(write_only image3d_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld get_image_num_mip_levels(read_write image1d_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_t image);
+int __ovld get_image_num_mip_levels(read_write image3d_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+int __ovld get_image_num_mip_levels(read_only image1d_array_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_array_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_array_depth_t image);
+int __ovld get_image_num_mip_levels(read_only image2d_depth_t image);
+
+int __ovld get_image_num_mip_levels(write_only image1d_array_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_array_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t image);
+int __ovld get_image_num_mip_levels(write_only image2d_depth_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld get_image_num_mip_levels(read_write image1d_array_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_array_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t image);
+int __ovld get_image_num_mip_levels(read_write image2d_depth_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#endif //cl_khr_mipmap_image
+
+/**
+ * Return the channel data type. Valid values are:
+ * CLK_SNORM_INT8
+ * CLK_SNORM_INT16
+ * CLK_UNORM_INT8
+ * CLK_UNORM_INT16
+ * CLK_UNORM_SHORT_565
+ * CLK_UNORM_SHORT_555
+ * CLK_UNORM_SHORT_101010
+ * CLK_SIGNED_INT8
+ * CLK_SIGNED_INT16
+ * CLK_SIGNED_INT32
+ * CLK_UNSIGNED_INT8
+ * CLK_UNSIGNED_INT16
+ * CLK_UNSIGNED_INT32
+ * CLK_HALF_FLOAT
+ * CLK_FLOAT
+ */
+
+//
+// Channel Datatype.
+//
+#define CLK_SNORM_INT8        0x10D0
+#define CLK_SNORM_INT16       0x10D1
+#define CLK_UNORM_INT8        0x10D2
+#define CLK_UNORM_INT16       0x10D3
+#define CLK_UNORM_SHORT_565   0x10D4
+#define CLK_UNORM_SHORT_555   0x10D5
+#define CLK_UNORM_INT_101010  0x10D6
+#define CLK_SIGNED_INT8       0x10D7
+#define CLK_SIGNED_INT16      0x10D8
+#define CLK_SIGNED_INT32      0x10D9
+#define CLK_UNSIGNED_INT8     0x10DA
+#define CLK_UNSIGNED_INT16    0x10DB
+#define CLK_UNSIGNED_INT32    0x10DC
+#define CLK_HALF_FLOAT        0x10DD
+#define CLK_FLOAT             0x10DE
+#define CLK_UNORM_INT24       0x10DF
+
+int __ovld __cnfn get_image_channel_data_type(read_only image1d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image3d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_channel_data_type(write_only image1d_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image3d_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_channel_data_type(read_write image1d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image3d_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image1d_array_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the image channel order. Valid values are:
+ * CLK_A
+ * CLK_R
+ * CLK_Rx
+ * CLK_RG
+ * CLK_RGx
+ * CLK_RA
+ * CLK_RGB
+ * CLK_RGBx
+ * CLK_RGBA
+ * CLK_ARGB
+ * CLK_BGRA
+ * CLK_INTENSITY
+ * CLK_LUMINANCE
+ */
+// Channel order, numbering must be aligned with cl_channel_order in cl.h
+//
+#define CLK_R         0x10B0
+#define CLK_A         0x10B1
+#define CLK_RG        0x10B2
+#define CLK_RA        0x10B3
+#define CLK_RGB       0x10B4
+#define CLK_RGBA      0x10B5
+#define CLK_BGRA      0x10B6
+#define CLK_ARGB      0x10B7
+#define CLK_INTENSITY 0x10B8
+#define CLK_LUMINANCE 0x10B9
+#define CLK_Rx                0x10BA
+#define CLK_RGx               0x10BB
+#define CLK_RGBx              0x10BC
+#define CLK_DEPTH             0x10BD
+#define CLK_DEPTH_STENCIL     0x10BE
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#define CLK_sRGB              0x10BF
+#define CLK_sRGBA             0x10C1
+#define CLK_sRGBx             0x10C0
+#define CLK_sBGRA             0x10C2
+#define CLK_ABGR              0x10C3
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+int __ovld __cnfn get_image_channel_order(read_only image1d_t image);
+int __ovld __cnfn get_image_channel_order(read_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_t image);
+int __ovld __cnfn get_image_channel_order(read_only image3d_t image);
+int __ovld __cnfn get_image_channel_order(read_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_order(read_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_order(read_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int __ovld __cnfn get_image_channel_order(write_only image1d_t image);
+int __ovld __cnfn get_image_channel_order(write_only image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_t image);
+int __ovld __cnfn get_image_channel_order(write_only image3d_t image);
+int __ovld __cnfn get_image_channel_order(write_only image1d_array_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_order(write_only image2d_depth_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_order(write_only image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __cnfn get_image_channel_order(read_write image1d_t image);
+int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_t image);
+int __ovld __cnfn get_image_channel_order(read_write image3d_t image);
+int __ovld __cnfn get_image_channel_order(read_write image1d_array_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int __ovld __cnfn get_image_channel_order(read_write image2d_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_depth_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_t image);
+int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the 2D image width and height as an int2
+ * type. The width is returned in the x component, and
+ * the height in the y component.
+ */
+int2 __ovld __cnfn get_image_dim(read_only image2d_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int2 __ovld __cnfn get_image_dim(read_only image2d_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_msaa_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+int2 __ovld __cnfn get_image_dim(write_only image2d_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_t image);
+#ifdef cl_khr_depth_images
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_depth_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int2 __ovld __cnfn get_image_dim(write_only image2d_msaa_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_msaa_depth_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_t image);
+int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int2 __ovld __cnfn get_image_dim(read_write image2d_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_t image);
+#ifdef cl_khr_depth_images
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_depth_t image);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_depth_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_t image);
+int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the 3D image width, height, and depth as an
+ * int4 type. The width is returned in the x
+ * component, height in the y component, depth in the z
+ * component and the w component is 0.
+ */
+int4 __ovld __cnfn get_image_dim(read_only image3d_t image);
+int4 __ovld __cnfn get_image_dim(write_only image3d_t image);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int4 __ovld __cnfn get_image_dim(read_write image3d_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+ * Return the image array size.
+ */
+
+size_t __ovld __cnfn get_image_array_size(read_only image1d_array_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_t image_array);
+#ifdef cl_khr_depth_images
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_depth_t image_array);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_msaa_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_only image2d_array_msaa_depth_t image_array);
+#endif //cl_khr_gl_msaa_sharing
+
+size_t __ovld __cnfn get_image_array_size(write_only image1d_array_t image_array);
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_t image_array);
+#ifdef cl_khr_depth_images
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_depth_t image_array);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_t image_array);
+size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_depth_t image_array);
+#endif //cl_khr_gl_msaa_sharing
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+size_t __ovld __cnfn get_image_array_size(read_write image1d_array_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_t image_array);
+#ifdef cl_khr_depth_images
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_depth_t image_array);
+#endif //cl_khr_depth_images
+#if defined(cl_khr_gl_msaa_sharing)
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_t image_array);
+size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_depth_t image_array);
+#endif //cl_khr_gl_msaa_sharing
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+/**
+* Return the number of samples associated with image
+*/
+#if defined(cl_khr_gl_msaa_sharing)
+int __ovld get_image_num_samples(read_only image2d_msaa_t image);
+int __ovld get_image_num_samples(read_only image2d_msaa_depth_t image);
+int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image);
+int __ovld get_image_num_samples(read_only image2d_array_msaa_t image);
+int __ovld get_image_num_samples(read_only image2d_array_msaa_depth_t image);
+
+int __ovld get_image_num_samples(write_only image2d_msaa_t image);
+int __ovld get_image_num_samples(write_only image2d_msaa_depth_t image);
+int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image);
+int __ovld get_image_num_samples(write_only image2d_array_msaa_t image);
+int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image);
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld get_image_num_samples(read_write image2d_msaa_t image);
+int __ovld get_image_num_samples(read_write image2d_msaa_depth_t image);
+int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
+int __ovld get_image_num_samples(read_write image2d_array_msaa_t image);
+int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#endif
+
+// OpenCL v2.0 s6.13.15 - Work-group Functions
+
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+int __ovld __conv work_group_all(int predicate);
+int __ovld __conv work_group_any(int predicate);
+
+#ifdef cl_khr_fp16
+half __ovld __conv work_group_broadcast(half a, size_t local_id);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y);
+half __ovld __conv work_group_broadcast(half a, size_t x, size_t y, size_t z);
+#endif
+int __ovld __conv work_group_broadcast(int a, size_t local_id);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y);
+int __ovld __conv work_group_broadcast(int a, size_t x, size_t y, size_t z);
+uint __ovld __conv work_group_broadcast(uint a, size_t local_id);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y);
+uint __ovld __conv work_group_broadcast(uint a, size_t x, size_t y, size_t z);
+long __ovld __conv work_group_broadcast(long a, size_t local_id);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y);
+long __ovld __conv work_group_broadcast(long a, size_t x, size_t y, size_t z);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t local_id);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y);
+ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z);
+float __ovld __conv work_group_broadcast(float a, size_t local_id);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y);
+float __ovld __conv work_group_broadcast(float a, size_t x, size_t y, size_t z);
+#ifdef cl_khr_fp64
+double __ovld __conv work_group_broadcast(double a, size_t local_id);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y);
+double __ovld __conv work_group_broadcast(double a, size_t x, size_t y, size_t z);
+#endif //cl_khr_fp64
+
+#ifdef cl_khr_fp16
+half __ovld __conv work_group_reduce_add(half x);
+half __ovld __conv work_group_reduce_min(half x);
+half __ovld __conv work_group_reduce_max(half x);
+half __ovld __conv work_group_scan_exclusive_add(half x);
+half __ovld __conv work_group_scan_exclusive_min(half x);
+half __ovld __conv work_group_scan_exclusive_max(half x);
+half __ovld __conv work_group_scan_inclusive_add(half x);
+half __ovld __conv work_group_scan_inclusive_min(half x);
+half __ovld __conv work_group_scan_inclusive_max(half x);
+#endif
+int __ovld __conv work_group_reduce_add(int x);
+int __ovld __conv work_group_reduce_min(int x);
+int __ovld __conv work_group_reduce_max(int x);
+int __ovld __conv work_group_scan_exclusive_add(int x);
+int __ovld __conv work_group_scan_exclusive_min(int x);
+int __ovld __conv work_group_scan_exclusive_max(int x);
+int __ovld __conv work_group_scan_inclusive_add(int x);
+int __ovld __conv work_group_scan_inclusive_min(int x);
+int __ovld __conv work_group_scan_inclusive_max(int x);
+uint __ovld __conv work_group_reduce_add(uint x);
+uint __ovld __conv work_group_reduce_min(uint x);
+uint __ovld __conv work_group_reduce_max(uint x);
+uint __ovld __conv work_group_scan_exclusive_add(uint x);
+uint __ovld __conv work_group_scan_exclusive_min(uint x);
+uint __ovld __conv work_group_scan_exclusive_max(uint x);
+uint __ovld __conv work_group_scan_inclusive_add(uint x);
+uint __ovld __conv work_group_scan_inclusive_min(uint x);
+uint __ovld __conv work_group_scan_inclusive_max(uint x);
+long __ovld __conv work_group_reduce_add(long x);
+long __ovld __conv work_group_reduce_min(long x);
+long __ovld __conv work_group_reduce_max(long x);
+long __ovld __conv work_group_scan_exclusive_add(long x);
+long __ovld __conv work_group_scan_exclusive_min(long x);
+long __ovld __conv work_group_scan_exclusive_max(long x);
+long __ovld __conv work_group_scan_inclusive_add(long x);
+long __ovld __conv work_group_scan_inclusive_min(long x);
+long __ovld __conv work_group_scan_inclusive_max(long x);
+ulong __ovld __conv work_group_reduce_add(ulong x);
+ulong __ovld __conv work_group_reduce_min(ulong x);
+ulong __ovld __conv work_group_reduce_max(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_exclusive_max(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_add(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_min(ulong x);
+ulong __ovld __conv work_group_scan_inclusive_max(ulong x);
+float __ovld __conv work_group_reduce_add(float x);
+float __ovld __conv work_group_reduce_min(float x);
+float __ovld __conv work_group_reduce_max(float x);
+float __ovld __conv work_group_scan_exclusive_add(float x);
+float __ovld __conv work_group_scan_exclusive_min(float x);
+float __ovld __conv work_group_scan_exclusive_max(float x);
+float __ovld __conv work_group_scan_inclusive_add(float x);
+float __ovld __conv work_group_scan_inclusive_min(float x);
+float __ovld __conv work_group_scan_inclusive_max(float x);
+#ifdef cl_khr_fp64
+double __ovld __conv work_group_reduce_add(double x);
+double __ovld __conv work_group_reduce_min(double x);
+double __ovld __conv work_group_reduce_max(double x);
+double __ovld __conv work_group_scan_exclusive_add(double x);
+double __ovld __conv work_group_scan_exclusive_min(double x);
+double __ovld __conv work_group_scan_exclusive_max(double x);
+double __ovld __conv work_group_scan_inclusive_add(double x);
+double __ovld __conv work_group_scan_inclusive_min(double x);
+double __ovld __conv work_group_scan_inclusive_max(double x);
+#endif //cl_khr_fp64
+
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL v2.0 s6.13.16 - Pipe Functions
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+#define PIPE_RESERVE_ID_VALID_BIT (1U << 30)
+#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t))
+bool __ovld is_valid_reserve_id(reserve_id_t reserve_id);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+
+// OpenCL v2.0 s6.13.17 - Enqueue Kernels
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+#define CL_COMPLETE                                 0x0
+#define CL_RUNNING                                  0x1
+#define CL_SUBMITTED                                0x2
+#define CL_QUEUED                                   0x3
+
+#define CLK_SUCCESS                                 0
+#define CLK_ENQUEUE_FAILURE                         -101
+#define CLK_INVALID_QUEUE                           -102
+#define CLK_INVALID_NDRANGE                         -160
+#define CLK_INVALID_EVENT_WAIT_LIST                 -57
+#define CLK_DEVICE_QUEUE_FULL                       -161
+#define CLK_INVALID_ARG_SIZE                        -51
+#define CLK_EVENT_ALLOCATION_FAILURE                -100
+#define CLK_OUT_OF_RESOURCES                        -5
+
+#define CLK_NULL_QUEUE                              0
+#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t))
+
+// execution model related definitions
+#define CLK_ENQUEUE_FLAGS_NO_WAIT                   0x0
+#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL               0x1
+#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP           0x2
+
+typedef int kernel_enqueue_flags_t;
+typedef int clk_profiling_info;
+
+// Profiling info name (see capture_event_profiling_info)
+#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1
+
+#define MAX_WORK_DIM        3
+
+// ToDo: Remove definition of ndrange_t in Clang as an opaque type and add back
+// the following ndrange_t definition.
+#if 0
+typedef struct {
+    unsigned int workDimension;
+    size_t globalWorkOffset[MAX_WORK_DIM];
+    size_t globalWorkSize[MAX_WORK_DIM];
+    size_t localWorkSize[MAX_WORK_DIM];
+} ndrange_t;
+#endif
+
+ndrange_t __ovld ndrange_1D(size_t);
+ndrange_t __ovld ndrange_1D(size_t, size_t);
+ndrange_t __ovld ndrange_1D(size_t, size_t, size_t);
+
+ndrange_t __ovld ndrange_2D(const size_t[2]);
+ndrange_t __ovld ndrange_2D(const size_t[2], const size_t[2]);
+ndrange_t __ovld ndrange_2D(const size_t[2], const size_t[2], const size_t[2]);
+
+ndrange_t __ovld ndrange_3D(const size_t[3]);
+ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3]);
+ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3], const size_t[3]);
+
+int __ovld enqueue_marker(queue_t, uint, const __private clk_event_t*, __private clk_event_t*);
+
+void __ovld retain_event(clk_event_t);
+
+void __ovld release_event(clk_event_t);
+
+clk_event_t __ovld create_user_event(void);
+
+void __ovld set_user_event_status(clk_event_t e, int state);
+
+bool __ovld is_valid_event (clk_event_t event);
+
+void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value);
+
+queue_t __ovld get_default_queue(void);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+// OpenCL Extension v2.0 s9.17 - Sub-groups
+
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+// Shared Sub Group Functions
+uint    __ovld get_sub_group_size(void);
+uint    __ovld get_max_sub_group_size(void);
+uint    __ovld get_num_sub_groups(void);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+uint    __ovld get_enqueued_num_sub_groups(void);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+uint    __ovld get_sub_group_id(void);
+uint    __ovld get_sub_group_local_id(void);
+
+void    __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
+void    __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
+
+int     __ovld __conv sub_group_all(int predicate);
+int     __ovld __conv sub_group_any(int predicate);
+
+int     __ovld __conv sub_group_broadcast(int   x, uint sub_group_local_id);
+uint    __ovld __conv sub_group_broadcast(uint  x, uint sub_group_local_id);
+long    __ovld __conv sub_group_broadcast(long  x, uint sub_group_local_id);
+ulong   __ovld __conv sub_group_broadcast(ulong x, uint sub_group_local_id);
+float   __ovld __conv sub_group_broadcast(float x, uint sub_group_local_id);
+
+int     __ovld __conv sub_group_reduce_add(int   x);
+uint    __ovld __conv sub_group_reduce_add(uint  x);
+long    __ovld __conv sub_group_reduce_add(long  x);
+ulong   __ovld __conv sub_group_reduce_add(ulong x);
+float   __ovld __conv sub_group_reduce_add(float x);
+int     __ovld __conv sub_group_reduce_min(int   x);
+uint    __ovld __conv sub_group_reduce_min(uint  x);
+long    __ovld __conv sub_group_reduce_min(long  x);
+ulong   __ovld __conv sub_group_reduce_min(ulong x);
+float   __ovld __conv sub_group_reduce_min(float x);
+int     __ovld __conv sub_group_reduce_max(int   x);
+uint    __ovld __conv sub_group_reduce_max(uint  x);
+long    __ovld __conv sub_group_reduce_max(long  x);
+ulong   __ovld __conv sub_group_reduce_max(ulong x);
+float   __ovld __conv sub_group_reduce_max(float x);
+
+int     __ovld __conv sub_group_scan_exclusive_add(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_add(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_add(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_add(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_add(float x);
+int     __ovld __conv sub_group_scan_exclusive_min(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_min(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_min(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_min(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_min(float x);
+int     __ovld __conv sub_group_scan_exclusive_max(int   x);
+uint    __ovld __conv sub_group_scan_exclusive_max(uint  x);
+long    __ovld __conv sub_group_scan_exclusive_max(long  x);
+ulong   __ovld __conv sub_group_scan_exclusive_max(ulong x);
+float   __ovld __conv sub_group_scan_exclusive_max(float x);
+
+int     __ovld __conv sub_group_scan_inclusive_add(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_add(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_add(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_add(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_add(float x);
+int     __ovld __conv sub_group_scan_inclusive_min(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_min(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_min(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_min(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_min(float x);
+int     __ovld __conv sub_group_scan_inclusive_max(int   x);
+uint    __ovld __conv sub_group_scan_inclusive_max(uint  x);
+long    __ovld __conv sub_group_scan_inclusive_max(long  x);
+ulong   __ovld __conv sub_group_scan_inclusive_max(ulong x);
+float   __ovld __conv sub_group_scan_inclusive_max(float x);
+
+#ifdef cl_khr_fp16
+half    __ovld __conv sub_group_broadcast(half x, uint sub_group_local_id);
+half    __ovld __conv sub_group_reduce_add(half x);
+half    __ovld __conv sub_group_reduce_min(half x);
+half    __ovld __conv sub_group_reduce_max(half x);
+half    __ovld __conv sub_group_scan_exclusive_add(half x);
+half    __ovld __conv sub_group_scan_exclusive_min(half x);
+half    __ovld __conv sub_group_scan_exclusive_max(half x);
+half    __ovld __conv sub_group_scan_inclusive_add(half x);
+half    __ovld __conv sub_group_scan_inclusive_min(half x);
+half    __ovld __conv sub_group_scan_inclusive_max(half x);
+#endif //cl_khr_fp16
+
+#ifdef cl_khr_fp64
+double  __ovld __conv sub_group_broadcast(double x, uint sub_group_local_id);
+double  __ovld __conv sub_group_reduce_add(double x);
+double  __ovld __conv sub_group_reduce_min(double x);
+double  __ovld __conv sub_group_reduce_max(double x);
+double  __ovld __conv sub_group_scan_exclusive_add(double x);
+double  __ovld __conv sub_group_scan_exclusive_min(double x);
+double  __ovld __conv sub_group_scan_exclusive_max(double x);
+double  __ovld __conv sub_group_scan_inclusive_add(double x);
+double  __ovld __conv sub_group_scan_inclusive_min(double x);
+double  __ovld __conv sub_group_scan_inclusive_max(double x);
+#endif //cl_khr_fp64
+
+#endif //cl_khr_subgroups cl_intel_subgroups
+
+#ifdef cl_amd_media_ops
+uint __ovld amd_bitalign(uint a, uint b, uint c);
+uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bitalign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bitalign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bitalign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bitalign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_bytealign(uint a, uint b, uint c);
+uint2 __ovld amd_bytealign(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_bytealign(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_bytealign(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_bytealign(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_bytealign(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_lerp(uint a, uint b, uint c);
+uint2 __ovld amd_lerp(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_lerp(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_lerp(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_lerp(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_lerp(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_pack(float4 v);
+
+uint __ovld amd_sad4(uint4 x, uint4 y, uint z);
+
+uint __ovld amd_sadhi(uint a, uint b, uint c);
+uint2 __ovld amd_sadhi(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sadhi(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sadhi(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sadhi(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sadhi(uint16 a, uint16 b, uint16 c);
+
+uint __ovld amd_sad(uint a, uint b, uint c);
+uint2 __ovld amd_sad(uint2 a, uint2 b, uint2 c);
+uint3 __ovld amd_sad(uint3 a, uint3 b, uint3 c);
+uint4 __ovld amd_sad(uint4 a, uint4 b, uint4 c);
+uint8 __ovld amd_sad(uint8 a, uint8 b, uint8 c);
+uint16 __ovld amd_sad(uint16 a, uint16 b, uint16 c);
+
+float __ovld amd_unpack0(uint a);
+float2 __ovld amd_unpack0(uint2 a);
+float3 __ovld amd_unpack0(uint3 a);
+float4 __ovld amd_unpack0(uint4 a);
+float8 __ovld amd_unpack0(uint8 a);
+float16 __ovld amd_unpack0(uint16 a);
+
+float __ovld amd_unpack1(uint a);
+float2 __ovld amd_unpack1(uint2 a);
+float3 __ovld amd_unpack1(uint3 a);
+float4 __ovld amd_unpack1(uint4 a);
+float8 __ovld amd_unpack1(uint8 a);
+float16 __ovld amd_unpack1(uint16 a);
+
+float __ovld amd_unpack2(uint a);
+float2 __ovld amd_unpack2(uint2 a);
+float3 __ovld amd_unpack2(uint3 a);
+float4 __ovld amd_unpack2(uint4 a);
+float8 __ovld amd_unpack2(uint8 a);
+float16 __ovld amd_unpack2(uint16 a);
+
+float __ovld amd_unpack3(uint a);
+float2 __ovld amd_unpack3(uint2 a);
+float3 __ovld amd_unpack3(uint3 a);
+float4 __ovld amd_unpack3(uint4 a);
+float8 __ovld amd_unpack3(uint8 a);
+float16 __ovld amd_unpack3(uint16 a);
+#endif // cl_amd_media_ops
+
+#ifdef cl_amd_media_ops2
+int __ovld amd_bfe(int src0, uint src1, uint src2);
+int2 __ovld amd_bfe(int2 src0, uint2 src1, uint2 src2);
+int3 __ovld amd_bfe(int3 src0, uint3 src1, uint3 src2);
+int4 __ovld amd_bfe(int4 src0, uint4 src1, uint4 src2);
+int8 __ovld amd_bfe(int8 src0, uint8 src1, uint8 src2);
+int16 __ovld amd_bfe(int16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfe(uint src0, uint src1, uint src2);
+uint2 __ovld amd_bfe(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_bfe(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_bfe(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_bfe(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_bfe(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_bfm(uint src0, uint src1);
+uint2 __ovld amd_bfm(uint2 src0, uint2 src1);
+uint3 __ovld amd_bfm(uint3 src0, uint3 src1);
+uint4 __ovld amd_bfm(uint4 src0, uint4 src1);
+uint8 __ovld amd_bfm(uint8 src0, uint8 src1);
+uint16 __ovld amd_bfm(uint16 src0, uint16 src1);
+
+float __ovld amd_max3(float src0, float src1, float src2);
+float2 __ovld amd_max3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_max3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_max3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_max3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_max3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_max3(int src0, int src1, int src2);
+int2 __ovld amd_max3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_max3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_max3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_max3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_max3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_max3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_max3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_max3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_max3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_max3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_max3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_median3(float src0, float src1, float src2);
+float2 __ovld amd_median3(float2 src0, float2 src1, float2 src2);
+float3 __ovld amd_median3(float3 src0, float3 src1, float3 src2);
+float4 __ovld amd_median3(float4 src0, float4 src1, float4 src2);
+float8 __ovld amd_median3(float8 src0, float8 src1, float8 src2);
+float16 __ovld amd_median3(float16 src0, float16 src1, float16 src2);
+
+int __ovld amd_median3(int src0, int src1, int src2);
+int2 __ovld amd_median3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_median3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_median3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_median3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_median3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_median3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_median3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_median3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_median3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_median3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_median3(uint16 src0, uint16 src1, uint16 src2);
+
+float __ovld amd_min3(float src0, float src1, float src);
+float2 __ovld amd_min3(float2 src0, float2 src1, float2 src);
+float3 __ovld amd_min3(float3 src0, float3 src1, float3 src);
+float4 __ovld amd_min3(float4 src0, float4 src1, float4 src);
+float8 __ovld amd_min3(float8 src0, float8 src1, float8 src);
+float16 __ovld amd_min3(float16 src0, float16 src1, float16 src);
+
+int __ovld amd_min3(int src0, int src1, int src2);
+int2 __ovld amd_min3(int2 src0, int2 src1, int2 src2);
+int3 __ovld amd_min3(int3 src0, int3 src1, int3 src2);
+int4 __ovld amd_min3(int4 src0, int4 src1, int4 src2);
+int8 __ovld amd_min3(int8 src0, int8 src1, int8 src2);
+int16 __ovld amd_min3(int16 src0, int16 src1, int16 src2);
+
+uint __ovld amd_min3(uint src0, uint src1, uint src2);
+uint2 __ovld amd_min3(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_min3(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_min3(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_min3(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_min3(uint16 src0, uint16 src1, uint16 src2);
+
+ulong __ovld amd_mqsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_mqsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_mqsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_mqsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_mqsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_mqsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+ulong __ovld amd_qsad(ulong src0, uint src1, ulong src2);
+ulong2 __ovld amd_qsad(ulong2 src0, uint2 src1, ulong2 src2);
+ulong3 __ovld amd_qsad(ulong3 src0, uint3 src1, ulong3 src2);
+ulong4 __ovld amd_qsad(ulong4 src0, uint4 src1, ulong4 src2);
+ulong8 __ovld amd_qsad(ulong8 src0, uint8 src1, ulong8 src2);
+ulong16 __ovld amd_qsad(ulong16 src0, uint16 src1, ulong16 src2);
+
+uint __ovld amd_msad(uint src0, uint src1, uint src2);
+uint2 __ovld amd_msad(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_msad(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_msad(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_msad(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_msad(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadd(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadd(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadd(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadd(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadd(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadd(uint16 src0, uint16 src1, uint16 src2);
+
+uint __ovld amd_sadw(uint src0, uint src1, uint src2);
+uint2 __ovld amd_sadw(uint2 src0, uint2 src1, uint2 src2);
+uint3 __ovld amd_sadw(uint3 src0, uint3 src1, uint3 src2);
+uint4 __ovld amd_sadw(uint4 src0, uint4 src1, uint4 src2);
+uint8 __ovld amd_sadw(uint8 src0, uint8 src1, uint8 src2);
+uint16 __ovld amd_sadw(uint16 src0, uint16 src1, uint16 src2);
+#endif // cl_amd_media_ops2
+
+// Disable any extensions we may have enabled previously.
+#pragma OPENCL EXTENSION all : disable
+
+#undef __cnfn
+#undef __ovld
+#endif //_OPENCL_H_
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/pkuintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/pkuintrin.h
new file mode 100644
index 000000000..9e5459450
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/pkuintrin.h
@@ -0,0 +1,48 @@
+/*===------------- pkuintrin.h - PKU intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __PKUINTRIN_H
+#define __PKUINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rdpkru_u32(void)
+{
+  return __builtin_ia32_rdpkru();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_wrpkru(unsigned int __val)
+{
+  return __builtin_ia32_wrpkru(__val);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/pmmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/pmmintrin.h
new file mode 100644
index 000000000..d4f6487af
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/pmmintrin.h
@@ -0,0 +1,310 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+
+/// \brief Loads data from an unaligned memory location to elements in a 128-bit
+///    vector. If the address of the data is not 16-byte aligned, the
+///    instruction may read two adjacent aligned blocks of memory to retrieve
+///    the requested data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit integer vector containing integer values.
+/// \returns A 128-bit vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lddqu_si128(__m128i const *__p)
+{
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the right source operand.
+/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
+///    differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the lower
+///    bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the upper
+///    bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal
+///    differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
+///    vector of [4 x float] to float values stored in a 128-bit vector of
+///    [4 x float]. 
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehdup_ps(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
+}
+
+/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
+///    [4 x float] to float values stored in a 128-bit vector of [4 x float]. 
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] \n
+///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_moveldup_ps(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the alternating sums
+///    and differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Horizontally adds the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal
+///    differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Moves and duplicates one double-precision value to double-precision
+///    values stored in a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_loaddup_pd(double const * dp);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param dp
+///    A pointer to a double-precision value to be moved and duplicated.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
+#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
+
+/// \brief Moves and duplicates the double-precision value in the lower bits of
+///    a 128-bit vector of [2 x double] to double-precision values stored in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
+///    [127:64] and [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_movedup_pd(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+}
+
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+/// \brief Establishes a linear address memory range to be monitored and puts
+///    the processor in the monitor event pending state. Data stored in the
+///    monitored address range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
+///
+/// \param __p
+///    The memory range to be monitored. The size of the range is determined by
+///    CPUID function 0000_0005h.
+/// \param __extensions
+///    Optional extensions for the monitoring state.
+/// \param __hints
+///    Optional hints for the monitoring state.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+}
+
+/// \brief Used with the MONITOR instruction to wait while the processor is in
+///    the monitor event pending state. Data stored in the monitored address
+///    range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
+///
+/// \param __extensions
+///    Optional extensions for the monitoring state, which may vary by
+///    processor.
+/// \param __hints
+///    Optional hints for the monitoring state, which may vary by processor.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PMMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/popcntintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/popcntintrin.h
new file mode 100644
index 000000000..0b4793e58
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/popcntintrin.h
@@ -0,0 +1,98 @@
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _POPCNTINTRIN_H
+#define _POPCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    An unsigned 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_popcnt_u32(unsigned int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    A signed 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ int __DEFAULT_FN_ATTRS
+_popcnt32(int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    An unsigned 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_popcnt_u64(unsigned long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    A signed 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_popcnt64(long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _POPCNTINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/prfchwintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/prfchwintrin.h
new file mode 100644
index 000000000..ba0285751
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/prfchwintrin.h
@@ -0,0 +1,45 @@
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+#if defined(__PRFCHW__) || defined(__3dNOW__)
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetch(void *__P)
+{
+  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+#endif
+
+#endif /* __PRFCHWINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/rdseedintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/rdseedintrin.h
new file mode 100644
index 000000000..421f4ea48
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/rdseedintrin.h
@@ -0,0 +1,56 @@
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RDSEEDINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/rtmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/rtmintrin.h
new file mode 100644
index 000000000..e6a58d743
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/rtmintrin.h
@@ -0,0 +1,59 @@
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED   (~0u)
+#define _XABORT_EXPLICIT  (1 << 0)
+#define _XABORT_RETRY     (1 << 1)
+#define _XABORT_CONFLICT  (1 << 2)
+#define _XABORT_CAPACITY  (1 << 3)
+#define _XABORT_DEBUG     (1 << 4)
+#define _XABORT_NESTED    (1 << 5)
+#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_xbegin(void)
+{
+  return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xend(void)
+{
+  __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RTMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/s390intrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/s390intrin.h
new file mode 100644
index 000000000..d51274c07
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/s390intrin.h
@@ -0,0 +1,39 @@
+/*===---- s390intrin.h - SystemZ intrinsics --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __S390INTRIN_H
+#define __S390INTRIN_H
+
+#ifndef __s390__
+#error "<s390intrin.h> is for s390 only"
+#endif
+
+#ifdef __HTM__
+#include <htmintrin.h>
+#endif
+
+#ifdef __VEC__
+#include <vecintrin.h>
+#endif
+
+#endif /* __S390INTRIN_H*/
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/shaintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/shaintrin.h
new file mode 100644
index 000000000..9b5d21800
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/shaintrin.h
@@ -0,0 +1,75 @@
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
+  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __SHAINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/smmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/smmintrin.h
new file mode 100644
index 000000000..e48ab034f
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/smmintrin.h
@@ -0,0 +1,507 @@
+/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _SMMINTRIN_H
+#define _SMMINTRIN_H
+
+#include <tmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
+
+/* SSE4 Rounding macros. */
+#define _MM_FROUND_TO_NEAREST_INT    0x00
+#define _MM_FROUND_TO_NEG_INF        0x01
+#define _MM_FROUND_TO_POS_INF        0x02
+#define _MM_FROUND_TO_ZERO           0x03
+#define _MM_FROUND_CUR_DIRECTION     0x04
+
+#define _MM_FROUND_RAISE_EXC         0x00
+#define _MM_FROUND_NO_EXC            0x08
+
+#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
+#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
+#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
+#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
+#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
+
+#define _mm_round_ps(X, M) __extension__ ({ \
+  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
+
+#define _mm_round_ss(X, Y, M) __extension__ ({ \
+  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
+                                 (__v4sf)(__m128)(Y), (M)); })
+
+#define _mm_round_pd(X, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
+
+#define _mm_round_sd(X, Y, M) __extension__ ({ \
+  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
+                                  (__v2df)(__m128d)(Y), (M)); })
+
+/* SSE4 Packed Blending Intrinsics.  */
+#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
+  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
+                                   (__v2df)(__m128d)(V2), \
+                                   (((M) & 0x01) ? 2 : 0), \
+                                   (((M) & 0x02) ? 3 : 1)); })
+
+#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
+                                  (((M) & 0x01) ? 4 : 0), \
+                                  (((M) & 0x02) ? 5 : 1), \
+                                  (((M) & 0x04) ? 6 : 2), \
+                                  (((M) & 0x08) ? 7 : 3)); })
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
+                                            (__v2df)__M);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
+                                           (__v4sf)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
+                                               (__v16qi)__M);
+}
+
+#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
+  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
+                                   (__v8hi)(__m128i)(V2), \
+                                   (((M) & 0x01) ?  8 : 0), \
+                                   (((M) & 0x02) ?  9 : 1), \
+                                   (((M) & 0x04) ? 10 : 2), \
+                                   (((M) & 0x08) ? 11 : 3), \
+                                   (((M) & 0x10) ? 12 : 4), \
+                                   (((M) & 0x20) ? 13 : 5), \
+                                   (((M) & 0x40) ? 14 : 6), \
+                                   (((M) & 0x80) ? 15 : 7)); })
+
+/* SSE4 Dword Multiply Instructions.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Floating Point Dot Product Instructions.  */
+#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
+                               (__v4sf)(__m128)(Y), (M)); })
+
+#define _mm_dp_pd(X, Y, M) __extension__ ({\
+  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
+                                (__v2df)(__m128d)(Y), (M)); })
+
+/* SSE4 Streaming Load Hint Instruction.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_stream_load_si128 (__m128i const *__V)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
+}
+
+/* SSE4 Packed Integer Min/Max Instructions.  */
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi8 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu16 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
+}
+
+static __inline__  __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu32 (__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
+}
+
+/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+#define _mm_extract_ps(X, N) (__extension__                      \
+                              ({ union { int __i; float __f; } __t;  \
+                                 __v4sf __a = (__v4sf)(__m128)(X);       \
+                                 __t.__f = __a[(N) & 3];                 \
+                                 __t.__i;}))
+
+/* Miscellaneous insert and extract macros.  */
+/* Extract a single-precision float from X at index N into D.  */
+#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
+                                                    (D) = __a[N]; }))
+
+/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
+   an index suitable for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
+
+/* Extract a float from X at index N into the first index of the return.  */
+#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
+                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
+
+/* Insert int into packed integer array at index.  */
+#define _mm_insert_epi8(X, I, N) (__extension__                           \
+                                  ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                     __a[(N) & 15] = (I);                 \
+                                     (__m128i)__a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__                         \
+                                   ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                      __a[(N) & 3] = (I);                \
+                                      (__m128i)__a;}))
+#ifdef __x86_64__
+#define _mm_insert_epi64(X, I, N) (__extension__                         \
+                                   ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                      __a[(N) & 1] = (I);                \
+                                      (__m128i)__a;}))
+#endif /* __x86_64__ */
+
+/* Extract int from packed integer array at index.  This returns the element
+ * as a zero extended value, so it is unsigned.
+ */
+#define _mm_extract_epi8(X, N) (__extension__                           \
+                                ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+                                   (int)(unsigned char) __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__                         \
+                                 ({ __v4si __a = (__v4si)(__m128i)(X); \
+                                    (int)__a[(N) & 3];}))
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) (__extension__                         \
+                                 ({ __v2di __a = (__v2di)(__m128i)(X); \
+                                    (long long)__a[(N) & 1];}))
+#endif /* __x86_64 */
+
+/* SSE4 128-bit Packed Integer Comparisons.  */
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testz_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testnzc_si128(__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
+}
+
+#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+/* SSE4 64-bit Packed Integer Comparisons.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
+}
+
+/* SSE4 Packed Integer Sign-Extension.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi16(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi32(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi64(__m128i __V)
+{
+  /* This function always performs a signed extension, but __v16qi is a char
+     which may be signed or unsigned, so use __v16qs. */
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi32(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
+}
+
+/* SSE4 Packed Integer Zero-Extension.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi16(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi32(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi32(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu32_epi64(__m128i __V)
+{
+  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
+}
+
+/* SSE4 Pack with Unsigned Saturation.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi32(__m128i __V1, __m128i __V2)
+{
+  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+}
+
+/* SSE4 Multiple Packed Sums of Absolute Difference.  */
+#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
+                                      (__v16qi)(__m128i)(Y), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_minpos_epu16(__m128i __V)
+{
+  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+}
+
+/* Handle the sse4.2 definitions here. */
+
+/* These definitions are normally in nmmintrin.h, but gcc puts them in here
+   so we'll do the same.  */
+
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+
+/* These specify the type of data that we're comparing.  */
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_UWORD_OPS                 0x01
+#define _SIDD_SBYTE_OPS                 0x02
+#define _SIDD_SWORD_OPS                 0x03
+
+/* These specify the type of comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_EACH            0x08
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+
+/* These macros specify the polarity of the operation.  */
+#define _SIDD_POSITIVE_POLARITY         0x00
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_LEAST_SIGNIFICANT         0x00
+#define _SIDD_MOST_SIGNIFICANT          0x40
+
+/* These macros are used in _mm_cmpXstri() to specify the return.  */
+#define _SIDD_BIT_MASK                  0x00
+#define _SIDD_UNIT_MASK                 0x40
+
+/* SSE4.2 Packed Comparison Intrinsics.  */
+#define _mm_cmpistrm(A, B, M) \
+  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+                                       (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistri(A, B, M) \
+  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+                                   (__v16qi)(__m128i)(B), (int)(M))
+
+#define _mm_cmpestrm(A, LA, B, LB, M) \
+  (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+                                       (__v16qi)(__m128i)(B), (int)(LB), \
+                                       (int)(M))
+#define _mm_cmpestri(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+                                   (__v16qi)(__m128i)(B), (int)(LB), \
+                                   (int)(M))
+
+/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+#define _mm_cmpistra(A, B, M) \
+  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrc(A, B, M) \
+  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistro(A, B, M) \
+  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrs(A, B, M) \
+  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistrz(A, B, M) \
+  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+                                    (__v16qi)(__m128i)(B), (int)(M))
+
+#define _mm_cmpestra(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrc(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestro(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrs(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+#define _mm_cmpestrz(A, LA, B, LB, M) \
+  (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M))
+
+/* SSE4.2 Compare Packed Data -- Greater Than.  */
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
+{
+  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
+}
+
+/* SSE4.2 Accumulate CRC32.  */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u8(unsigned int __C, unsigned char __D)
+{
+  return __builtin_ia32_crc32qi(__C, __D);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u16(unsigned int __C, unsigned short __D)
+{
+  return __builtin_ia32_crc32hi(__C, __D);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u32(unsigned int __C, unsigned int __D)
+{
+  return __builtin_ia32_crc32si(__C, __D);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
+{
+  return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#ifdef __POPCNT__
+#include <popcntintrin.h>
+#endif
+
+#endif /* _SMMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdalign.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdalign.h
new file mode 100644
index 000000000..3738d1284
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdalign.h
@@ -0,0 +1,35 @@
+/*===---- stdalign.h - Standard header for alignment ------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDALIGN_H
+#define __STDALIGN_H
+
+#ifndef __cplusplus
+#define alignas _Alignas
+#define alignof _Alignof
+#endif
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif /* __STDALIGN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdarg.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdarg.h
new file mode 100644
index 000000000..a57e18364
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdarg.h
@@ -0,0 +1,52 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+/* Hack required to make standard headers work, at least on Ubuntu */
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+#endif
+typedef __builtin_va_list __gnuc_va_list;
+
+#endif /* __STDARG_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdatomic.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdatomic.h
new file mode 100644
index 000000000..23bb3a357
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdatomic.h
@@ -0,0 +1,190 @@
+/*===---- stdatomic.h - Standard header for atomic types and operations -----===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_STDATOMIC_H
+#define __CLANG_STDATOMIC_H
+
+/* If we're hosted, fall back to the system's stdatomic.h. FreeBSD, for
+ * example, already has a Clang-compatible stdatomic.h header.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>)
+# include_next <stdatomic.h>
+#else
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* 7.17.1 Introduction */
+
+#define ATOMIC_BOOL_LOCK_FREE       __GCC_ATOMIC_BOOL_LOCK_FREE
+#define ATOMIC_CHAR_LOCK_FREE       __GCC_ATOMIC_CHAR_LOCK_FREE
+#define ATOMIC_CHAR16_T_LOCK_FREE   __GCC_ATOMIC_CHAR16_T_LOCK_FREE
+#define ATOMIC_CHAR32_T_LOCK_FREE   __GCC_ATOMIC_CHAR32_T_LOCK_FREE
+#define ATOMIC_WCHAR_T_LOCK_FREE    __GCC_ATOMIC_WCHAR_T_LOCK_FREE
+#define ATOMIC_SHORT_LOCK_FREE      __GCC_ATOMIC_SHORT_LOCK_FREE
+#define ATOMIC_INT_LOCK_FREE        __GCC_ATOMIC_INT_LOCK_FREE
+#define ATOMIC_LONG_LOCK_FREE       __GCC_ATOMIC_LONG_LOCK_FREE
+#define ATOMIC_LLONG_LOCK_FREE      __GCC_ATOMIC_LLONG_LOCK_FREE
+#define ATOMIC_POINTER_LOCK_FREE    __GCC_ATOMIC_POINTER_LOCK_FREE
+
+/* 7.17.2 Initialization */
+
+#define ATOMIC_VAR_INIT(value) (value)
+#define atomic_init __c11_atomic_init
+
+/* 7.17.3 Order and consistency */
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_consume = __ATOMIC_CONSUME,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+#define kill_dependency(y) (y)
+
+/* 7.17.4 Fences */
+
+/* These should be provided by the libc implementation. */
+void atomic_thread_fence(memory_order);
+void atomic_signal_fence(memory_order);
+
+#define atomic_thread_fence(order) __c11_atomic_thread_fence(order)
+#define atomic_signal_fence(order) __c11_atomic_signal_fence(order)
+
+/* 7.17.5 Lock-free property */
+
+#define atomic_is_lock_free(obj) __c11_atomic_is_lock_free(sizeof(*(obj)))
+
+/* 7.17.6 Atomic integer types */
+
+#ifdef __cplusplus
+typedef _Atomic(bool)               atomic_bool;
+#else
+typedef _Atomic(_Bool)              atomic_bool;
+#endif
+typedef _Atomic(char)               atomic_char;
+typedef _Atomic(signed char)        atomic_schar;
+typedef _Atomic(unsigned char)      atomic_uchar;
+typedef _Atomic(short)              atomic_short;
+typedef _Atomic(unsigned short)     atomic_ushort;
+typedef _Atomic(int)                atomic_int;
+typedef _Atomic(unsigned int)       atomic_uint;
+typedef _Atomic(long)               atomic_long;
+typedef _Atomic(unsigned long)      atomic_ulong;
+typedef _Atomic(long long)          atomic_llong;
+typedef _Atomic(unsigned long long) atomic_ullong;
+typedef _Atomic(uint_least16_t)     atomic_char16_t;
+typedef _Atomic(uint_least32_t)     atomic_char32_t;
+typedef _Atomic(wchar_t)            atomic_wchar_t;
+typedef _Atomic(int_least8_t)       atomic_int_least8_t;
+typedef _Atomic(uint_least8_t)      atomic_uint_least8_t;
+typedef _Atomic(int_least16_t)      atomic_int_least16_t;
+typedef _Atomic(uint_least16_t)     atomic_uint_least16_t;
+typedef _Atomic(int_least32_t)      atomic_int_least32_t;
+typedef _Atomic(uint_least32_t)     atomic_uint_least32_t;
+typedef _Atomic(int_least64_t)      atomic_int_least64_t;
+typedef _Atomic(uint_least64_t)     atomic_uint_least64_t;
+typedef _Atomic(int_fast8_t)        atomic_int_fast8_t;
+typedef _Atomic(uint_fast8_t)       atomic_uint_fast8_t;
+typedef _Atomic(int_fast16_t)       atomic_int_fast16_t;
+typedef _Atomic(uint_fast16_t)      atomic_uint_fast16_t;
+typedef _Atomic(int_fast32_t)       atomic_int_fast32_t;
+typedef _Atomic(uint_fast32_t)      atomic_uint_fast32_t;
+typedef _Atomic(int_fast64_t)       atomic_int_fast64_t;
+typedef _Atomic(uint_fast64_t)      atomic_uint_fast64_t;
+typedef _Atomic(intptr_t)           atomic_intptr_t;
+typedef _Atomic(uintptr_t)          atomic_uintptr_t;
+typedef _Atomic(size_t)             atomic_size_t;
+typedef _Atomic(ptrdiff_t)          atomic_ptrdiff_t;
+typedef _Atomic(intmax_t)           atomic_intmax_t;
+typedef _Atomic(uintmax_t)          atomic_uintmax_t;
+
+/* 7.17.7 Operations on atomic types */
+
+#define atomic_store(object, desired) __c11_atomic_store(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_store_explicit __c11_atomic_store
+
+#define atomic_load(object) __c11_atomic_load(object, __ATOMIC_SEQ_CST)
+#define atomic_load_explicit __c11_atomic_load
+
+#define atomic_exchange(object, desired) __c11_atomic_exchange(object, desired, __ATOMIC_SEQ_CST)
+#define atomic_exchange_explicit __c11_atomic_exchange
+
+#define atomic_compare_exchange_strong(object, expected, desired) __c11_atomic_compare_exchange_strong(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_strong_explicit __c11_atomic_compare_exchange_strong
+
+#define atomic_compare_exchange_weak(object, expected, desired) __c11_atomic_compare_exchange_weak(object, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define atomic_compare_exchange_weak_explicit __c11_atomic_compare_exchange_weak
+
+#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_add_explicit __c11_atomic_fetch_add
+
+#define atomic_fetch_sub(object, operand) __c11_atomic_fetch_sub(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub_explicit __c11_atomic_fetch_sub
+
+#define atomic_fetch_or(object, operand) __c11_atomic_fetch_or(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_or_explicit __c11_atomic_fetch_or
+
+#define atomic_fetch_xor(object, operand) __c11_atomic_fetch_xor(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_xor_explicit __c11_atomic_fetch_xor
+
+#define atomic_fetch_and(object, operand) __c11_atomic_fetch_and(object, operand, __ATOMIC_SEQ_CST)
+#define atomic_fetch_and_explicit __c11_atomic_fetch_and
+
+/* 7.17.8 Atomic flag type and operations */
+
+typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
+
+#define ATOMIC_FLAG_INIT { 0 }
+
+/* These should be provided by the libc implementation. */
+#ifdef __cplusplus
+bool atomic_flag_test_and_set(volatile atomic_flag *);
+bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#else
+_Bool atomic_flag_test_and_set(volatile atomic_flag *);
+_Bool atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#endif
+void atomic_flag_clear(volatile atomic_flag *);
+void atomic_flag_clear_explicit(volatile atomic_flag *, memory_order);
+
+#define atomic_flag_test_and_set(object) __c11_atomic_exchange(&(object)->_Value, 1, __ATOMIC_SEQ_CST)
+#define atomic_flag_test_and_set_explicit(object, order) __c11_atomic_exchange(&(object)->_Value, 1, order)
+
+#define atomic_flag_clear(object) __c11_atomic_store(&(object)->_Value, 0, __ATOMIC_SEQ_CST)
+#define atomic_flag_clear_explicit(object, order) __c11_atomic_store(&(object)->_Value, 0, order)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDATOMIC_H */
+
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdbool.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdbool.h
new file mode 100644
index 000000000..0467893f3
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdbool.h
@@ -0,0 +1,44 @@
+/*===---- stdbool.h - Standard header for booleans -------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDBOOL_H
+#define __STDBOOL_H
+
+/* Don't define bool, true, and false in C++, except as a GNU extension. */
+#ifndef __cplusplus
+#define bool _Bool
+#define true 1
+#define false 0
+#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Define _Bool, bool, false, true as a GNU extension. */
+#define _Bool bool
+#define bool  bool
+#define false false
+#define true  true
+#endif
+
+#define __bool_true_false_are_defined 1
+
+#endif /* __STDBOOL_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stddef.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stddef.h
new file mode 100644
index 000000000..735499671
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stddef.h
@@ -0,0 +1,137 @@
+/*===---- stddef.h - Basic type definitions --------------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__STDDEF_H) || defined(__need_ptrdiff_t) ||                       \
+    defined(__need_size_t) || defined(__need_wchar_t) ||                       \
+    defined(__need_NULL) || defined(__need_wint_t)
+
+#if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
+    !defined(__need_wchar_t) && !defined(__need_NULL) &&                       \
+    !defined(__need_wint_t)
+/* Always define miscellaneous pieces when modules are available. */
+#if !__has_feature(modules)
+#define __STDDEF_H
+#endif
+#define __need_ptrdiff_t
+#define __need_size_t
+#define __need_wchar_t
+#define __need_NULL
+#define __need_STDDEF_H_misc
+/* __need_wint_t is intentionally not defined here. */
+#endif
+
+#if defined(__need_ptrdiff_t)
+#if !defined(_PTRDIFF_T) || __has_feature(modules)
+/* Always define ptrdiff_t when modules are available. */
+#if !__has_feature(modules)
+#define _PTRDIFF_T
+#endif
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+#endif
+#undef __need_ptrdiff_t
+#endif /* defined(__need_ptrdiff_t) */
+
+#if defined(__need_size_t)
+#if !defined(_SIZE_T) || __has_feature(modules)
+/* Always define size_t when modules are available. */
+#if !__has_feature(modules)
+#define _SIZE_T
+#endif
+typedef __SIZE_TYPE__ size_t;
+#endif
+#undef __need_size_t
+#endif /*defined(__need_size_t) */
+
+#if defined(__need_STDDEF_H_misc)
+/* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
+ * enabled. */
+#if (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 && \
+     !defined(_RSIZE_T)) || __has_feature(modules)
+/* Always define rsize_t when modules are available. */
+#if !__has_feature(modules)
+#define _RSIZE_T
+#endif
+typedef __SIZE_TYPE__ rsize_t;
+#endif
+#endif /* defined(__need_STDDEF_H_misc) */
+
+#if defined(__need_wchar_t)
+#ifndef __cplusplus
+/* Always define wchar_t when modules are available. */
+#if !defined(_WCHAR_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WCHAR_T
+#if defined(_MSC_EXTENSIONS)
+#define _WCHAR_T_DEFINED
+#endif
+#endif
+typedef __WCHAR_TYPE__ wchar_t;
+#endif
+#endif
+#undef __need_wchar_t
+#endif /* defined(__need_wchar_t) */
+
+#if defined(__need_NULL)
+#undef NULL
+#ifdef __cplusplus
+#  if !defined(__MINGW32__) && !defined(_MSC_VER)
+#    define NULL __null
+#  else
+#    define NULL 0
+#  endif
+#else
+#  define NULL ((void*)0)
+#endif
+#ifdef __cplusplus
+#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
+namespace std { typedef decltype(nullptr) nullptr_t; }
+using ::std::nullptr_t;
+#endif
+#endif
+#undef __need_NULL
+#endif /* defined(__need_NULL) */
+
+#if defined(__need_STDDEF_H_misc)
+#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
+#include "__stddef_max_align_t.h"
+#endif
+#define offsetof(t, d) __builtin_offsetof(t, d)
+#undef __need_STDDEF_H_misc
+#endif  /* defined(__need_STDDEF_H_misc) */
+
+/* Some C libraries expect to see a wint_t here. Others (notably MinGW) will use
+__WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
+#if defined(__need_wint_t)
+/* Always define wint_t when modules are available. */
+#if !defined(_WINT_T) || __has_feature(modules)
+#if !__has_feature(modules)
+#define _WINT_T
+#endif
+typedef __WINT_TYPE__ wint_t;
+#endif
+#undef __need_wint_t
+#endif /* __need_wint_t */
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdint.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdint.h
new file mode 100644
index 000000000..3f2fcbc57
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdint.h
@@ -0,0 +1,707 @@
+/*===---- stdint.h - Standard header for sized integer types --------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_STDINT_H
+#define __CLANG_STDINT_H
+
+/* If we're hosted, fall back to the system's stdint.h, which might have
+ * additional definitions.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdint.h>)
+
+// C99 7.18.3 Limits of other integer types
+//
+//  Footnote 219, 220: C++ implementations should define these macros only when
+//  __STDC_LIMIT_MACROS is defined before <stdint.h> is included.
+//
+//  Footnote 222: C++ implementations should define these macros only when
+//  __STDC_CONSTANT_MACROS is defined before <stdint.h> is included.
+//
+// C++11 [cstdint.syn]p2:
+//
+//  The macros defined by <cstdint> are provided unconditionally. In particular,
+//  the symbols __STDC_LIMIT_MACROS and __STDC_CONSTANT_MACROS (mentioned in
+//  footnotes 219, 220, and 222 in the C standard) play no role in C++.
+//
+// C11 removed the problematic footnotes.
+//
+// Work around this inconsistency by always defining those macros in C++ mode,
+// so that a C library implementation which follows the C99 standard can be
+// used in C++.
+# ifdef __cplusplus
+#  if !defined(__STDC_LIMIT_MACROS)
+#   define __STDC_LIMIT_MACROS
+#   define __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  endif
+#  if !defined(__STDC_CONSTANT_MACROS)
+#   define __STDC_CONSTANT_MACROS
+#   define __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  endif
+# endif
+
+# include_next <stdint.h>
+
+# ifdef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_LIMIT_MACROS
+#  undef __STDC_LIMIT_MACROS_DEFINED_BY_CLANG
+# endif
+# ifdef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+#  undef __STDC_CONSTANT_MACROS
+#  undef __STDC_CONSTANT_MACROS_DEFINED_BY_CLANG
+# endif
+
+#else
+
+/* C99 7.18.1.1 Exact-width integer types.
+ * C99 7.18.1.2 Minimum-width integer types.
+ * C99 7.18.1.3 Fastest minimum-width integer types.
+ *
+ * The standard requires that exact-width type be defined for 8-, 16-, 32-, and
+ * 64-bit types if they are implemented. Other exact width types are optional.
+ * This implementation defines an exact-width types for every integer width
+ * that is represented in the standard integer types.
+ *
+ * The standard also requires minimum-width types be defined for 8-, 16-, 32-,
+ * and 64-bit widths regardless of whether there are corresponding exact-width
+ * types.
+ *
+ * To accommodate targets that are missing types that are exactly 8, 16, 32, or
+ * 64 bits wide, this implementation takes an approach of cascading
+ * redefintions, redefining __int_leastN_t to successively smaller exact-width
+ * types. It is therefore important that the types are defined in order of
+ * descending widths.
+ *
+ * We currently assume that the minimum-width types and the fastest
+ * minimum-width types are the same. This is allowed by the standard, but is
+ * suboptimal.
+ *
+ * In violation of the standard, some targets do not implement a type that is
+ * wide enough to represent all of the required widths (8-, 16-, 32-, 64-bit).
+ * To accommodate these targets, a required minimum-width type is only
+ * defined if there exists an exact-width type of equal or greater width.
+ */
+
+#ifdef __INT64_TYPE__
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int64_t*/
+typedef __INT64_TYPE__ int64_t;
+# endif /* __int8_t_defined */
+typedef __UINT64_TYPE__ uint64_t;
+# define __int_least64_t int64_t
+# define __uint_least64_t uint64_t
+# define __int_least32_t int64_t
+# define __uint_least32_t uint64_t
+# define __int_least16_t int64_t
+# define __uint_least16_t uint64_t
+# define __int_least8_t int64_t
+# define __uint_least8_t uint64_t
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+typedef __int_least64_t int_least64_t;
+typedef __uint_least64_t uint_least64_t;
+typedef __int_least64_t int_fast64_t;
+typedef __uint_least64_t uint_fast64_t;
+#endif /* __int_least64_t */
+
+#ifdef __INT56_TYPE__
+typedef __INT56_TYPE__ int56_t;
+typedef __UINT56_TYPE__ uint56_t;
+typedef int56_t int_least56_t;
+typedef uint56_t uint_least56_t;
+typedef int56_t int_fast56_t;
+typedef uint56_t uint_fast56_t;
+# define __int_least32_t int56_t
+# define __uint_least32_t uint56_t
+# define __int_least16_t int56_t
+# define __uint_least16_t uint56_t
+# define __int_least8_t int56_t
+# define __uint_least8_t uint56_t
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+typedef __INT48_TYPE__ int48_t;
+typedef __UINT48_TYPE__ uint48_t;
+typedef int48_t int_least48_t;
+typedef uint48_t uint_least48_t;
+typedef int48_t int_fast48_t;
+typedef uint48_t uint_fast48_t;
+# define __int_least32_t int48_t
+# define __uint_least32_t uint48_t
+# define __int_least16_t int48_t
+# define __uint_least16_t uint48_t
+# define __int_least8_t int48_t
+# define __uint_least8_t uint48_t
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+typedef __INT40_TYPE__ int40_t;
+typedef __UINT40_TYPE__ uint40_t;
+typedef int40_t int_least40_t;
+typedef uint40_t uint_least40_t;
+typedef int40_t int_fast40_t;
+typedef uint40_t uint_fast40_t;
+# define __int_least32_t int40_t
+# define __uint_least32_t uint40_t
+# define __int_least16_t int40_t
+# define __uint_least16_t uint40_t
+# define __int_least8_t int40_t
+# define __uint_least8_t uint40_t
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+
+# ifndef __int8_t_defined /* glibc sys/types.h also defines int32_t*/
+typedef __INT32_TYPE__ int32_t;
+# endif /* __int8_t_defined */
+
+# ifndef __uint32_t_defined  /* more glibc compatibility */
+# define __uint32_t_defined
+typedef __UINT32_TYPE__ uint32_t;
+# endif /* __uint32_t_defined */
+
+# define __int_least32_t int32_t
+# define __uint_least32_t uint32_t
+# define __int_least16_t int32_t
+# define __uint_least16_t uint32_t
+# define __int_least8_t int32_t
+# define __uint_least8_t uint32_t
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+typedef __int_least32_t int_least32_t;
+typedef __uint_least32_t uint_least32_t;
+typedef __int_least32_t int_fast32_t;
+typedef __uint_least32_t uint_fast32_t;
+#endif /* __int_least32_t */
+
+#ifdef __INT24_TYPE__
+typedef __INT24_TYPE__ int24_t;
+typedef __UINT24_TYPE__ uint24_t;
+typedef int24_t int_least24_t;
+typedef uint24_t uint_least24_t;
+typedef int24_t int_fast24_t;
+typedef uint24_t uint_fast24_t;
+# define __int_least16_t int24_t
+# define __uint_least16_t uint24_t
+# define __int_least8_t int24_t
+# define __uint_least8_t uint24_t
+#endif /* __INT24_TYPE__ */
+
+#ifdef __INT16_TYPE__
+#ifndef __int8_t_defined /* glibc sys/types.h also defines int16_t*/
+typedef __INT16_TYPE__ int16_t;
+#endif /* __int8_t_defined */
+typedef __UINT16_TYPE__ uint16_t;
+# define __int_least16_t int16_t
+# define __uint_least16_t uint16_t
+# define __int_least8_t int16_t
+# define __uint_least8_t uint16_t
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+typedef __int_least16_t int_least16_t;
+typedef __uint_least16_t uint_least16_t;
+typedef __int_least16_t int_fast16_t;
+typedef __uint_least16_t uint_fast16_t;
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+#ifndef __int8_t_defined  /* glibc sys/types.h also defines int8_t*/
+typedef __INT8_TYPE__ int8_t;
+#endif /* __int8_t_defined */
+typedef __UINT8_TYPE__ uint8_t;
+# define __int_least8_t int8_t
+# define __uint_least8_t uint8_t
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+typedef __int_least8_t int_least8_t;
+typedef __uint_least8_t uint_least8_t;
+typedef __int_least8_t int_fast8_t;
+typedef __uint_least8_t uint_fast8_t;
+#endif /* __int_least8_t */
+
+/* prevent glibc sys/types.h from defining conflicting types */
+#ifndef __int8_t_defined
+# define __int8_t_defined
+#endif /* __int8_t_defined */
+
+/* C99 7.18.1.4 Integer types capable of holding object pointers.
+ */
+#define __stdint_join3(a,b,c) a ## b ## c
+
+#define  __intn_t(n) __stdint_join3( int, n, _t)
+#define __uintn_t(n) __stdint_join3(uint, n, _t)
+
+#ifndef _INTPTR_T
+#ifndef __intptr_t_defined
+typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+#define __intptr_t_defined
+#define _INTPTR_T
+#endif
+#endif
+
+#ifndef _UINTPTR_T
+typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+#define _UINTPTR_T
+#endif
+
+/* C99 7.18.1.5 Greatest-width integer types.
+ */
+typedef __INTMAX_TYPE__  intmax_t;
+typedef __UINTMAX_TYPE__ uintmax_t;
+
+/* C99 7.18.4 Macros for minimum-width integer constants.
+ *
+ * The standard requires that integer constant macros be defined for all the
+ * minimum-width types defined above. As 8-, 16-, 32-, and 64-bit minimum-width
+ * types are required, the corresponding integer constant macros are defined
+ * here. This implementation also defines minimum-width types for every other
+ * integer width that the target implements, so corresponding macros are
+ * defined below, too.
+ *
+ * These macros are defined using the same successive-shrinking approach as
+ * the type definitions above. It is likewise important that macros are defined
+ * in order of decending width.
+ *
+ * Note that C++ should not check __STDC_CONSTANT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#define __int_c_join(a, b) a ## b
+#define __int_c(v, suffix) __int_c_join(v, suffix)
+#define __uint_c(v, suffix) __int_c_join(v##U, suffix)
+
+
+#ifdef __INT64_TYPE__
+# ifdef __INT64_C_SUFFIX__
+#  define __int64_c_suffix __INT64_C_SUFFIX__
+#  define __int32_c_suffix __INT64_C_SUFFIX__
+#  define __int16_c_suffix __INT64_C_SUFFIX__
+#  define  __int8_c_suffix __INT64_C_SUFFIX__
+# else
+#  undef __int64_c_suffix
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT64_C_SUFFIX__ */
+#endif /* __INT64_TYPE__ */
+
+#ifdef __int_least64_t
+# ifdef __int64_c_suffix
+#  define INT64_C(v) __int_c(v, __int64_c_suffix)
+#  define UINT64_C(v) __uint_c(v, __int64_c_suffix)
+# else
+#  define INT64_C(v) v
+#  define UINT64_C(v) v ## U
+# endif /* __int64_c_suffix */
+#endif /* __int_least64_t */
+
+
+#ifdef __INT56_TYPE__
+# ifdef __INT56_C_SUFFIX__
+#  define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
+#  define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
+#  define __int32_c_suffix __INT56_C_SUFFIX__
+#  define __int16_c_suffix __INT56_C_SUFFIX__
+#  define __int8_c_suffix  __INT56_C_SUFFIX__
+# else
+#  define INT56_C(v) v
+#  define UINT56_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT56_C_SUFFIX__ */
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# ifdef __INT48_C_SUFFIX__
+#  define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
+#  define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
+#  define __int32_c_suffix __INT48_C_SUFFIX__
+#  define __int16_c_suffix __INT48_C_SUFFIX__
+#  define __int8_c_suffix  __INT48_C_SUFFIX__
+# else
+#  define INT48_C(v) v
+#  define UINT48_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT48_C_SUFFIX__ */
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# ifdef __INT40_C_SUFFIX__
+#  define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
+#  define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
+#  define __int32_c_suffix __INT40_C_SUFFIX__
+#  define __int16_c_suffix __INT40_C_SUFFIX__
+#  define __int8_c_suffix  __INT40_C_SUFFIX__
+# else
+#  define INT40_C(v) v
+#  define UINT40_C(v) v ## U
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT40_C_SUFFIX__ */
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# ifdef __INT32_C_SUFFIX__
+#  define __int32_c_suffix __INT32_C_SUFFIX__
+#  define __int16_c_suffix __INT32_C_SUFFIX__
+#  define __int8_c_suffix  __INT32_C_SUFFIX__
+#else
+#  undef __int32_c_suffix
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT32_C_SUFFIX__ */
+#endif /* __INT32_TYPE__ */
+
+#ifdef __int_least32_t
+# ifdef __int32_c_suffix
+#  define INT32_C(v) __int_c(v, __int32_c_suffix)
+#  define UINT32_C(v) __uint_c(v, __int32_c_suffix)
+# else
+#  define INT32_C(v) v
+#  define UINT32_C(v) v ## U
+# endif /* __int32_c_suffix */
+#endif /* __int_least32_t */
+
+
+#ifdef __INT24_TYPE__
+# ifdef __INT24_C_SUFFIX__
+#  define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
+#  define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
+#  define __int16_c_suffix __INT24_C_SUFFIX__
+#  define __int8_c_suffix  __INT24_C_SUFFIX__
+# else
+#  define INT24_C(v) v
+#  define UINT24_C(v) v ## U
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT24_C_SUFFIX__ */
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+# ifdef __INT16_C_SUFFIX__
+#  define __int16_c_suffix __INT16_C_SUFFIX__
+#  define __int8_c_suffix  __INT16_C_SUFFIX__
+#else
+#  undef __int16_c_suffix
+#  undef  __int8_c_suffix
+# endif /* __INT16_C_SUFFIX__ */
+#endif /* __INT16_TYPE__ */
+
+#ifdef __int_least16_t
+# ifdef __int16_c_suffix
+#  define INT16_C(v) __int_c(v, __int16_c_suffix)
+#  define UINT16_C(v) __uint_c(v, __int16_c_suffix)
+# else
+#  define INT16_C(v) v
+#  define UINT16_C(v) v ## U
+# endif /* __int16_c_suffix */
+#endif /* __int_least16_t */
+
+
+#ifdef __INT8_TYPE__
+# ifdef __INT8_C_SUFFIX__
+#  define __int8_c_suffix __INT8_C_SUFFIX__
+#else
+#  undef  __int8_c_suffix
+# endif /* __INT8_C_SUFFIX__ */
+#endif /* __INT8_TYPE__ */
+
+#ifdef __int_least8_t
+# ifdef __int8_c_suffix
+#  define INT8_C(v) __int_c(v, __int8_c_suffix)
+#  define UINT8_C(v) __uint_c(v, __int8_c_suffix)
+# else
+#  define INT8_C(v) v
+#  define UINT8_C(v) v ## U
+# endif /* __int8_c_suffix */
+#endif /* __int_least8_t */
+
+
+/* C99 7.18.2.1 Limits of exact-width integer types.
+ * C99 7.18.2.2 Limits of minimum-width integer types.
+ * C99 7.18.2.3 Limits of fastest minimum-width integer types.
+ *
+ * The presence of limit macros are completely optional in C99.  This
+ * implementation defines limits for all of the types (exact- and
+ * minimum-width) that it defines above, using the limits of the minimum-width
+ * type for any types that do not have exact-width representations.
+ *
+ * As in the type definitions, this section takes an approach of
+ * successive-shrinking to determine which limits to use for the standard (8,
+ * 16, 32, 64) bit widths when they don't have exact representations. It is
+ * therefore important that the defintions be kept in order of decending
+ * widths.
+ *
+ * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
+ * claims of the C standard (see C++ 18.3.1p2, [cstdint.syn]).
+ */
+
+#ifdef __INT64_TYPE__
+# define INT64_MAX           INT64_C( 9223372036854775807)
+# define INT64_MIN         (-INT64_C( 9223372036854775807)-1)
+# define UINT64_MAX         UINT64_C(18446744073709551615)
+# define __INT_LEAST64_MIN   INT64_MIN
+# define __INT_LEAST64_MAX   INT64_MAX
+# define __UINT_LEAST64_MAX UINT64_MAX
+# define __INT_LEAST32_MIN   INT64_MIN
+# define __INT_LEAST32_MAX   INT64_MAX
+# define __UINT_LEAST32_MAX UINT64_MAX
+# define __INT_LEAST16_MIN   INT64_MIN
+# define __INT_LEAST16_MAX   INT64_MAX
+# define __UINT_LEAST16_MAX UINT64_MAX
+# define __INT_LEAST8_MIN    INT64_MIN
+# define __INT_LEAST8_MAX    INT64_MAX
+# define __UINT_LEAST8_MAX  UINT64_MAX
+#endif /* __INT64_TYPE__ */
+
+#ifdef __INT_LEAST64_MIN
+# define INT_LEAST64_MIN   __INT_LEAST64_MIN
+# define INT_LEAST64_MAX   __INT_LEAST64_MAX
+# define UINT_LEAST64_MAX __UINT_LEAST64_MAX
+# define INT_FAST64_MIN    __INT_LEAST64_MIN
+# define INT_FAST64_MAX    __INT_LEAST64_MAX
+# define UINT_FAST64_MAX  __UINT_LEAST64_MAX
+#endif /* __INT_LEAST64_MIN */
+
+
+#ifdef __INT56_TYPE__
+# define INT56_MAX           INT56_C(36028797018963967)
+# define INT56_MIN         (-INT56_C(36028797018963967)-1)
+# define UINT56_MAX         UINT56_C(72057594037927935)
+# define INT_LEAST56_MIN     INT56_MIN
+# define INT_LEAST56_MAX     INT56_MAX
+# define UINT_LEAST56_MAX   UINT56_MAX
+# define INT_FAST56_MIN      INT56_MIN
+# define INT_FAST56_MAX      INT56_MAX
+# define UINT_FAST56_MAX    UINT56_MAX
+# define __INT_LEAST32_MIN   INT56_MIN
+# define __INT_LEAST32_MAX   INT56_MAX
+# define __UINT_LEAST32_MAX UINT56_MAX
+# define __INT_LEAST16_MIN   INT56_MIN
+# define __INT_LEAST16_MAX   INT56_MAX
+# define __UINT_LEAST16_MAX UINT56_MAX
+# define __INT_LEAST8_MIN    INT56_MIN
+# define __INT_LEAST8_MAX    INT56_MAX
+# define __UINT_LEAST8_MAX  UINT56_MAX
+#endif /* __INT56_TYPE__ */
+
+
+#ifdef __INT48_TYPE__
+# define INT48_MAX           INT48_C(140737488355327)
+# define INT48_MIN         (-INT48_C(140737488355327)-1)
+# define UINT48_MAX         UINT48_C(281474976710655)
+# define INT_LEAST48_MIN     INT48_MIN
+# define INT_LEAST48_MAX     INT48_MAX
+# define UINT_LEAST48_MAX   UINT48_MAX
+# define INT_FAST48_MIN      INT48_MIN
+# define INT_FAST48_MAX      INT48_MAX
+# define UINT_FAST48_MAX    UINT48_MAX
+# define __INT_LEAST32_MIN   INT48_MIN
+# define __INT_LEAST32_MAX   INT48_MAX
+# define __UINT_LEAST32_MAX UINT48_MAX
+# define __INT_LEAST16_MIN   INT48_MIN
+# define __INT_LEAST16_MAX   INT48_MAX
+# define __UINT_LEAST16_MAX UINT48_MAX
+# define __INT_LEAST8_MIN    INT48_MIN
+# define __INT_LEAST8_MAX    INT48_MAX
+# define __UINT_LEAST8_MAX  UINT48_MAX
+#endif /* __INT48_TYPE__ */
+
+
+#ifdef __INT40_TYPE__
+# define INT40_MAX           INT40_C(549755813887)
+# define INT40_MIN         (-INT40_C(549755813887)-1)
+# define UINT40_MAX         UINT40_C(1099511627775)
+# define INT_LEAST40_MIN     INT40_MIN
+# define INT_LEAST40_MAX     INT40_MAX
+# define UINT_LEAST40_MAX   UINT40_MAX
+# define INT_FAST40_MIN      INT40_MIN
+# define INT_FAST40_MAX      INT40_MAX
+# define UINT_FAST40_MAX    UINT40_MAX
+# define __INT_LEAST32_MIN   INT40_MIN
+# define __INT_LEAST32_MAX   INT40_MAX
+# define __UINT_LEAST32_MAX UINT40_MAX
+# define __INT_LEAST16_MIN   INT40_MIN
+# define __INT_LEAST16_MAX   INT40_MAX
+# define __UINT_LEAST16_MAX UINT40_MAX
+# define __INT_LEAST8_MIN    INT40_MIN
+# define __INT_LEAST8_MAX    INT40_MAX
+# define __UINT_LEAST8_MAX  UINT40_MAX
+#endif /* __INT40_TYPE__ */
+
+
+#ifdef __INT32_TYPE__
+# define INT32_MAX           INT32_C(2147483647)
+# define INT32_MIN         (-INT32_C(2147483647)-1)
+# define UINT32_MAX         UINT32_C(4294967295)
+# define __INT_LEAST32_MIN   INT32_MIN
+# define __INT_LEAST32_MAX   INT32_MAX
+# define __UINT_LEAST32_MAX UINT32_MAX
+# define __INT_LEAST16_MIN   INT32_MIN
+# define __INT_LEAST16_MAX   INT32_MAX
+# define __UINT_LEAST16_MAX UINT32_MAX
+# define __INT_LEAST8_MIN    INT32_MIN
+# define __INT_LEAST8_MAX    INT32_MAX
+# define __UINT_LEAST8_MAX  UINT32_MAX
+#endif /* __INT32_TYPE__ */
+
+#ifdef __INT_LEAST32_MIN
+# define INT_LEAST32_MIN   __INT_LEAST32_MIN
+# define INT_LEAST32_MAX   __INT_LEAST32_MAX
+# define UINT_LEAST32_MAX __UINT_LEAST32_MAX
+# define INT_FAST32_MIN    __INT_LEAST32_MIN
+# define INT_FAST32_MAX    __INT_LEAST32_MAX
+# define UINT_FAST32_MAX  __UINT_LEAST32_MAX
+#endif /* __INT_LEAST32_MIN */
+
+
+#ifdef __INT24_TYPE__
+# define INT24_MAX           INT24_C(8388607)
+# define INT24_MIN         (-INT24_C(8388607)-1)
+# define UINT24_MAX         UINT24_C(16777215)
+# define INT_LEAST24_MIN     INT24_MIN
+# define INT_LEAST24_MAX     INT24_MAX
+# define UINT_LEAST24_MAX   UINT24_MAX
+# define INT_FAST24_MIN      INT24_MIN
+# define INT_FAST24_MAX      INT24_MAX
+# define UINT_FAST24_MAX    UINT24_MAX
+# define __INT_LEAST16_MIN   INT24_MIN
+# define __INT_LEAST16_MAX   INT24_MAX
+# define __UINT_LEAST16_MAX UINT24_MAX
+# define __INT_LEAST8_MIN    INT24_MIN
+# define __INT_LEAST8_MAX    INT24_MAX
+# define __UINT_LEAST8_MAX  UINT24_MAX
+#endif /* __INT24_TYPE__ */
+
+
+#ifdef __INT16_TYPE__
+#define INT16_MAX            INT16_C(32767)
+#define INT16_MIN          (-INT16_C(32767)-1)
+#define UINT16_MAX          UINT16_C(65535)
+# define __INT_LEAST16_MIN   INT16_MIN
+# define __INT_LEAST16_MAX   INT16_MAX
+# define __UINT_LEAST16_MAX UINT16_MAX
+# define __INT_LEAST8_MIN    INT16_MIN
+# define __INT_LEAST8_MAX    INT16_MAX
+# define __UINT_LEAST8_MAX  UINT16_MAX
+#endif /* __INT16_TYPE__ */
+
+#ifdef __INT_LEAST16_MIN
+# define INT_LEAST16_MIN   __INT_LEAST16_MIN
+# define INT_LEAST16_MAX   __INT_LEAST16_MAX
+# define UINT_LEAST16_MAX __UINT_LEAST16_MAX
+# define INT_FAST16_MIN    __INT_LEAST16_MIN
+# define INT_FAST16_MAX    __INT_LEAST16_MAX
+# define UINT_FAST16_MAX  __UINT_LEAST16_MAX
+#endif /* __INT_LEAST16_MIN */
+
+
+#ifdef __INT8_TYPE__
+# define INT8_MAX            INT8_C(127)
+# define INT8_MIN          (-INT8_C(127)-1)
+# define UINT8_MAX          UINT8_C(255)
+# define __INT_LEAST8_MIN    INT8_MIN
+# define __INT_LEAST8_MAX    INT8_MAX
+# define __UINT_LEAST8_MAX  UINT8_MAX
+#endif /* __INT8_TYPE__ */
+
+#ifdef __INT_LEAST8_MIN
+# define INT_LEAST8_MIN   __INT_LEAST8_MIN
+# define INT_LEAST8_MAX   __INT_LEAST8_MAX
+# define UINT_LEAST8_MAX __UINT_LEAST8_MAX
+# define INT_FAST8_MIN    __INT_LEAST8_MIN
+# define INT_FAST8_MAX    __INT_LEAST8_MAX
+# define UINT_FAST8_MAX  __UINT_LEAST8_MAX
+#endif /* __INT_LEAST8_MIN */
+
+/* Some utility macros */
+#define  __INTN_MIN(n)  __stdint_join3( INT, n, _MIN)
+#define  __INTN_MAX(n)  __stdint_join3( INT, n, _MAX)
+#define __UINTN_MAX(n)  __stdint_join3(UINT, n, _MAX)
+#define  __INTN_C(n, v) __stdint_join3( INT, n, _C(v))
+#define __UINTN_C(n, v) __stdint_join3(UINT, n, _C(v))
+
+/* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
+/* C99 7.18.3 Limits of other integer types. */
+
+#define  INTPTR_MIN  __INTN_MIN(__INTPTR_WIDTH__)
+#define  INTPTR_MAX  __INTN_MAX(__INTPTR_WIDTH__)
+#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
+#define PTRDIFF_MIN  __INTN_MIN(__PTRDIFF_WIDTH__)
+#define PTRDIFF_MAX  __INTN_MAX(__PTRDIFF_WIDTH__)
+#define    SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
+
+/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
+ * is enabled. */
+#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
+#define   RSIZE_MAX            (SIZE_MAX >> 1)
+#endif
+
+/* C99 7.18.2.5 Limits of greatest-width integer types. */
+#define INTMAX_MIN   __INTN_MIN(__INTMAX_WIDTH__)
+#define INTMAX_MAX   __INTN_MAX(__INTMAX_WIDTH__)
+#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
+
+/* C99 7.18.3 Limits of other integer types. */
+#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
+#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
+#ifdef __WINT_UNSIGNED__
+# define WINT_MIN       __UINTN_C(__WINT_WIDTH__, 0)
+# define WINT_MAX       __UINTN_MAX(__WINT_WIDTH__)
+#else
+# define WINT_MIN       __INTN_MIN(__WINT_WIDTH__)
+# define WINT_MAX       __INTN_MAX(__WINT_WIDTH__)
+#endif
+
+#ifndef WCHAR_MAX
+# define WCHAR_MAX __WCHAR_MAX__
+#endif
+#ifndef WCHAR_MIN
+# if __WCHAR_MAX__ == __INTN_MAX(__WCHAR_WIDTH__)
+#  define WCHAR_MIN __INTN_MIN(__WCHAR_WIDTH__)
+# else
+#  define WCHAR_MIN __UINTN_C(__WCHAR_WIDTH__, 0)
+# endif
+#endif
+
+/* 7.18.4.2 Macros for greatest-width integer constants. */
+#define INTMAX_C(v)   __INTN_C(__INTMAX_WIDTH__, v)
+#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __CLANG_STDINT_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdnoreturn.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdnoreturn.h
new file mode 100644
index 000000000..a7a301d7e
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/stdnoreturn.h
@@ -0,0 +1,30 @@
+/*===---- stdnoreturn.h - Standard header for noreturn macro ---------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDNORETURN_H
+#define __STDNORETURN_H
+
+#define noreturn _Noreturn
+#define __noreturn_is_defined 1
+
+#endif /* __STDNORETURN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/tbmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/tbmintrin.h
new file mode 100644
index 000000000..1d0d746a8
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/tbmintrin.h
@@ -0,0 +1,154 @@
+/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __TBMINTRIN_H
+#define __TBMINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
+
+#define __bextri_u32(a, b) \
+  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
+                                           (unsigned int)(b)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcfill_u32(unsigned int __a)
+{
+  return __a & (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blci_u32(unsigned int __a)
+{
+  return __a | ~(__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcic_u32(unsigned int __a)
+{
+  return ~__a & (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcmsk_u32(unsigned int __a)
+{
+  return __a ^ (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcs_u32(unsigned int __a)
+{
+  return __a | (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsfill_u32(unsigned int __a)
+{
+  return __a | (__a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsic_u32(unsigned int __a)
+{
+  return ~__a | (__a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__t1mskc_u32(unsigned int __a)
+{
+  return ~__a | (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__tzmsk_u32(unsigned int __a)
+{
+  return ~__a & (__a - 1);
+}
+
+#ifdef __x86_64__
+#define __bextri_u64(a, b) \
+  ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
+                                                 (unsigned long long)(b)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcfill_u64(unsigned long long __a)
+{
+  return __a & (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blci_u64(unsigned long long __a)
+{
+  return __a | ~(__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcic_u64(unsigned long long __a)
+{
+  return ~__a & (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcmsk_u64(unsigned long long __a)
+{
+  return __a ^ (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcs_u64(unsigned long long __a)
+{
+  return __a | (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsfill_u64(unsigned long long __a)
+{
+  return __a | (__a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsic_u64(unsigned long long __a)
+{
+  return ~__a | (__a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__t1mskc_u64(unsigned long long __a)
+{
+  return ~__a | (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__tzmsk_u64(unsigned long long __a)
+{
+  return ~__a & (__a - 1);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __TBMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/tgmath.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/tgmath.h
new file mode 100644
index 000000000..318e1185f
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/tgmath.h
@@ -0,0 +1,1374 @@
+/*===---- tgmath.h - Standard header for type generic math ----------------===*\
+ *
+ * Copyright (c) 2009 Howard Hinnant
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __TGMATH_H
+#define __TGMATH_H
+
+/* C99 7.22 Type-generic math <tgmath.h>. */
+#include <math.h>
+
+/* C++ handles type genericity with overloading in math.h. */
+#ifndef __cplusplus
+#include <complex.h>
+
+#define _TG_ATTRSp __attribute__((__overloadable__))
+#define _TG_ATTRS __attribute__((__overloadable__, __always_inline__))
+
+// promotion
+
+typedef void _Argument_type_is_not_arithmetic;
+static _Argument_type_is_not_arithmetic __tg_promote(...)
+  __attribute__((__unavailable__,__overloadable__));
+static double               _TG_ATTRSp __tg_promote(int);
+static double               _TG_ATTRSp __tg_promote(unsigned int);
+static double               _TG_ATTRSp __tg_promote(long);
+static double               _TG_ATTRSp __tg_promote(unsigned long);
+static double               _TG_ATTRSp __tg_promote(long long);
+static double               _TG_ATTRSp __tg_promote(unsigned long long);
+static float                _TG_ATTRSp __tg_promote(float);
+static double               _TG_ATTRSp __tg_promote(double);
+static long double          _TG_ATTRSp __tg_promote(long double);
+static float _Complex       _TG_ATTRSp __tg_promote(float _Complex);
+static double _Complex      _TG_ATTRSp __tg_promote(double _Complex);
+static long double _Complex _TG_ATTRSp __tg_promote(long double _Complex);
+
+#define __tg_promote1(__x)           (__typeof__(__tg_promote(__x)))
+#define __tg_promote2(__x, __y)      (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y)))
+#define __tg_promote3(__x, __y, __z) (__typeof__(__tg_promote(__x) + \
+                                                 __tg_promote(__y) + \
+                                                 __tg_promote(__z)))
+
+// acos
+
+static float
+    _TG_ATTRS
+    __tg_acos(float __x) {return acosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acos(double __x) {return acos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acos(long double __x) {return acosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acos(float _Complex __x) {return cacosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acos(double _Complex __x) {return cacos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acos(long double _Complex __x) {return cacosl(__x);}
+
+#undef acos
+#define acos(__x) __tg_acos(__tg_promote1((__x))(__x))
+
+// asin
+
+static float
+    _TG_ATTRS
+    __tg_asin(float __x) {return asinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asin(double __x) {return asin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asin(long double __x) {return asinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asin(float _Complex __x) {return casinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asin(double _Complex __x) {return casin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asin(long double _Complex __x) {return casinl(__x);}
+
+#undef asin
+#define asin(__x) __tg_asin(__tg_promote1((__x))(__x))
+
+// atan
+
+static float
+    _TG_ATTRS
+    __tg_atan(float __x) {return atanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atan(double __x) {return atan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan(long double __x) {return atanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atan(float _Complex __x) {return catanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atan(double _Complex __x) {return catan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atan(long double _Complex __x) {return catanl(__x);}
+
+#undef atan
+#define atan(__x) __tg_atan(__tg_promote1((__x))(__x))
+
+// acosh
+
+static float
+    _TG_ATTRS
+    __tg_acosh(float __x) {return acoshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_acosh(double __x) {return acosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_acosh(long double __x) {return acoshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_acosh(float _Complex __x) {return cacoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_acosh(double _Complex __x) {return cacosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_acosh(long double _Complex __x) {return cacoshl(__x);}
+
+#undef acosh
+#define acosh(__x) __tg_acosh(__tg_promote1((__x))(__x))
+
+// asinh
+
+static float
+    _TG_ATTRS
+    __tg_asinh(float __x) {return asinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_asinh(double __x) {return asinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_asinh(long double __x) {return asinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_asinh(float _Complex __x) {return casinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_asinh(double _Complex __x) {return casinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_asinh(long double _Complex __x) {return casinhl(__x);}
+
+#undef asinh
+#define asinh(__x) __tg_asinh(__tg_promote1((__x))(__x))
+
+// atanh
+
+static float
+    _TG_ATTRS
+    __tg_atanh(float __x) {return atanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_atanh(double __x) {return atanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_atanh(long double __x) {return atanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_atanh(float _Complex __x) {return catanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_atanh(double _Complex __x) {return catanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_atanh(long double _Complex __x) {return catanhl(__x);}
+
+#undef atanh
+#define atanh(__x) __tg_atanh(__tg_promote1((__x))(__x))
+
+// cos
+
+static float
+    _TG_ATTRS
+    __tg_cos(float __x) {return cosf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cos(double __x) {return cos(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cos(long double __x) {return cosl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cos(float _Complex __x) {return ccosf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cos(double _Complex __x) {return ccos(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cos(long double _Complex __x) {return ccosl(__x);}
+
+#undef cos
+#define cos(__x) __tg_cos(__tg_promote1((__x))(__x))
+
+// sin
+
+static float
+    _TG_ATTRS
+    __tg_sin(float __x) {return sinf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sin(double __x) {return sin(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sin(long double __x) {return sinl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sin(float _Complex __x) {return csinf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sin(double _Complex __x) {return csin(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sin(long double _Complex __x) {return csinl(__x);}
+
+#undef sin
+#define sin(__x) __tg_sin(__tg_promote1((__x))(__x))
+
+// tan
+
+static float
+    _TG_ATTRS
+    __tg_tan(float __x) {return tanf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tan(double __x) {return tan(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tan(long double __x) {return tanl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tan(float _Complex __x) {return ctanf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tan(double _Complex __x) {return ctan(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tan(long double _Complex __x) {return ctanl(__x);}
+
+#undef tan
+#define tan(__x) __tg_tan(__tg_promote1((__x))(__x))
+
+// cosh
+
+static float
+    _TG_ATTRS
+    __tg_cosh(float __x) {return coshf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cosh(double __x) {return cosh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cosh(long double __x) {return coshl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cosh(float _Complex __x) {return ccoshf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cosh(double _Complex __x) {return ccosh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cosh(long double _Complex __x) {return ccoshl(__x);}
+
+#undef cosh
+#define cosh(__x) __tg_cosh(__tg_promote1((__x))(__x))
+
+// sinh
+
+static float
+    _TG_ATTRS
+    __tg_sinh(float __x) {return sinhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sinh(double __x) {return sinh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sinh(long double __x) {return sinhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sinh(float _Complex __x) {return csinhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sinh(double _Complex __x) {return csinh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sinh(long double _Complex __x) {return csinhl(__x);}
+
+#undef sinh
+#define sinh(__x) __tg_sinh(__tg_promote1((__x))(__x))
+
+// tanh
+
+static float
+    _TG_ATTRS
+    __tg_tanh(float __x) {return tanhf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tanh(double __x) {return tanh(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tanh(long double __x) {return tanhl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_tanh(float _Complex __x) {return ctanhf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_tanh(double _Complex __x) {return ctanh(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_tanh(long double _Complex __x) {return ctanhl(__x);}
+
+#undef tanh
+#define tanh(__x) __tg_tanh(__tg_promote1((__x))(__x))
+
+// exp
+
+static float
+    _TG_ATTRS
+    __tg_exp(float __x) {return expf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp(double __x) {return exp(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp(long double __x) {return expl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_exp(float _Complex __x) {return cexpf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_exp(double _Complex __x) {return cexp(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_exp(long double _Complex __x) {return cexpl(__x);}
+
+#undef exp
+#define exp(__x) __tg_exp(__tg_promote1((__x))(__x))
+
+// log
+
+static float
+    _TG_ATTRS
+    __tg_log(float __x) {return logf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log(double __x) {return log(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log(long double __x) {return logl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_log(float _Complex __x) {return clogf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_log(double _Complex __x) {return clog(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_log(long double _Complex __x) {return clogl(__x);}
+
+#undef log
+#define log(__x) __tg_log(__tg_promote1((__x))(__x))
+
+// pow
+
+static float
+    _TG_ATTRS
+    __tg_pow(float __x, float __y) {return powf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_pow(double __x, double __y) {return pow(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_pow(long double __x, long double __y) {return powl(__x, __y);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_pow(float _Complex __x, float _Complex __y) {return cpowf(__x, __y);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_pow(double _Complex __x, double _Complex __y) {return cpow(__x, __y);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_pow(long double _Complex __x, long double _Complex __y)
+    {return cpowl(__x, __y);}
+
+#undef pow
+#define pow(__x, __y) __tg_pow(__tg_promote2((__x), (__y))(__x), \
+                               __tg_promote2((__x), (__y))(__y))
+
+// sqrt
+
+static float
+    _TG_ATTRS
+    __tg_sqrt(float __x) {return sqrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_sqrt(double __x) {return sqrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_sqrt(long double __x) {return sqrtl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_sqrt(float _Complex __x) {return csqrtf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_sqrt(double _Complex __x) {return csqrt(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_sqrt(long double _Complex __x) {return csqrtl(__x);}
+
+#undef sqrt
+#define sqrt(__x) __tg_sqrt(__tg_promote1((__x))(__x))
+
+// fabs
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float __x) {return fabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double __x) {return fabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double __x) {return fabsl(__x);}
+
+static float
+    _TG_ATTRS
+    __tg_fabs(float _Complex __x) {return cabsf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_fabs(double _Complex __x) {return cabs(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_fabs(long double _Complex __x) {return cabsl(__x);}
+
+#undef fabs
+#define fabs(__x) __tg_fabs(__tg_promote1((__x))(__x))
+
+// atan2
+
+static float
+    _TG_ATTRS
+    __tg_atan2(float __x, float __y) {return atan2f(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_atan2(double __x, double __y) {return atan2(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_atan2(long double __x, long double __y) {return atan2l(__x, __y);}
+
+#undef atan2
+#define atan2(__x, __y) __tg_atan2(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// cbrt
+
+static float
+    _TG_ATTRS
+    __tg_cbrt(float __x) {return cbrtf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cbrt(double __x) {return cbrt(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cbrt(long double __x) {return cbrtl(__x);}
+
+#undef cbrt
+#define cbrt(__x) __tg_cbrt(__tg_promote1((__x))(__x))
+
+// ceil
+
+static float
+    _TG_ATTRS
+    __tg_ceil(float __x) {return ceilf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_ceil(double __x) {return ceil(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_ceil(long double __x) {return ceill(__x);}
+
+#undef ceil
+#define ceil(__x) __tg_ceil(__tg_promote1((__x))(__x))
+
+// copysign
+
+static float
+    _TG_ATTRS
+    __tg_copysign(float __x, float __y) {return copysignf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_copysign(double __x, double __y) {return copysign(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_copysign(long double __x, long double __y) {return copysignl(__x, __y);}
+
+#undef copysign
+#define copysign(__x, __y) __tg_copysign(__tg_promote2((__x), (__y))(__x), \
+                                         __tg_promote2((__x), (__y))(__y))
+
+// erf
+
+static float
+    _TG_ATTRS
+    __tg_erf(float __x) {return erff(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erf(double __x) {return erf(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erf(long double __x) {return erfl(__x);}
+
+#undef erf
+#define erf(__x) __tg_erf(__tg_promote1((__x))(__x))
+
+// erfc
+
+static float
+    _TG_ATTRS
+    __tg_erfc(float __x) {return erfcf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_erfc(double __x) {return erfc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_erfc(long double __x) {return erfcl(__x);}
+
+#undef erfc
+#define erfc(__x) __tg_erfc(__tg_promote1((__x))(__x))
+
+// exp2
+
+static float
+    _TG_ATTRS
+    __tg_exp2(float __x) {return exp2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_exp2(double __x) {return exp2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_exp2(long double __x) {return exp2l(__x);}
+
+#undef exp2
+#define exp2(__x) __tg_exp2(__tg_promote1((__x))(__x))
+
+// expm1
+
+static float
+    _TG_ATTRS
+    __tg_expm1(float __x) {return expm1f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_expm1(double __x) {return expm1(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_expm1(long double __x) {return expm1l(__x);}
+
+#undef expm1
+#define expm1(__x) __tg_expm1(__tg_promote1((__x))(__x))
+
+// fdim
+
+static float
+    _TG_ATTRS
+    __tg_fdim(float __x, float __y) {return fdimf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fdim(double __x, double __y) {return fdim(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fdim(long double __x, long double __y) {return fdiml(__x, __y);}
+
+#undef fdim
+#define fdim(__x, __y) __tg_fdim(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// floor
+
+static float
+    _TG_ATTRS
+    __tg_floor(float __x) {return floorf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_floor(double __x) {return floor(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_floor(long double __x) {return floorl(__x);}
+
+#undef floor
+#define floor(__x) __tg_floor(__tg_promote1((__x))(__x))
+
+// fma
+
+static float
+    _TG_ATTRS
+    __tg_fma(float __x, float __y, float __z)
+    {return fmaf(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_fma(double __x, double __y, double __z)
+    {return fma(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_fma(long double __x,long double __y, long double __z)
+    {return fmal(__x, __y, __z);}
+
+#undef fma
+#define fma(__x, __y, __z)                                \
+        __tg_fma(__tg_promote3((__x), (__y), (__z))(__x), \
+                 __tg_promote3((__x), (__y), (__z))(__y), \
+                 __tg_promote3((__x), (__y), (__z))(__z))
+
+// fmax
+
+static float
+    _TG_ATTRS
+    __tg_fmax(float __x, float __y) {return fmaxf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmax(double __x, double __y) {return fmax(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmax(long double __x, long double __y) {return fmaxl(__x, __y);}
+
+#undef fmax
+#define fmax(__x, __y) __tg_fmax(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmin
+
+static float
+    _TG_ATTRS
+    __tg_fmin(float __x, float __y) {return fminf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmin(double __x, double __y) {return fmin(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmin(long double __x, long double __y) {return fminl(__x, __y);}
+
+#undef fmin
+#define fmin(__x, __y) __tg_fmin(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// fmod
+
+static float
+    _TG_ATTRS
+    __tg_fmod(float __x, float __y) {return fmodf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_fmod(double __x, double __y) {return fmod(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_fmod(long double __x, long double __y) {return fmodl(__x, __y);}
+
+#undef fmod
+#define fmod(__x, __y) __tg_fmod(__tg_promote2((__x), (__y))(__x), \
+                                 __tg_promote2((__x), (__y))(__y))
+
+// frexp
+
+static float
+    _TG_ATTRS
+    __tg_frexp(float __x, int* __y) {return frexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_frexp(double __x, int* __y) {return frexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_frexp(long double __x, int* __y) {return frexpl(__x, __y);}
+
+#undef frexp
+#define frexp(__x, __y) __tg_frexp(__tg_promote1((__x))(__x), __y)
+
+// hypot
+
+static float
+    _TG_ATTRS
+    __tg_hypot(float __x, float __y) {return hypotf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_hypot(double __x, double __y) {return hypot(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_hypot(long double __x, long double __y) {return hypotl(__x, __y);}
+
+#undef hypot
+#define hypot(__x, __y) __tg_hypot(__tg_promote2((__x), (__y))(__x), \
+                                   __tg_promote2((__x), (__y))(__y))
+
+// ilogb
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(float __x) {return ilogbf(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(double __x) {return ilogb(__x);}
+
+static int
+    _TG_ATTRS
+    __tg_ilogb(long double __x) {return ilogbl(__x);}
+
+#undef ilogb
+#define ilogb(__x) __tg_ilogb(__tg_promote1((__x))(__x))
+
+// ldexp
+
+static float
+    _TG_ATTRS
+    __tg_ldexp(float __x, int __y) {return ldexpf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_ldexp(double __x, int __y) {return ldexp(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_ldexp(long double __x, int __y) {return ldexpl(__x, __y);}
+
+#undef ldexp
+#define ldexp(__x, __y) __tg_ldexp(__tg_promote1((__x))(__x), __y)
+
+// lgamma
+
+static float
+    _TG_ATTRS
+    __tg_lgamma(float __x) {return lgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_lgamma(double __x) {return lgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_lgamma(long double __x) {return lgammal(__x);}
+
+#undef lgamma
+#define lgamma(__x) __tg_lgamma(__tg_promote1((__x))(__x))
+
+// llrint
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(float __x) {return llrintf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(double __x) {return llrint(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llrint(long double __x) {return llrintl(__x);}
+
+#undef llrint
+#define llrint(__x) __tg_llrint(__tg_promote1((__x))(__x))
+
+// llround
+
+static long long
+    _TG_ATTRS
+    __tg_llround(float __x) {return llroundf(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(double __x) {return llround(__x);}
+
+static long long
+    _TG_ATTRS
+    __tg_llround(long double __x) {return llroundl(__x);}
+
+#undef llround
+#define llround(__x) __tg_llround(__tg_promote1((__x))(__x))
+
+// log10
+
+static float
+    _TG_ATTRS
+    __tg_log10(float __x) {return log10f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log10(double __x) {return log10(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log10(long double __x) {return log10l(__x);}
+
+#undef log10
+#define log10(__x) __tg_log10(__tg_promote1((__x))(__x))
+
+// log1p
+
+static float
+    _TG_ATTRS
+    __tg_log1p(float __x) {return log1pf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log1p(double __x) {return log1p(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log1p(long double __x) {return log1pl(__x);}
+
+#undef log1p
+#define log1p(__x) __tg_log1p(__tg_promote1((__x))(__x))
+
+// log2
+
+static float
+    _TG_ATTRS
+    __tg_log2(float __x) {return log2f(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_log2(double __x) {return log2(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_log2(long double __x) {return log2l(__x);}
+
+#undef log2
+#define log2(__x) __tg_log2(__tg_promote1((__x))(__x))
+
+// logb
+
+static float
+    _TG_ATTRS
+    __tg_logb(float __x) {return logbf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_logb(double __x) {return logb(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_logb(long double __x) {return logbl(__x);}
+
+#undef logb
+#define logb(__x) __tg_logb(__tg_promote1((__x))(__x))
+
+// lrint
+
+static long
+    _TG_ATTRS
+    __tg_lrint(float __x) {return lrintf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(double __x) {return lrint(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lrint(long double __x) {return lrintl(__x);}
+
+#undef lrint
+#define lrint(__x) __tg_lrint(__tg_promote1((__x))(__x))
+
+// lround
+
+static long
+    _TG_ATTRS
+    __tg_lround(float __x) {return lroundf(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(double __x) {return lround(__x);}
+
+static long
+    _TG_ATTRS
+    __tg_lround(long double __x) {return lroundl(__x);}
+
+#undef lround
+#define lround(__x) __tg_lround(__tg_promote1((__x))(__x))
+
+// nearbyint
+
+static float
+    _TG_ATTRS
+    __tg_nearbyint(float __x) {return nearbyintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_nearbyint(double __x) {return nearbyint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_nearbyint(long double __x) {return nearbyintl(__x);}
+
+#undef nearbyint
+#define nearbyint(__x) __tg_nearbyint(__tg_promote1((__x))(__x))
+
+// nextafter
+
+static float
+    _TG_ATTRS
+    __tg_nextafter(float __x, float __y) {return nextafterf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nextafter(double __x, double __y) {return nextafter(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nextafter(long double __x, long double __y) {return nextafterl(__x, __y);}
+
+#undef nextafter
+#define nextafter(__x, __y) __tg_nextafter(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// nexttoward
+
+static float
+    _TG_ATTRS
+    __tg_nexttoward(float __x, long double __y) {return nexttowardf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_nexttoward(double __x, long double __y) {return nexttoward(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_nexttoward(long double __x, long double __y) {return nexttowardl(__x, __y);}
+
+#undef nexttoward
+#define nexttoward(__x, __y) __tg_nexttoward(__tg_promote1((__x))(__x), (__y))
+
+// remainder
+
+static float
+    _TG_ATTRS
+    __tg_remainder(float __x, float __y) {return remainderf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_remainder(double __x, double __y) {return remainder(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_remainder(long double __x, long double __y) {return remainderl(__x, __y);}
+
+#undef remainder
+#define remainder(__x, __y) __tg_remainder(__tg_promote2((__x), (__y))(__x), \
+                                           __tg_promote2((__x), (__y))(__y))
+
+// remquo
+
+static float
+    _TG_ATTRS
+    __tg_remquo(float __x, float __y, int* __z)
+    {return remquof(__x, __y, __z);}
+
+static double
+    _TG_ATTRS
+    __tg_remquo(double __x, double __y, int* __z)
+    {return remquo(__x, __y, __z);}
+
+static long double
+    _TG_ATTRS
+    __tg_remquo(long double __x,long double __y, int* __z)
+    {return remquol(__x, __y, __z);}
+
+#undef remquo
+#define remquo(__x, __y, __z)                         \
+        __tg_remquo(__tg_promote2((__x), (__y))(__x), \
+                    __tg_promote2((__x), (__y))(__y), \
+                    (__z))
+
+// rint
+
+static float
+    _TG_ATTRS
+    __tg_rint(float __x) {return rintf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_rint(double __x) {return rint(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_rint(long double __x) {return rintl(__x);}
+
+#undef rint
+#define rint(__x) __tg_rint(__tg_promote1((__x))(__x))
+
+// round
+
+static float
+    _TG_ATTRS
+    __tg_round(float __x) {return roundf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_round(double __x) {return round(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_round(long double __x) {return roundl(__x);}
+
+#undef round
+#define round(__x) __tg_round(__tg_promote1((__x))(__x))
+
+// scalbn
+
+static float
+    _TG_ATTRS
+    __tg_scalbn(float __x, int __y) {return scalbnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbn(double __x, int __y) {return scalbn(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbn(long double __x, int __y) {return scalbnl(__x, __y);}
+
+#undef scalbn
+#define scalbn(__x, __y) __tg_scalbn(__tg_promote1((__x))(__x), __y)
+
+// scalbln
+
+static float
+    _TG_ATTRS
+    __tg_scalbln(float __x, long __y) {return scalblnf(__x, __y);}
+
+static double
+    _TG_ATTRS
+    __tg_scalbln(double __x, long __y) {return scalbln(__x, __y);}
+
+static long double
+    _TG_ATTRS
+    __tg_scalbln(long double __x, long __y) {return scalblnl(__x, __y);}
+
+#undef scalbln
+#define scalbln(__x, __y) __tg_scalbln(__tg_promote1((__x))(__x), __y)
+
+// tgamma
+
+static float
+    _TG_ATTRS
+    __tg_tgamma(float __x) {return tgammaf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_tgamma(double __x) {return tgamma(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_tgamma(long double __x) {return tgammal(__x);}
+
+#undef tgamma
+#define tgamma(__x) __tg_tgamma(__tg_promote1((__x))(__x))
+
+// trunc
+
+static float
+    _TG_ATTRS
+    __tg_trunc(float __x) {return truncf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_trunc(double __x) {return trunc(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_trunc(long double __x) {return truncl(__x);}
+
+#undef trunc
+#define trunc(__x) __tg_trunc(__tg_promote1((__x))(__x))
+
+// carg
+
+static float
+    _TG_ATTRS
+    __tg_carg(float __x) {return atan2f(0.F, __x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double __x) {return atan2(0., __x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double __x) {return atan2l(0.L, __x);}
+
+static float
+    _TG_ATTRS
+    __tg_carg(float _Complex __x) {return cargf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_carg(double _Complex __x) {return carg(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_carg(long double _Complex __x) {return cargl(__x);}
+
+#undef carg
+#define carg(__x) __tg_carg(__tg_promote1((__x))(__x))
+
+// cimag
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float __x) {return 0;}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double __x) {return 0;}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double __x) {return 0;}
+
+static float
+    _TG_ATTRS
+    __tg_cimag(float _Complex __x) {return cimagf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_cimag(double _Complex __x) {return cimag(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_cimag(long double _Complex __x) {return cimagl(__x);}
+
+#undef cimag
+#define cimag(__x) __tg_cimag(__tg_promote1((__x))(__x))
+
+// conj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float __x) {return __x;}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double __x) {return __x;}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double __x) {return __x;}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_conj(float _Complex __x) {return conjf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_conj(double _Complex __x) {return conj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_conj(long double _Complex __x) {return conjl(__x);}
+
+#undef conj
+#define conj(__x) __tg_conj(__tg_promote1((__x))(__x))
+
+// cproj
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double __x) {return cprojl(__x);}
+
+static float _Complex
+    _TG_ATTRS
+    __tg_cproj(float _Complex __x) {return cprojf(__x);}
+
+static double _Complex
+    _TG_ATTRS
+    __tg_cproj(double _Complex __x) {return cproj(__x);}
+
+static long double _Complex
+    _TG_ATTRS
+    __tg_cproj(long double _Complex __x) {return cprojl(__x);}
+
+#undef cproj
+#define cproj(__x) __tg_cproj(__tg_promote1((__x))(__x))
+
+// creal
+
+static float
+    _TG_ATTRS
+    __tg_creal(float __x) {return __x;}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double __x) {return __x;}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double __x) {return __x;}
+
+static float
+    _TG_ATTRS
+    __tg_creal(float _Complex __x) {return crealf(__x);}
+
+static double
+    _TG_ATTRS
+    __tg_creal(double _Complex __x) {return creal(__x);}
+
+static long double
+    _TG_ATTRS
+    __tg_creal(long double _Complex __x) {return creall(__x);}
+
+#undef creal
+#define creal(__x) __tg_creal(__tg_promote1((__x))(__x))
+
+#undef _TG_ATTRSp
+#undef _TG_ATTRS
+
+#endif /* __cplusplus */
+#endif /* __TGMATH_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/tmmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/tmmintrin.h
new file mode 100644
index 000000000..80664043a
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/tmmintrin.h
@@ -0,0 +1,773 @@
+/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __TMMINTRIN_H
+#define __TMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
+
+/// \brief Computes the absolute value of each of the packed 8-bit signed
+///    integers in the source operand and stores the 8-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSB instruction.
+///
+/// \param __a
+///    A 64-bit vector of [8 x i8].
+/// \returns A 64-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi8(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 8-bit signed
+///    integers in the source operand and stores the 8-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSB instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi8(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 16-bit signed
+///    integers in the source operand and stores the 16-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16].
+/// \returns A 64-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi16(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 16-bit signed
+///    integers in the source operand and stores the 16-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi16(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 32-bit signed
+///    integers in the source operand and stores the 32-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32].
+/// \returns A 64-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_abs_pi32(__m64 __a)
+{
+    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+}
+
+/// \brief Computes the absolute value of each of the packed 32-bit signed
+///    integers in the source operand and stores the 32-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi32(__m128i __a)
+{
+    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
+}
+
+/// \brief Concatenates the two 128-bit integer vector operands, and
+///    right-shifts the result by the number of bytes specified in the immediate
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c PALIGNR instruction.
+///
+/// \param a
+///    A 128-bit vector of [16 x i8] containing one of the source operands.
+/// \param b
+///    A 128-bit vector of [16 x i8] containing one of the source operands.
+/// \param n
+///    An immediate operand specifying how many bytes to right-shift the result.
+/// \returns A 128-bit integer vector containing the concatenated right-shifted
+///    value.
+#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
+  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
+                                     (__v16qi)(__m128i)(b), (n)); })
+
+/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
+///    the result by the number of bytes specified in the immediate operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c PALIGNR instruction.
+///
+/// \param a
+///    A 64-bit vector of [8 x i8] containing one of the source operands.
+/// \param b
+///    A 64-bit vector of [8 x i8] containing one of the source operands.
+/// \param n
+///    An immediate operand specifying how many bytes to right-shift the result.
+/// \returns A 64-bit integer vector containing the concatenated right-shifted
+///    value.
+#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
+  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
+///    operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadd_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
+///    operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadd_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
+///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
+///    sums of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadds_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
+///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
+///    sums of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hadds_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
+///    of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
+///    of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
+///    of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsub_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 64-bit vector of [2 x i32] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
+///    of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsub_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
+///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+///    saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
+///    differences of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
+///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+///    saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands. The
+///    horizontal differences between the values are stored in the upper bits of
+///    the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
+///    differences of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_hsubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+///    values contained in the first source operand and packed 8-bit signed
+///    integer values contained in the second source operand, adds pairs of
+///    contiguous products with signed saturation, and writes the 16-bit sums to
+///    the corresponding bits in the destination. For example, bits [7:0] of
+///    both operands are multiplied, bits [15:8] of both operands are
+///    multiplied, and the sum of both results is written to bits [15:0] of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the first source operand.
+/// \param __b
+///    A 128-bit integer vector containing the second source operand.
+/// \returns A 128-bit integer vector containing the sums of products of both
+///    operands: \n
+///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
+///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
+///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
+///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
+///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddubs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+///    values contained in the first source operand and packed 8-bit signed
+///    integer values contained in the second source operand, adds pairs of
+///    contiguous products with signed saturation, and writes the 16-bit sums to
+///    the corresponding bits in the destination. For example, bits [7:0] of
+///    both operands are multiplied, bits [15:8] of both operands are
+///    multiplied, and the sum of both results is written to bits [15:0] of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMADDUBSW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the first source operand.
+/// \param __b
+///    A 64-bit integer vector containing the second source operand.
+/// \returns A 64-bit integer vector containing the sums of products of both
+///    operands: \n
+///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_maddubs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+///    products to the 18 most significant bits by right-shifting, rounds the
+///    truncated value by adding 1, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMULHRSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source operands.
+/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
+///    products of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhrs_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+///    products to the 18 most significant bits by right-shifting, rounds the
+///    truncated value by adding 1, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMULHRSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source operands.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source operands.
+/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
+///    products of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhrs_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
+///    destination or clears 8-bit values in the destination, as specified by
+///    the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSHUFB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control bytes corresponding to
+///    positions in the destination:
+///    Bit 7: \n
+///    1: Clear the corresponding byte in the destination. \n
+///    0: Copy the selected source byte to the corresponding byte in the
+///    destination. \n
+///    Bits [6:4] Reserved.  \n
+///    Bits [3:0] select the source byte to be copied.
+/// \returns A 128-bit integer vector containing the copied or cleared values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shuffle_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
+///    destination or clears 8-bit values in the destination, as specified by
+///    the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSHUFB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control bytes corresponding to
+///    positions in the destination:
+///    Bit 7: \n
+///    1: Clear the corresponding byte in the destination. \n
+///    0: Copy the selected source byte to the corresponding byte in the
+///    destination. \n
+///    Bits [3:0] select the source byte to be copied.
+/// \returns A 64-bit integer vector containing the copied or cleared values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_shuffle_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief For each 8-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    byte in the second source is negative, calculate the two's complement of
+///    the corresponding byte in the first source, and write that value to the
+///    destination. If the byte in the second source is positive, copy the
+///    corresponding byte from the first source to the destination. If the byte
+///    in the second source is zero, clear the corresponding byte in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGNB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control bytes corresponding to
+///    positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi8(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// \brief For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    word in the second source is negative, calculate the two's complement of
+///    the corresponding word in the first source, and write that value to the
+///    destination. If the word in the second source is positive, copy the
+///    corresponding word from the first source to the destination. If the word
+///    in the second source is zero, clear the corresponding word in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGNW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control words corresponding to
+///    positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi16(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// \brief For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    doubleword in the second source is negative, calculate the two's
+///    complement of the corresponding word in the first source, and write that
+///    value to the destination. If the doubleword in the second source is
+///    positive, copy the corresponding word from the first source to the
+///    destination. If the doubleword in the second source is zero, clear the
+///    corresponding word in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGND instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control doublewords corresponding to
+///    positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi32(__m128i __a, __m128i __b)
+{
+    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
+}
+
+/// \brief For each 8-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    byte in the second source is negative, calculate the two's complement of
+///    the corresponding byte in the first source, and write that value to the
+///    destination. If the byte in the second source is positive, copy the
+///    corresponding byte from the first source to the destination. If the byte
+///    in the second source is zero, clear the corresponding byte in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGNB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control bytes corresponding to
+///    positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi8(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    word in the second source is negative, calculate the two's complement of
+///    the corresponding word in the first source, and write that value to the
+///    destination. If the word in the second source is positive, copy the
+///    corresponding word from the first source to the destination. If the word
+///    in the second source is zero, clear the corresponding word in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGNW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control words corresponding to
+///    positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi16(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand: If the
+///    doubleword in the second source is negative, calculate the two's
+///    complement of the corresponding doubleword in the first source, and
+///    write that value to the destination. If the doubleword in the second
+///    source is positive, copy the corresponding doubleword from the first
+///    source to the destination. If the doubleword in the second source is
+///    zero, clear the corresponding doubleword in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGND instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing two control doublewords corresponding
+///    to positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sign_pi32(__m64 __a, __m64 __b)
+{
+    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __TMMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/unwind.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/unwind.h
new file mode 100644
index 000000000..4f74a3478
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/unwind.h
@@ -0,0 +1,299 @@
+/*===---- unwind.h - Stack unwinding ----------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* See "Data Definitions for libgcc_s" in the Linux Standard Base.*/
+
+#ifndef __CLANG_UNWIND_H
+#define __CLANG_UNWIND_H
+
+#if defined(__APPLE__) && __has_include_next(<unwind.h>)
+/* Darwin (from 11.x on) provide an unwind.h. If that's available,
+ * use it. libunwind wraps some of its definitions in #ifdef _GNU_SOURCE,
+ * so define that around the include.*/
+# ifndef _GNU_SOURCE
+#  define _SHOULD_UNDEFINE_GNU_SOURCE
+#  define _GNU_SOURCE
+# endif
+// libunwind's unwind.h reflects the current visibility.  However, Mozilla
+// builds with -fvisibility=hidden and relies on gcc's unwind.h to reset the
+// visibility to default and export its contents.  gcc also allows users to
+// override its override by #defining HIDE_EXPORTS (but note, this only obeys
+// the user's -fvisibility setting; it doesn't hide any exports on its own).  We
+// imitate gcc's header here:
+# ifdef HIDE_EXPORTS
+#  include_next <unwind.h>
+# else
+#  pragma GCC visibility push(default)
+#  include_next <unwind.h>
+#  pragma GCC visibility pop
+# endif
+# ifdef _SHOULD_UNDEFINE_GNU_SOURCE
+#  undef _GNU_SOURCE
+#  undef _SHOULD_UNDEFINE_GNU_SOURCE
+# endif
+#else
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* It is a bit strange for a header to play with the visibility of the
+   symbols it declares, but this matches gcc's behavior and some programs
+   depend on it */
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility push(default)
+#endif
+
+typedef uintptr_t _Unwind_Word;
+typedef intptr_t _Unwind_Sword;
+typedef uintptr_t _Unwind_Ptr;
+typedef uintptr_t _Unwind_Internal_Ptr;
+typedef uint64_t _Unwind_Exception_Class;
+
+typedef intptr_t _sleb128_t;
+typedef uintptr_t _uleb128_t;
+
+struct _Unwind_Context;
+struct _Unwind_Exception;
+typedef enum {
+  _URC_NO_REASON = 0,
+#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
+    !defined(__ARM_DWARF_EH__)
+  _URC_OK = 0, /* used by ARM EHABI */
+#endif
+  _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
+
+  _URC_FATAL_PHASE2_ERROR = 2,
+  _URC_FATAL_PHASE1_ERROR = 3,
+  _URC_NORMAL_STOP = 4,
+
+  _URC_END_OF_STACK = 5,
+  _URC_HANDLER_FOUND = 6,
+  _URC_INSTALL_CONTEXT = 7,
+  _URC_CONTINUE_UNWIND = 8,
+#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
+    !defined(__ARM_DWARF_EH__)
+  _URC_FAILURE = 9 /* used by ARM EHABI */
+#endif
+} _Unwind_Reason_Code;
+
+typedef enum {
+  _UA_SEARCH_PHASE = 1,
+  _UA_CLEANUP_PHASE = 2,
+
+  _UA_HANDLER_FRAME = 4,
+  _UA_FORCE_UNWIND = 8,
+  _UA_END_OF_STACK = 16 /* gcc extension to C++ ABI */
+} _Unwind_Action;
+
+typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
+                                             struct _Unwind_Exception *);
+
+struct _Unwind_Exception {
+  _Unwind_Exception_Class exception_class;
+  _Unwind_Exception_Cleanup_Fn exception_cleanup;
+  _Unwind_Word private_1;
+  _Unwind_Word private_2;
+  /* The Itanium ABI requires that _Unwind_Exception objects are "double-word
+   * aligned".  GCC has interpreted this to mean "use the maximum useful
+   * alignment for the target"; so do we. */
+} __attribute__((__aligned__));
+
+typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
+                                               _Unwind_Exception_Class,
+                                               struct _Unwind_Exception *,
+                                               struct _Unwind_Context *,
+                                               void *);
+
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
+    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
+    struct _Unwind_Context *);
+typedef _Unwind_Personality_Fn __personality_routine;
+
+typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
+                                                void *);
+
+#if defined(__arm__) && !defined(__APPLE__)
+
+typedef enum {
+  _UVRSC_CORE = 0,        /* integer register */
+  _UVRSC_VFP = 1,         /* vfp */
+  _UVRSC_WMMXD = 3,       /* Intel WMMX data register */
+  _UVRSC_WMMXC = 4        /* Intel WMMX control register */
+} _Unwind_VRS_RegClass;
+
+typedef enum {
+  _UVRSD_UINT32 = 0,
+  _UVRSD_VFPX = 1,
+  _UVRSD_UINT64 = 3,
+  _UVRSD_FLOAT = 4,
+  _UVRSD_DOUBLE = 5
+} _Unwind_VRS_DataRepresentation;
+
+typedef enum {
+  _UVRSR_OK = 0,
+  _UVRSR_NOT_IMPLEMENTED = 1,
+  _UVRSR_FAILED = 2
+} _Unwind_VRS_Result;
+
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__)
+typedef uint32_t _Unwind_State;
+#define _US_VIRTUAL_UNWIND_FRAME  ((_Unwind_State)0)
+#define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1)
+#define _US_UNWIND_FRAME_RESUME   ((_Unwind_State)2)
+#define _US_ACTION_MASK           ((_Unwind_State)3)
+#define _US_FORCE_UNWIND          ((_Unwind_State)8)
+#endif
+
+_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+_Unwind_VRS_Result _Unwind_VRS_Set(struct _Unwind_Context *__context,
+  _Unwind_VRS_RegClass __regclass,
+  uint32_t __regno,
+  _Unwind_VRS_DataRepresentation __representation,
+  void *__valuep);
+
+static __inline__
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *__context, int __index) {
+  _Unwind_Word __value;
+  _Unwind_VRS_Get(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+  return __value;
+}
+
+static __inline__
+void _Unwind_SetGR(struct _Unwind_Context *__context, int __index,
+                   _Unwind_Word __value) {
+  _Unwind_VRS_Set(__context, _UVRSC_CORE, __index, _UVRSD_UINT32, &__value);
+}
+
+static __inline__
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *__context) {
+  _Unwind_Word __ip = _Unwind_GetGR(__context, 15);
+  return __ip & ~(_Unwind_Word)(0x1); /* Remove thumb mode bit. */
+}
+
+static __inline__
+void _Unwind_SetIP(struct _Unwind_Context *__context, _Unwind_Word __value) {
+  _Unwind_Word __thumb_mode_bit = _Unwind_GetGR(__context, 15) & 0x1;
+  _Unwind_SetGR(__context, 15, __value | __thumb_mode_bit);
+}
+#else
+_Unwind_Word _Unwind_GetGR(struct _Unwind_Context *, int);
+void _Unwind_SetGR(struct _Unwind_Context *, int, _Unwind_Word);
+
+_Unwind_Word _Unwind_GetIP(struct _Unwind_Context *);
+void _Unwind_SetIP(struct _Unwind_Context *, _Unwind_Word);
+#endif
+
+
+_Unwind_Word _Unwind_GetIPInfo(struct _Unwind_Context *, int *);
+
+_Unwind_Word _Unwind_GetCFA(struct _Unwind_Context *);
+
+_Unwind_Word _Unwind_GetBSP(struct _Unwind_Context *);
+
+void *_Unwind_GetLanguageSpecificData(struct _Unwind_Context *);
+
+_Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
+
+/* DWARF EH functions; currently not available on Darwin/ARM */
+#if !defined(__APPLE__) || !defined(__arm__)
+
+_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
+                                         _Unwind_Stop_Fn, void *);
+void _Unwind_DeleteException(struct _Unwind_Exception *);
+void _Unwind_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+#endif
+
+_Unwind_Reason_Code _Unwind_Backtrace(_Unwind_Trace_Fn, void *);
+
+/* setjmp(3)/longjmp(3) stuff */
+typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
+
+void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
+void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+                                              _Unwind_Stop_Fn, void *);
+void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+
+void *_Unwind_FindEnclosingFunction(void *);
+
+#ifdef __APPLE__
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *)
+    __attribute__((__unavailable__));
+
+/* Darwin-specific functions */
+void __register_frame(const void *);
+void __deregister_frame(const void *);
+
+struct dwarf_eh_bases {
+  uintptr_t tbase;
+  uintptr_t dbase;
+  uintptr_t func;
+};
+void *_Unwind_Find_FDE(const void *, struct dwarf_eh_bases *);
+
+void __register_frame_info_bases(const void *, void *, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info(const void *, void *) __attribute__((__unavailable__));
+void __register_frame_info_table_bases(const void *, void*, void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_info_table(const void *, void *)
+  __attribute__((__unavailable__));
+void __register_frame_table(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info(const void *) __attribute__((__unavailable__));
+void __deregister_frame_info_bases(const void *)__attribute__((__unavailable__));
+
+#else
+
+_Unwind_Ptr _Unwind_GetDataRelBase(struct _Unwind_Context *);
+_Unwind_Ptr _Unwind_GetTextRelBase(struct _Unwind_Context *);
+
+#endif
+
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif /* __CLANG_UNWIND_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/vadefs.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/vadefs.h
new file mode 100644
index 000000000..7fe9a74e3
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/vadefs.h
@@ -0,0 +1,65 @@
+/* ===-------- vadefs.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we are aiming for MSVC compatibility. */
+#ifndef _MSC_VER
+#include_next <vadefs.h>
+#else
+
+#ifndef __clang_vadefs_h
+#define __clang_vadefs_h
+
+#include_next <vadefs.h>
+
+/* Override macros from vadefs.h with definitions that work with Clang. */
+#ifdef _crt_va_start
+#undef _crt_va_start
+#define _crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef _crt_va_end
+#undef _crt_va_end
+#define _crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef _crt_va_arg
+#undef _crt_va_arg
+#define _crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+/* VS 2015 switched to double underscore names, which is an improvement, but now
+ * we have to intercept those names too.
+ */
+#ifdef __crt_va_start
+#undef __crt_va_start
+#define __crt_va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#ifdef __crt_va_end
+#undef __crt_va_end
+#define __crt_va_end(ap)          __builtin_va_end(ap)
+#endif
+#ifdef __crt_va_arg
+#undef __crt_va_arg
+#define __crt_va_arg(ap, type)    __builtin_va_arg(ap, type)
+#endif
+
+#endif
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/varargs.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/varargs.h
new file mode 100644
index 000000000..b5477d0a6
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/varargs.h
@@ -0,0 +1,26 @@
+/*===---- varargs.h - Variable argument handling -------------------------------------===
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+*===-----------------------------------------------------------------------===
+*/
+#ifndef __VARARGS_H
+#define __VARARGS_H
+  #error "Please use <stdarg.h> instead of <varargs.h>"
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/vecintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/vecintrin.h
new file mode 100644
index 000000000..ca7acb473
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/vecintrin.h
@@ -0,0 +1,8946 @@
+/*===---- vecintrin.h - Vector intrinsics ----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if defined(__s390x__) && defined(__VEC__)
+
+#define __ATTRS_ai __attribute__((__always_inline__))
+#define __ATTRS_o __attribute__((__overloadable__))
+#define __ATTRS_o_ai __attribute__((__overloadable__, __always_inline__))
+
+#define __constant(PARM) \
+  __attribute__((__enable_if__ ((PARM) == (PARM), \
+     "argument must be a constant integer")))
+#define __constant_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH), \
+     "argument must be a constant integer from " #LOW " to " #HIGH)))
+#define __constant_pow2_range(PARM, LOW, HIGH) \
+  __attribute__((__enable_if__ ((PARM) >= (LOW) && (PARM) <= (HIGH) && \
+                                ((PARM) & ((PARM) - 1)) == 0, \
+     "argument must be a constant power of 2 from " #LOW " to " #HIGH)))
+
+/*-- __lcbb -----------------------------------------------------------------*/
+
+extern __ATTRS_o unsigned int
+__lcbb(const void *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define __lcbb(X, Y) ((__typeof__((__lcbb)((X), (Y)))) \
+  __builtin_s390_lcbb((X), __builtin_constant_p((Y))? \
+                           ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : 0) : 0))
+
+/*-- vec_extract ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai signed char
+vec_extract(vector signed char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector bool char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai unsigned char
+vec_extract(vector unsigned char __vec, int __index) {
+  return __vec[__index & 15];
+}
+
+static inline __ATTRS_o_ai signed short
+vec_extract(vector signed short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector bool short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai unsigned short
+vec_extract(vector unsigned short __vec, int __index) {
+  return __vec[__index & 7];
+}
+
+static inline __ATTRS_o_ai signed int
+vec_extract(vector signed int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector bool int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai unsigned int
+vec_extract(vector unsigned int __vec, int __index) {
+  return __vec[__index & 3];
+}
+
+static inline __ATTRS_o_ai signed long long
+vec_extract(vector signed long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector bool long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai unsigned long long
+vec_extract(vector unsigned long long __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+static inline __ATTRS_o_ai double
+vec_extract(vector double __vec, int __index) {
+  return __vec[__index & 1];
+}
+
+/*-- vec_insert -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert(signed char __scalar, vector signed char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector bool char __vec, int __index) {
+  vector unsigned char __newvec = (vector unsigned char)__vec;
+  __newvec[__index & 15] = (unsigned char)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert(unsigned char __scalar, vector unsigned char __vec, int __index) {
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert(signed short __scalar, vector signed short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector bool short __vec, int __index) {
+  vector unsigned short __newvec = (vector unsigned short)__vec;
+  __newvec[__index & 7] = (unsigned short)__scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert(unsigned short __scalar, vector unsigned short __vec, int __index) {
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert(signed int __scalar, vector signed int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector bool int __vec, int __index) {
+  vector unsigned int __newvec = (vector unsigned int)__vec;
+  __newvec[__index & 3] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert(unsigned int __scalar, vector unsigned int __vec, int __index) {
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert(signed long long __scalar, vector signed long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector bool long long __vec,
+           int __index) {
+  vector unsigned long long __newvec = (vector unsigned long long)__vec;
+  __newvec[__index & 1] = __scalar;
+  return __newvec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert(unsigned long long __scalar, vector unsigned long long __vec,
+           int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert(double __scalar, vector double __vec, int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_promote ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_promote(signed char __scalar, int __index) {
+  const vector signed char __zero = (vector signed char)0;
+  vector signed char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_promote(unsigned char __scalar, int __index) {
+  const vector unsigned char __zero = (vector unsigned char)0;
+  vector unsigned char __vec = __builtin_shufflevector(__zero, __zero,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 15] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_promote(signed short __scalar, int __index) {
+  const vector signed short __zero = (vector signed short)0;
+  vector signed short __vec = __builtin_shufflevector(__zero, __zero,
+                                -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_promote(unsigned short __scalar, int __index) {
+  const vector unsigned short __zero = (vector unsigned short)0;
+  vector unsigned short __vec = __builtin_shufflevector(__zero, __zero,
+                                  -1, -1, -1, -1, -1, -1, -1, -1);
+  __vec[__index & 7] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_promote(signed int __scalar, int __index) {
+  const vector signed int __zero = (vector signed int)0;
+  vector signed int __vec = __builtin_shufflevector(__zero, __zero,
+                                                    -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_promote(unsigned int __scalar, int __index) {
+  const vector unsigned int __zero = (vector unsigned int)0;
+  vector unsigned int __vec = __builtin_shufflevector(__zero, __zero,
+                                                      -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_promote(signed long long __scalar, int __index) {
+  const vector signed long long __zero = (vector signed long long)0;
+  vector signed long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                          -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_promote(unsigned long long __scalar, int __index) {
+  const vector unsigned long long __zero = (vector unsigned long long)0;
+  vector unsigned long long __vec = __builtin_shufflevector(__zero, __zero,
+                                                            -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_promote(double __scalar, int __index) {
+  const vector double __zero = (vector double)0;
+  vector double __vec = __builtin_shufflevector(__zero, __zero, -1, -1);
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+
+/*-- vec_insert_and_zero ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_insert_and_zero(const signed char *__ptr) {
+  vector signed char __vec = (vector signed char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_insert_and_zero(const unsigned char *__ptr) {
+  vector unsigned char __vec = (vector unsigned char)0;
+  __vec[7] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_insert_and_zero(const signed short *__ptr) {
+  vector signed short __vec = (vector signed short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_insert_and_zero(const unsigned short *__ptr) {
+  vector unsigned short __vec = (vector unsigned short)0;
+  __vec[3] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_insert_and_zero(const signed int *__ptr) {
+  vector signed int __vec = (vector signed int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_insert_and_zero(const unsigned int *__ptr) {
+  vector unsigned int __vec = (vector unsigned int)0;
+  __vec[1] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_insert_and_zero(const signed long long *__ptr) {
+  vector signed long long __vec = (vector signed long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_insert_and_zero(const unsigned long long *__ptr) {
+  vector unsigned long long __vec = (vector unsigned long long)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_insert_and_zero(const double *__ptr) {
+  vector double __vec = (vector double)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+
+/*-- vec_perm ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_perm(vector signed char __a, vector signed char __b,
+         vector unsigned char __c) {
+  return (vector signed char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_perm(vector unsigned char __a, vector unsigned char __b,
+         vector unsigned char __c) {
+  return (vector unsigned char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_perm(vector bool char __a, vector bool char __b,
+         vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_perm(vector signed short __a, vector signed short __b,
+         vector unsigned char __c) {
+  return (vector signed short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_perm(vector unsigned short __a, vector unsigned short __b,
+         vector unsigned char __c) {
+  return (vector unsigned short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_perm(vector bool short __a, vector bool short __b,
+         vector unsigned char __c) {
+  return (vector bool short)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_perm(vector signed int __a, vector signed int __b,
+         vector unsigned char __c) {
+  return (vector signed int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_perm(vector unsigned int __a, vector unsigned int __b,
+         vector unsigned char __c) {
+  return (vector unsigned int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_perm(vector bool int __a, vector bool int __b,
+         vector unsigned char __c) {
+  return (vector bool int)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_perm(vector signed long long __a, vector signed long long __b,
+         vector unsigned char __c) {
+  return (vector signed long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_perm(vector unsigned long long __a, vector unsigned long long __b,
+         vector unsigned char __c) {
+  return (vector unsigned long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_perm(vector bool long long __a, vector bool long long __b,
+         vector unsigned char __c) {
+  return (vector bool long long)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_perm(vector double __a, vector double __b,
+         vector unsigned char __c) {
+  return (vector double)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+
+/*-- vec_permi --------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed long long
+vec_permi(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_permi(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector bool long long
+vec_permi(vector bool long long __a, vector bool long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_permi(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
+#define vec_permi(X, Y, Z) ((__typeof__((vec_permi)((X), (Y), (Z)))) \
+  __builtin_s390_vpdi((vector unsigned long long)(X), \
+                      (vector unsigned long long)(Y), \
+                      (((Z) & 2) << 1) | ((Z) & 1)))
+
+/*-- vec_sel ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b,
+        vector unsigned char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sel(vector signed char __a, vector signed char __b, vector bool char __c) {
+  return ((vector signed char)__c & __b) | (~(vector signed char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector unsigned char __c) {
+  return ((vector bool char)__c & __b) | (~(vector bool char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sel(vector bool char __a, vector bool char __b, vector bool char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector unsigned char __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sel(vector unsigned char __a, vector unsigned char __b,
+        vector bool char __c) {
+  return ((vector unsigned char)__c & __b) | (~(vector unsigned char)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector unsigned short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sel(vector signed short __a, vector signed short __b,
+        vector bool short __c) {
+  return ((vector signed short)__c & __b) | (~(vector signed short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b,
+        vector unsigned short __c) {
+  return ((vector bool short)__c & __b) | (~(vector bool short)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sel(vector bool short __a, vector bool short __b, vector bool short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector unsigned short __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sel(vector unsigned short __a, vector unsigned short __b,
+        vector bool short __c) {
+  return (((vector unsigned short)__c & __b) |
+          (~(vector unsigned short)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b,
+        vector unsigned int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sel(vector signed int __a, vector signed int __b, vector bool int __c) {
+  return ((vector signed int)__c & __b) | (~(vector signed int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector unsigned int __c) {
+  return ((vector bool int)__c & __b) | (~(vector bool int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sel(vector bool int __a, vector bool int __b, vector bool int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b,
+        vector unsigned int __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sel(vector unsigned int __a, vector unsigned int __b, vector bool int __c) {
+  return ((vector unsigned int)__c & __b) | (~(vector unsigned int)__c & __a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector unsigned long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sel(vector signed long long __a, vector signed long long __b,
+        vector bool long long __c) {
+  return (((vector signed long long)__c & __b) |
+          (~(vector signed long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector unsigned long long __c) {
+  return (((vector bool long long)__c & __b) |
+          (~(vector bool long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sel(vector bool long long __a, vector bool long long __b,
+        vector bool long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector unsigned long long __c) {
+  return (__c & __b) | (~__c & __a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sel(vector unsigned long long __a, vector unsigned long long __b,
+        vector bool long long __c) {
+  return (((vector unsigned long long)__c & __b) |
+          (~(vector unsigned long long)__c & __a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
+  return (vector double)((__c & (vector unsigned long long)__b) |
+                         (~__c & (vector unsigned long long)__a));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_sel(vector double __a, vector double __b, vector bool long long __c) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  vector unsigned long long __cc = (vector unsigned long long)__c;
+  return (vector double)((__cc & __bc) | (~__cc & __ac));
+}
+
+/*-- vec_gather_element -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed int
+vec_gather_element(vector signed int __vec, vector unsigned int __offset,
+                   const signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const signed int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_gather_element(vector bool int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gather_element(vector unsigned int __vec, vector unsigned int __offset,
+                   const unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const unsigned int *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_gather_element(vector signed long long __vec,
+                   vector unsigned long long __offset,
+                   const signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const signed long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_gather_element(vector bool long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gather_element(vector unsigned long long __vec,
+                   vector unsigned long long __offset,
+                   const unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const unsigned long long *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_gather_element(vector double __vec, vector unsigned long long __offset,
+                   const double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  __vec[__index] = *(const double *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+
+/*-- vec_scatter_element ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed int __vec, vector unsigned int __offset,
+                    signed int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(signed int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned int __vec, vector unsigned int __offset,
+                    unsigned int *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(unsigned int *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector signed long long __vec,
+                    vector unsigned long long __offset,
+                    signed long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(signed long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector bool long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector unsigned long long __vec,
+                    vector unsigned long long __offset,
+                    unsigned long long *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector double __vec, vector unsigned long long __offset,
+                    double *__ptr, int __index)
+  __constant_range(__index, 0, 1) {
+  *(double *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+
+/*-- vec_xld2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xld2(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xld2(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xld2(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xld2(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xld2(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xld2(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_xld2(long __offset, const signed long long *__ptr) {
+  return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xld2(long __offset, const unsigned long long *__ptr) {
+  return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_xld2(long __offset, const double *__ptr) {
+  return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xlw4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xlw4(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xlw4(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xlw4(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xlw4(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xlw4(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xlw4(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+/*-- vec_xstd2 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector signed long long __vec, long __offset,
+          signed long long *__ptr) {
+  *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector unsigned long long __vec, long __offset,
+          unsigned long long *__ptr) {
+  *(vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset) =
+    __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstd2(vector double __vec, long __offset, double *__ptr) {
+  *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_xstw4 --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xstw4(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+/*-- vec_load_bndry ---------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_load_bndry(const signed char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned char
+vec_load_bndry(const unsigned char *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed short
+vec_load_bndry(const signed short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned short
+vec_load_bndry(const unsigned short *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed int
+vec_load_bndry(const signed int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned int
+vec_load_bndry(const unsigned int *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector signed long long
+vec_load_bndry(const signed long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector unsigned long long
+vec_load_bndry(const unsigned long long *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+extern __ATTRS_o vector double
+vec_load_bndry(const double *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+
+#define vec_load_bndry(X, Y) ((__typeof__((vec_load_bndry)((X), (Y)))) \
+  __builtin_s390_vlbb((X), ((Y) == 64 ? 0 : \
+                            (Y) == 128 ? 1 : \
+                            (Y) == 256 ? 2 : \
+                            (Y) == 512 ? 3 : \
+                            (Y) == 1024 ? 4 : \
+                            (Y) == 2048 ? 5 : \
+                            (Y) == 4096 ? 6 : -1)))
+
+/*-- vec_load_len -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_load_len(const signed char *__ptr, unsigned int __len) {
+  return (vector signed char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_load_len(const unsigned char *__ptr, unsigned int __len) {
+  return (vector unsigned char)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_load_len(const signed short *__ptr, unsigned int __len) {
+  return (vector signed short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_load_len(const unsigned short *__ptr, unsigned int __len) {
+  return (vector unsigned short)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_load_len(const signed int *__ptr, unsigned int __len) {
+  return (vector signed int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_load_len(const unsigned int *__ptr, unsigned int __len) {
+  return (vector unsigned int)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_len(const signed long long *__ptr, unsigned int __len) {
+  return (vector signed long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_len(const unsigned long long *__ptr, unsigned int __len) {
+  return (vector unsigned long long)__builtin_s390_vll(__len, __ptr);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_load_len(const double *__ptr, unsigned int __len) {
+  return (vector double)__builtin_s390_vll(__len, __ptr);
+}
+
+/*-- vec_store_len ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed char __vec, signed char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned char __vec, unsigned char *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed short __vec, signed short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned short __vec, unsigned short *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed int __vec, signed int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned int __vec, unsigned int *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector signed long long __vec, signed long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector unsigned long long __vec, unsigned long long *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+static inline __ATTRS_o_ai void
+vec_store_len(vector double __vec, double *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+
+/*-- vec_load_pair ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_load_pair(signed long long __a, signed long long __b) {
+  return (vector signed long long)(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_load_pair(unsigned long long __a, unsigned long long __b) {
+  return (vector unsigned long long)(__a, __b);
+}
+
+/*-- vec_genmask ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmask(unsigned short __mask)
+  __constant(__mask) {
+  return (vector unsigned char)(
+    __mask & 0x8000 ? 0xff : 0,
+    __mask & 0x4000 ? 0xff : 0,
+    __mask & 0x2000 ? 0xff : 0,
+    __mask & 0x1000 ? 0xff : 0,
+    __mask & 0x0800 ? 0xff : 0,
+    __mask & 0x0400 ? 0xff : 0,
+    __mask & 0x0200 ? 0xff : 0,
+    __mask & 0x0100 ? 0xff : 0,
+    __mask & 0x0080 ? 0xff : 0,
+    __mask & 0x0040 ? 0xff : 0,
+    __mask & 0x0020 ? 0xff : 0,
+    __mask & 0x0010 ? 0xff : 0,
+    __mask & 0x0008 ? 0xff : 0,
+    __mask & 0x0004 ? 0xff : 0,
+    __mask & 0x0002 ? 0xff : 0,
+    __mask & 0x0001 ? 0xff : 0);
+}
+
+/*-- vec_genmasks_* ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_genmasks_8(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 7;
+  unsigned char __bit2 = __last & 7;
+  unsigned char __mask1 = (unsigned char)(1U << (7 - __bit1) << 1) - 1;
+  unsigned char __mask2 = (unsigned char)(1U << (7 - __bit2)) - 1;
+  unsigned char __value = (__bit1 <= __bit2 ?
+                           __mask1 & ~__mask2 :
+                           __mask1 | ~__mask2);
+  return (vector unsigned char)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_genmasks_16(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 15;
+  unsigned char __bit2 = __last & 15;
+  unsigned short __mask1 = (unsigned short)(1U << (15 - __bit1) << 1) - 1;
+  unsigned short __mask2 = (unsigned short)(1U << (15 - __bit2)) - 1;
+  unsigned short __value = (__bit1 <= __bit2 ?
+                            __mask1 & ~__mask2 :
+                            __mask1 | ~__mask2);
+  return (vector unsigned short)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_genmasks_32(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 31;
+  unsigned char __bit2 = __last & 31;
+  unsigned int __mask1 = (1U << (31 - __bit1) << 1) - 1;
+  unsigned int __mask2 = (1U << (31 - __bit2)) - 1;
+  unsigned int __value = (__bit1 <= __bit2 ?
+                          __mask1 & ~__mask2 :
+                          __mask1 | ~__mask2);
+  return (vector unsigned int)__value;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_genmasks_64(unsigned char __first, unsigned char __last)
+  __constant(__first) __constant(__last) {
+  unsigned char __bit1 = __first & 63;
+  unsigned char __bit2 = __last & 63;
+  unsigned long long __mask1 = (1ULL << (63 - __bit1) << 1) - 1;
+  unsigned long long __mask2 = (1ULL << (63 - __bit2)) - 1;
+  unsigned long long __value = (__bit1 <= __bit2 ?
+                                __mask1 & ~__mask2 :
+                                __mask1 | ~__mask2);
+  return (vector unsigned long long)__value;
+}
+
+/*-- vec_splat --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splat(vector signed char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector signed char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_splat(vector bool char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector bool char)(vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splat(vector unsigned char __vec, int __index)
+  __constant_range(__index, 0, 15) {
+  return (vector unsigned char)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splat(vector signed short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector signed short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_splat(vector bool short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector bool short)(vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splat(vector unsigned short __vec, int __index)
+  __constant_range(__index, 0, 7) {
+  return (vector unsigned short)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splat(vector signed int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector signed int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_splat(vector bool int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector bool int)(vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splat(vector unsigned int __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector unsigned int)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splat(vector signed long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector signed long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_splat(vector bool long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector bool long long)(vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splat(vector unsigned long long __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector unsigned long long)__vec[__index];
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splat(vector double __vec, int __index)
+  __constant_range(__index, 0, 1) {
+  return (vector double)__vec[__index];
+}
+
+/*-- vec_splat_s* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector signed char
+vec_splat_s8(signed char __scalar)
+  __constant(__scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_ai vector signed short
+vec_splat_s16(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_ai vector signed int
+vec_splat_s32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector signed long long
+vec_splat_s64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector signed long long)(signed long)__scalar;
+}
+
+/*-- vec_splat_u* -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_splat_u8(unsigned char __scalar)
+  __constant(__scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned short
+vec_splat_u16(unsigned short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned int
+vec_splat_u32(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned int)(signed int)__scalar;
+}
+
+static inline __ATTRS_ai vector unsigned long long
+vec_splat_u64(signed short __scalar)
+  __constant(__scalar) {
+  return (vector unsigned long long)(signed long long)__scalar;
+}
+
+/*-- vec_splats -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_splats(signed char __scalar) {
+  return (vector signed char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_splats(unsigned char __scalar) {
+  return (vector unsigned char)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_splats(signed short __scalar) {
+  return (vector signed short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_splats(unsigned short __scalar) {
+  return (vector unsigned short)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_splats(signed int __scalar) {
+  return (vector signed int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_splats(unsigned int __scalar) {
+  return (vector unsigned int)__scalar;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_splats(signed long long __scalar) {
+  return (vector signed long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_splats(unsigned long long __scalar) {
+  return (vector unsigned long long)__scalar;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_splats(double __scalar) {
+  return (vector double)__scalar;
+}
+
+/*-- vec_extend_s64 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed char __a) {
+  return (vector signed long long)(__a[7], __a[15]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed short __a) {
+  return (vector signed long long)(__a[3], __a[7]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_extend_s64(vector signed int __a) {
+  return (vector signed long long)(__a[1], __a[3]);
+}
+
+/*-- vec_mergeh -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergeh(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergeh(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergeh(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3],
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergeh(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergeh(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergeh(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergeh(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergeh(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergeh(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[0], __b[0], __a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergeh(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergeh(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[0], __b[0]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergeh(vector double __a, vector double __b) {
+  return (vector double)(__a[0], __b[0]);
+}
+
+/*-- vec_mergel -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mergel(vector signed char __a, vector signed char __b) {
+  return (vector signed char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_mergel(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mergel(vector unsigned char __a, vector unsigned char __b) {
+  return (vector unsigned char)(
+    __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], __a[11], __b[11],
+    __a[12], __b[12], __a[13], __b[13], __a[14], __b[14], __a[15], __b[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mergel(vector signed short __a, vector signed short __b) {
+  return (vector signed short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_mergel(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mergel(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)(
+    __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], __a[7], __b[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mergel(vector signed int __a, vector signed int __b) {
+  return (vector signed int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_mergel(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mergel(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)(__a[2], __b[2], __a[3], __b[3]);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mergel(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_mergel(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)(__a[1], __b[1]);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_mergel(vector double __a, vector double __b) {
+  return (vector double)(__a[1], __b[1]);
+}
+
+/*-- vec_pack ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_pack(vector signed short __a, vector signed short __b) {
+  vector signed char __ac = (vector signed char)__a;
+  vector signed char __bc = (vector signed char)__b;
+  return (vector signed char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_pack(vector bool short __a, vector bool short __b) {
+  vector bool char __ac = (vector bool char)__a;
+  vector bool char __bc = (vector bool char)__b;
+  return (vector bool char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_pack(vector unsigned short __a, vector unsigned short __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return (vector unsigned char)(
+    __ac[1], __ac[3], __ac[5], __ac[7], __ac[9], __ac[11], __ac[13], __ac[15],
+    __bc[1], __bc[3], __bc[5], __bc[7], __bc[9], __bc[11], __bc[13], __bc[15]);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_pack(vector signed int __a, vector signed int __b) {
+  vector signed short __ac = (vector signed short)__a;
+  vector signed short __bc = (vector signed short)__b;
+  return (vector signed short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_pack(vector bool int __a, vector bool int __b) {
+  vector bool short __ac = (vector bool short)__a;
+  vector bool short __bc = (vector bool short)__b;
+  return (vector bool short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_pack(vector unsigned int __a, vector unsigned int __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return (vector unsigned short)(
+    __ac[1], __ac[3], __ac[5], __ac[7],
+    __bc[1], __bc[3], __bc[5], __bc[7]);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_pack(vector signed long long __a, vector signed long long __b) {
+  vector signed int __ac = (vector signed int)__a;
+  vector signed int __bc = (vector signed int)__b;
+  return (vector signed int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_pack(vector bool long long __a, vector bool long long __b) {
+  vector bool int __ac = (vector bool int)__a;
+  vector bool int __bc = (vector bool int)__b;
+  return (vector bool int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_pack(vector unsigned long long __a, vector unsigned long long __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return (vector unsigned int)(__ac[1], __ac[3], __bc[1], __bc[3]);
+}
+
+/*-- vec_packs --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vpksh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vpksf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vpksg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packs_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_packs_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return __builtin_s390_vpkshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packs_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_packs_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return __builtin_s390_vpksfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packs_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_packs_cc(vector signed long long __a, vector signed long long __b,
+             int *__cc) {
+  return __builtin_s390_vpksgs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packs_cc(vector unsigned long long __a, vector unsigned long long __b,
+             int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_packsu -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector signed short __a, vector signed short __b) {
+  const vector signed short __zero = (vector signed short)0;
+  return __builtin_s390_vpklsh(
+    (vector unsigned short)(__a >= __zero) & (vector unsigned short)__a,
+    (vector unsigned short)(__b >= __zero) & (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vpklsh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector signed int __a, vector signed int __b) {
+  const vector signed int __zero = (vector signed int)0;
+  return __builtin_s390_vpklsf(
+    (vector unsigned int)(__a >= __zero) & (vector unsigned int)__a,
+    (vector unsigned int)(__b >= __zero) & (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vpklsf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector signed long long __a, vector signed long long __b) {
+  const vector signed long long __zero = (vector signed long long)0;
+  return __builtin_s390_vpklsg(
+    (vector unsigned long long)(__a >= __zero) &
+    (vector unsigned long long)__a,
+    (vector unsigned long long)(__b >= __zero) &
+    (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vpklsg(__a, __b);
+}
+
+/*-- vec_packsu_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_packsu_cc(vector unsigned short __a, vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vpklshs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_packsu_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vpklsfs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_packsu_cc(vector unsigned long long __a, vector unsigned long long __b,
+              int *__cc) {
+  return __builtin_s390_vpklsgs(__a, __b, __cc);
+}
+
+/*-- vec_unpackh ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackh(vector signed char __a) {
+  return __builtin_s390_vuphb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackh(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuphb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackh(vector unsigned char __a) {
+  return __builtin_s390_vuplhb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackh(vector signed short __a) {
+  return __builtin_s390_vuphh(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackh(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuphh((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackh(vector unsigned short __a) {
+  return __builtin_s390_vuplhh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackh(vector signed int __a) {
+  return __builtin_s390_vuphf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackh(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuphf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackh(vector unsigned int __a) {
+  return __builtin_s390_vuplhf(__a);
+}
+
+/*-- vec_unpackl ------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_unpackl(vector signed char __a) {
+  return __builtin_s390_vuplb(__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_unpackl(vector bool char __a) {
+  return (vector bool short)__builtin_s390_vuplb((vector signed char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_unpackl(vector unsigned char __a) {
+  return __builtin_s390_vupllb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_unpackl(vector signed short __a) {
+  return __builtin_s390_vuplhw(__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_unpackl(vector bool short __a) {
+  return (vector bool int)__builtin_s390_vuplhw((vector signed short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_unpackl(vector unsigned short __a) {
+  return __builtin_s390_vupllh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_unpackl(vector signed int __a) {
+  return __builtin_s390_vuplf(__a);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_unpackl(vector bool int __a) {
+  return (vector bool long long)__builtin_s390_vuplf((vector signed int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unpackl(vector unsigned int __a) {
+  return __builtin_s390_vupllf(__a);
+}
+
+/*-- vec_cmpeq --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpeq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpeq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpeq(vector double __a, vector double __b) {
+  return (vector bool long long)(__a == __b);
+}
+
+/*-- vec_cmpge --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpge(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpge(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpge(vector double __a, vector double __b) {
+  return (vector bool long long)(__a >= __b);
+}
+
+/*-- vec_cmpgt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpgt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpgt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmpgt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a > __b);
+}
+
+/*-- vec_cmple --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmple(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmple(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmple(vector double __a, vector double __b) {
+  return (vector bool long long)(__a <= __b);
+}
+
+/*-- vec_cmplt --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector signed char __a, vector signed char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmplt(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector signed short __a, vector signed short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmplt(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector signed int __a, vector signed int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_cmplt(vector double __a, vector double __b) {
+  return (vector bool long long)(__a < __b);
+}
+
+/*-- vec_all_eq -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ge -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_le -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc == 3;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc == 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_all_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_nge ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_all_nan ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 0;
+}
+
+/*-- vec_all_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_all_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc == 3;
+}
+
+/*-- vec_any_eq -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_eq(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_ne -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vceqbs((vector signed char)__a,
+                        (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vceqhs((vector signed short)__a,
+                        (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vceqfs((vector signed int)__a,
+                        (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vceqgs((vector signed long long)__a,
+                        (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ne(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfcedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ge -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_ge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_gt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_gt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_le -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__a, (vector signed char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__a, (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__a,
+                        (vector unsigned char)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__a, (vector signed short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__a, (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__a,
+                        (vector unsigned short)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__a, (vector signed int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__a, (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__a,
+                        (vector unsigned int)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__a, (vector signed long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__a, (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a, __b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__a,
+                        (vector unsigned long long)__b, &__cc);
+  return __cc != 0;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_le(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_lt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchbs((vector signed char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector signed char __b) {
+  int __cc;
+  __builtin_s390_vchbs(__b, (vector signed char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector unsigned char __b) {
+  int __cc;
+  __builtin_s390_vchlbs(__b, (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool char __a, vector bool char __b) {
+  int __cc;
+  __builtin_s390_vchlbs((vector unsigned char)__b,
+                        (vector unsigned char)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchhs((vector signed short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector signed short __b) {
+  int __cc;
+  __builtin_s390_vchhs(__b, (vector signed short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector unsigned short __b) {
+  int __cc;
+  __builtin_s390_vchlhs(__b, (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool short __a, vector bool short __b) {
+  int __cc;
+  __builtin_s390_vchlhs((vector unsigned short)__b,
+                        (vector unsigned short)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchfs((vector signed int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector signed int __b) {
+  int __cc;
+  __builtin_s390_vchfs(__b, (vector signed int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector unsigned int __b) {
+  int __cc;
+  __builtin_s390_vchlfs(__b, (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool int __a, vector bool int __b) {
+  int __cc;
+  __builtin_s390_vchlfs((vector unsigned int)__b,
+                        (vector unsigned int)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector signed long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchgs((vector signed long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector signed long long __b) {
+  int __cc;
+  __builtin_s390_vchgs(__b, (vector signed long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector unsigned long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector unsigned long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs(__b, (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector bool long long __a, vector bool long long __b) {
+  int __cc;
+  __builtin_s390_vchlgs((vector unsigned long long)__b,
+                        (vector unsigned long long)__a, &__cc);
+  return __cc <= 1;
+}
+
+static inline __ATTRS_o_ai int
+vec_any_lt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+
+/*-- vec_any_nge ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nge(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_ngt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_ngt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nle ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nle(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchedbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nlt ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nlt(vector double __a, vector double __b) {
+  int __cc;
+  __builtin_s390_vfchdbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_any_nan ------------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_nan(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 3;
+}
+
+/*-- vec_any_numeric --------------------------------------------------------*/
+
+static inline __ATTRS_ai int
+vec_any_numeric(vector double __a) {
+  int __cc;
+  __builtin_s390_vftcidb(__a, 15, &__cc);
+  return __cc != 0;
+}
+
+/*-- vec_andc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_andc(vector bool char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector bool char __a, vector signed char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_andc(vector signed char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector bool char __a, vector unsigned char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_andc(vector unsigned char __a, vector bool char __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_andc(vector bool short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector bool short __a, vector signed short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_andc(vector signed short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector bool short __a, vector unsigned short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_andc(vector unsigned short __a, vector bool short __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_andc(vector bool int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector bool int __a, vector signed int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_andc(vector signed int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector bool int __a, vector unsigned int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_andc(vector unsigned int __a, vector bool int __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_andc(vector bool long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector bool long long __a, vector signed long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_andc(vector signed long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector bool long long __a, vector unsigned long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_andc(vector unsigned long long __a, vector bool long long __b) {
+  return __a & ~__b;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector bool long long __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_andc(vector double __a, vector bool long long __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+
+/*-- vec_nor ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_nor(vector bool char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector bool char __a, vector signed char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nor(vector signed char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector bool char __a, vector unsigned char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nor(vector unsigned char __a, vector bool char __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_nor(vector bool short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector bool short __a, vector signed short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nor(vector signed short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector bool short __a, vector unsigned short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nor(vector unsigned short __a, vector bool short __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_nor(vector bool int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector bool int __a, vector signed int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nor(vector signed int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector bool int __a, vector unsigned int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nor(vector unsigned int __a, vector bool int __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_nor(vector bool long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector bool long long __a, vector signed long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nor(vector signed long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector bool long long __a, vector unsigned long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nor(vector unsigned long long __a, vector bool long long __b) {
+  return ~(__a | __b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector bool long long __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nor(vector double __a, vector bool long long __b) {
+  return (vector double)~((vector unsigned long long)__a |
+                          (vector unsigned long long)__b);
+}
+
+/*-- vec_cntlz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector signed char __a) {
+  return __builtin_s390_vclzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cntlz(vector unsigned char __a) {
+  return __builtin_s390_vclzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector signed short __a) {
+  return __builtin_s390_vclzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cntlz(vector unsigned short __a) {
+  return __builtin_s390_vclzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector signed int __a) {
+  return __builtin_s390_vclzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cntlz(vector unsigned int __a) {
+  return __builtin_s390_vclzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector signed long long __a) {
+  return __builtin_s390_vclzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cntlz(vector unsigned long long __a) {
+  return __builtin_s390_vclzg(__a);
+}
+
+/*-- vec_cnttz --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector signed char __a) {
+  return __builtin_s390_vctzb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cnttz(vector unsigned char __a) {
+  return __builtin_s390_vctzb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector signed short __a) {
+  return __builtin_s390_vctzh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cnttz(vector unsigned short __a) {
+  return __builtin_s390_vctzh(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector signed int __a) {
+  return __builtin_s390_vctzf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cnttz(vector unsigned int __a) {
+  return __builtin_s390_vctzf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector signed long long __a) {
+  return __builtin_s390_vctzg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_cnttz(vector unsigned long long __a) {
+  return __builtin_s390_vctzg(__a);
+}
+
+/*-- vec_popcnt -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector signed char __a) {
+  return __builtin_s390_vpopctb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_popcnt(vector unsigned char __a) {
+  return __builtin_s390_vpopctb(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector signed short __a) {
+  return __builtin_s390_vpopcth((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_popcnt(vector unsigned short __a) {
+  return __builtin_s390_vpopcth(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector signed int __a) {
+  return __builtin_s390_vpopctf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_popcnt(vector unsigned int __a) {
+  return __builtin_s390_vpopctf(__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector signed long long __a) {
+  return __builtin_s390_vpopctg((vector unsigned long long)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_popcnt(vector unsigned long long __a) {
+  return __builtin_s390_vpopctg(__a);
+}
+
+/*-- vec_rl -----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_verllvb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_verllvb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_verllvh(
+    (vector unsigned short)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rl(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_verllvh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_verllvf(
+    (vector unsigned int)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rl(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_verllvf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rl(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_verllvg(
+    (vector unsigned long long)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rl(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_verllvg(__a, __b);
+}
+
+/*-- vec_rli ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_rli(vector signed char __a, unsigned long __b) {
+  return (vector signed char)__builtin_s390_verllb(
+    (vector unsigned char)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_rli(vector unsigned char __a, unsigned long __b) {
+  return __builtin_s390_verllb(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_rli(vector signed short __a, unsigned long __b) {
+  return (vector signed short)__builtin_s390_verllh(
+    (vector unsigned short)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_rli(vector unsigned short __a, unsigned long __b) {
+  return __builtin_s390_verllh(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_rli(vector signed int __a, unsigned long __b) {
+  return (vector signed int)__builtin_s390_verllf(
+    (vector unsigned int)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_rli(vector unsigned int __a, unsigned long __b) {
+  return __builtin_s390_verllf(__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_rli(vector signed long long __a, unsigned long __b) {
+  return (vector signed long long)__builtin_s390_verllg(
+    (vector unsigned long long)__a, (int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_rli(vector unsigned long long __a, unsigned long __b) {
+  return __builtin_s390_verllg(__a, (int)__b);
+}
+
+/*-- vec_rl_mask ------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_rl_mask(vector signed char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned char
+vec_rl_mask(vector unsigned char __a, vector unsigned char __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed short
+vec_rl_mask(vector signed short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned short
+vec_rl_mask(vector unsigned short __a, vector unsigned short __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed int
+vec_rl_mask(vector signed int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned int
+vec_rl_mask(vector unsigned int __a, vector unsigned int __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector signed long long
+vec_rl_mask(vector signed long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
+extern __ATTRS_o vector unsigned long long
+vec_rl_mask(vector unsigned long long __a, vector unsigned long long __b,
+            unsigned char __c) __constant(__c);
+
+#define vec_rl_mask(X, Y, Z) ((__typeof__((vec_rl_mask)((X), (Y), (Z)))) \
+  __extension__ ({ \
+    vector unsigned char __res; \
+    vector unsigned char __x = (vector unsigned char)(X); \
+    vector unsigned char __y = (vector unsigned char)(Y); \
+    switch (sizeof ((X)[0])) { \
+    case 1: __res = (vector unsigned char) __builtin_s390_verimb( \
+             (vector unsigned char)__x, (vector unsigned char)__x, \
+             (vector unsigned char)__y, (Z)); break; \
+    case 2: __res = (vector unsigned char) __builtin_s390_verimh( \
+             (vector unsigned short)__x, (vector unsigned short)__x, \
+             (vector unsigned short)__y, (Z)); break; \
+    case 4: __res = (vector unsigned char) __builtin_s390_verimf( \
+             (vector unsigned int)__x, (vector unsigned int)__x, \
+             (vector unsigned int)__y, (Z)); break; \
+    default: __res = (vector unsigned char) __builtin_s390_verimg( \
+             (vector unsigned long long)__x, (vector unsigned long long)__x, \
+             (vector unsigned long long)__y, (Z)); break; \
+    } __res; }))
+
+/*-- vec_sll ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sll(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sll(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sll(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sll(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sll(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sll(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sll(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sll(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sll(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sll(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sll(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sll(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_slb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_slb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vslb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vslb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_slb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vslb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_slb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_slb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_slb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_slb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_slb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_slb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_slb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_sld ----------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sld(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned char
+vec_sld(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed short
+vec_sld(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned short
+vec_sld(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed int
+vec_sld(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned int
+vec_sld(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector signed long long
+vec_sld(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector unsigned long long
+vec_sld(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
+extern __ATTRS_o vector double
+vec_sld(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 15);
+
+#define vec_sld(X, Y, Z) ((__typeof__((vec_sld)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z)))
+
+/*-- vec_sldw ---------------------------------------------------------------*/
+
+extern __ATTRS_o vector signed char
+vec_sldw(vector signed char __a, vector signed char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned char
+vec_sldw(vector unsigned char __a, vector unsigned char __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed short
+vec_sldw(vector signed short __a, vector signed short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned short
+vec_sldw(vector unsigned short __a, vector unsigned short __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed int
+vec_sldw(vector signed int __a, vector signed int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned int
+vec_sldw(vector unsigned int __a, vector unsigned int __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector signed long long
+vec_sldw(vector signed long long __a, vector signed long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector unsigned long long
+vec_sldw(vector unsigned long long __a, vector unsigned long long __b, int __c)
+  __constant_range(__c, 0, 3);
+
+extern __ATTRS_o vector double
+vec_sldw(vector double __a, vector double __b, int __c)
+  __constant_range(__c, 0, 3);
+
+#define vec_sldw(X, Y, Z) ((__typeof__((vec_sldw)((X), (Y), (Z)))) \
+  __builtin_s390_vsldb((vector unsigned char)(X), \
+                       (vector unsigned char)(Y), (Z) * 4))
+
+/*-- vec_sral ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_sral(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_sral(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsra(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sral(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsra(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_sral(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_sral(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_sral(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_sral(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_sral(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sral(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_sral(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_sral(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sral(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsra(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srab ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srab(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrab(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrab(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srab(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrab(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srab(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srab(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srab(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srab(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srab(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srab(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srab(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srl ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned short __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srl(vector signed char __a, vector unsigned int __b) {
+  return (vector signed char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned short __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_srl(vector bool char __a, vector unsigned int __b) {
+  return (vector bool char)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrl(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned short __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srl(vector unsigned char __a, vector unsigned int __b) {
+  return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned char __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srl(vector signed short __a, vector unsigned int __b) {
+  return (vector signed short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned char __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_srl(vector bool short __a, vector unsigned int __b) {
+  return (vector bool short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned char __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srl(vector unsigned short __a, vector unsigned int __b) {
+  return (vector unsigned short)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned char __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned short __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srl(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned char __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned short __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_srl(vector bool int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned char __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned short __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srl(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned char __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned short __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srl(vector signed long long __a, vector unsigned int __b) {
+  return (vector signed long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned char __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned short __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_srl(vector bool long long __a, vector unsigned int __b) {
+  return (vector bool long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned char __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned short __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srl(vector unsigned long long __a, vector unsigned int __b) {
+  return (vector unsigned long long)__builtin_s390_vsrl(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_srb ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector signed char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_srb(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector signed char __b) {
+  return __builtin_s390_vsrlb(__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_srb(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsrlb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector signed short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_srb(vector signed short __a, vector unsigned short __b) {
+  return (vector signed short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector signed short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_srb(vector unsigned short __a, vector unsigned short __b) {
+  return (vector unsigned short)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector signed int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_srb(vector signed int __a, vector unsigned int __b) {
+  return (vector signed int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector signed int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_srb(vector unsigned int __a, vector unsigned int __b) {
+  return (vector unsigned int)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector signed long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_srb(vector signed long long __a, vector unsigned long long __b) {
+  return (vector signed long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector signed long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_srb(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector unsigned long long)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector signed long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_srb(vector double __a, vector unsigned long long __b) {
+  return (vector double)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+/*-- vec_abs ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_abs(vector signed char __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed char)0));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_abs(vector signed short __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed short)0));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_abs(vector signed int __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed int)0));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_abs(vector signed long long __a) {
+  return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed long long)0));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_abs(vector double __a) {
+  return __builtin_s390_vflpdb(__a);
+}
+
+/*-- vec_nabs ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_nabs(vector double __a) {
+  return __builtin_s390_vflndb(__a);
+}
+
+/*-- vec_max ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector signed char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_max(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_max(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector signed short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_max(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_max(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector signed int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_max(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_max(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_max(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_max(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_max(vector double __a, vector double __b) {
+  return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+}
+
+/*-- vec_min ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector signed char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector signed char __a, vector bool char __b) {
+  vector signed char __bc = (vector signed char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_min(vector bool char __a, vector signed char __b) {
+  vector signed char __ac = (vector signed char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector unsigned char __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector unsigned char __a, vector bool char __b) {
+  vector unsigned char __bc = (vector unsigned char)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_min(vector bool char __a, vector unsigned char __b) {
+  vector unsigned char __ac = (vector unsigned char)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector signed short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector signed short __a, vector bool short __b) {
+  vector signed short __bc = (vector signed short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_min(vector bool short __a, vector signed short __b) {
+  vector signed short __ac = (vector signed short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector unsigned short __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector unsigned short __a, vector bool short __b) {
+  vector unsigned short __bc = (vector unsigned short)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_min(vector bool short __a, vector unsigned short __b) {
+  vector unsigned short __ac = (vector unsigned short)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector signed int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector signed int __a, vector bool int __b) {
+  vector signed int __bc = (vector signed int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_min(vector bool int __a, vector signed int __b) {
+  vector signed int __ac = (vector signed int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector unsigned int __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector unsigned int __a, vector bool int __b) {
+  vector unsigned int __bc = (vector unsigned int)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_min(vector bool int __a, vector unsigned int __b) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector signed long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector signed long long __a, vector bool long long __b) {
+  vector signed long long __bc = (vector signed long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_min(vector bool long long __a, vector signed long long __b) {
+  vector signed long long __ac = (vector signed long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector unsigned long long __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector unsigned long long __a, vector bool long long __b) {
+  vector unsigned long long __bc = (vector unsigned long long)__b;
+  return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_min(vector bool long long __a, vector unsigned long long __b) {
+  vector unsigned long long __ac = (vector unsigned long long)__a;
+  return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
+}
+
+static inline __ATTRS_o_ai vector double
+vec_min(vector double __a, vector double __b) {
+  return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+}
+
+/*-- vec_add_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_add_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaq(__a, __b);
+}
+
+/*-- vec_addc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_addc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_addc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vacch(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_addc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vaccf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_addc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vaccg(__a, __b);
+}
+
+/*-- vec_addc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vaccq(__a, __b);
+}
+
+/*-- vec_adde_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_adde_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vacq(__a, __b, __c);
+}
+
+/*-- vec_addec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_addec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vacccq(__a, __b, __c);
+}
+
+/*-- vec_avg ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_avg(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vavgb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_avg(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vavgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_avg(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vavgf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_avg(vector signed long long __a, vector signed long long __b) {
+  return __builtin_s390_vavgg(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_avg(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vavglb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_avg(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vavglh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_avg(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vavglf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_avg(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vavglg(__a, __b);
+}
+
+/*-- vec_checksum -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned int
+vec_checksum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vcksm(__a, __b);
+}
+
+/*-- vec_gfmsum -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vgfmb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vgfmh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vgfmf(__a, __b);
+}
+
+/*-- vec_gfmsum_128 ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vgfmg(__a, __b);
+}
+
+/*-- vec_gfmsum_accum -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_gfmsum_accum(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned short __c) {
+  return __builtin_s390_vgfmab(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_gfmsum_accum(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned int __c) {
+  return __builtin_s390_vgfmah(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_gfmsum_accum(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned long long __c) {
+  return __builtin_s390_vgfmaf(__a, __b, __c);
+}
+
+/*-- vec_gfmsum_accum_128 ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_gfmsum_accum_128(vector unsigned long long __a,
+                     vector unsigned long long __b,
+                     vector unsigned char __c) {
+  return __builtin_s390_vgfmag(__a, __b, __c);
+}
+
+/*-- vec_mladd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector unsigned char __a, vector signed char __b,
+          vector signed char __c) {
+  return (vector signed char)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_mladd(vector signed char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * (vector signed char)__b + (vector signed char)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mladd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector unsigned short __a, vector signed short __b,
+          vector signed short __c) {
+  return (vector signed short)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mladd(vector signed short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * (vector signed short)__b + (vector signed short)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mladd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector unsigned int __a, vector signed int __b,
+          vector signed int __c) {
+  return (vector signed int)__a * __b + __c;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mladd(vector signed int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * (vector signed int)__b + (vector signed int)__c;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mladd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __a * __b + __c;
+}
+
+/*-- vec_mhadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mhadd(vector signed char __a, vector signed char __b,
+          vector signed char __c) {
+  return __builtin_s390_vmahb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mhadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return __builtin_s390_vmalhb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mhadd(vector signed short __a, vector signed short __b,
+          vector signed short __c) {
+  return __builtin_s390_vmahh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mhadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalhh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mhadd(vector signed int __a, vector signed int __b,
+          vector signed int __c) {
+  return __builtin_s390_vmahf(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mhadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmalhf(__a, __b, __c);
+}
+
+/*-- vec_meadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_meadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaeb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_meadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmaleb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_meadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaeh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_meadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaleh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_meadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaef(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_meadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalef(__a, __b, __c);
+}
+
+/*-- vec_moadd --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_moadd(vector signed char __a, vector signed char __b,
+          vector signed short __c) {
+  return __builtin_s390_vmaob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_moadd(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned short __c) {
+  return __builtin_s390_vmalob(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_moadd(vector signed short __a, vector signed short __b,
+          vector signed int __c) {
+  return __builtin_s390_vmaoh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_moadd(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned int __c) {
+  return __builtin_s390_vmaloh(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_moadd(vector signed int __a, vector signed int __b,
+          vector signed long long __c) {
+  return __builtin_s390_vmaof(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_moadd(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned long long __c) {
+  return __builtin_s390_vmalof(__a, __b, __c);
+}
+
+/*-- vec_mulh ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_mulh(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_mulh(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlhb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulh(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulh(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmlhh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulh(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmhf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulh(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlhf(__a, __b);
+}
+
+/*-- vec_mule ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mule(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mule(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmleb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mule(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mule(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmleh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mule(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmef(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mule(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlef(__a, __b);
+}
+
+/*-- vec_mulo ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed short
+vec_mulo(vector signed char __a, vector signed char __b) {
+  return __builtin_s390_vmob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_mulo(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vmlob(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_mulo(vector signed short __a, vector signed short __b) {
+  return __builtin_s390_vmoh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_mulo(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vmloh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_mulo(vector signed int __a, vector signed int __b) {
+  return __builtin_s390_vmof(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_mulo(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vmlof(__a, __b);
+}
+
+/*-- vec_sub_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sub_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsq(__a, __b);
+}
+
+/*-- vec_subc ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_subc(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbib(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_subc(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vscbih(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_subc(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vscbif(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_subc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vscbig(__a, __b);
+}
+
+/*-- vec_subc_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subc_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vscbiq(__a, __b);
+}
+
+/*-- vec_sube_u128 ----------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_sube_u128(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vsbiq(__a, __b, __c);
+}
+
+/*-- vec_subec_u128 ---------------------------------------------------------*/
+
+static inline __ATTRS_ai vector unsigned char
+vec_subec_u128(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vsbcbiq(__a, __b, __c);
+}
+
+/*-- vec_sum2 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumgh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_sum2(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumgf(__a, __b);
+}
+
+/*-- vec_sum_u128 -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vsumqf(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_sum_u128(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vsumqg(__a, __b);
+}
+
+/*-- vec_sum4 ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vsumb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_sum4(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vsumh(__a, __b);
+}
+
+/*-- vec_test_mask ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vtm(__a, __b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector signed long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector unsigned long long __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai int
+vec_test_mask(vector double __a, vector unsigned long long __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+
+/*-- vec_madd ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_madd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmadb(__a, __b, __c);
+}
+
+/*-- vec_msub ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_msub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfmsdb(__a, __b, __c);
+}
+
+/*-- vec_sqrt ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_sqrt(vector double __a) {
+  return __builtin_s390_vfsqdb(__a);
+}
+
+/*-- vec_ld2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ld2f(const float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  return __builtin_convertvector(*(const __v2f32 *)__ptr, vector double);
+}
+
+/*-- vec_st2f ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai void
+vec_st2f(vector double __a, float *__ptr) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  *(__v2f32 *)__ptr = __builtin_convertvector(__a, __v2f32);
+}
+
+/*-- vec_ctd ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector signed long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+static inline __ATTRS_o_ai vector double
+vec_ctd(vector unsigned long long __a, int __b)
+  __constant_range(__b, 0, 31) {
+  vector double __conv = __builtin_convertvector(__a, vector double);
+  __conv *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
+  return __conv;
+}
+
+/*-- vec_ctsl ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_ctsl(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+/*-- vec_ctul ---------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_ctul(vector double __a, int __b)
+  __constant_range(__b, 0, 31) {
+  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+
+/*-- vec_roundp -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundp(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_ceil ---------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_ceil(vector double __a) {
+  // On this platform, vec_ceil never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 6);
+}
+
+/*-- vec_roundm -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundm(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_floor --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_floor(vector double __a) {
+  // On this platform, vec_floor never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 7);
+}
+
+/*-- vec_roundz -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundz(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_trunc --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_trunc(vector double __a) {
+  // On this platform, vec_trunc never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 4, 5);
+}
+
+/*-- vec_roundc -------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_roundc(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 0);
+}
+
+/*-- vec_round --------------------------------------------------------------*/
+
+static inline __ATTRS_ai vector double
+vec_round(vector double __a) {
+  return __builtin_s390_vfidb(__a, 4, 4);
+}
+
+/*-- vec_fp_test_data_class -------------------------------------------------*/
+
+#define vec_fp_test_data_class(X, Y, Z) \
+  ((vector bool long long)__builtin_s390_vftcidb((X), (Y), (Z)))
+
+/*-- vec_cp_until_zero ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero(vector signed char __a) {
+  return (vector signed char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero(vector bool char __a) {
+  return (vector bool char)__builtin_s390_vistrb((vector unsigned char)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero(vector unsigned char __a) {
+  return __builtin_s390_vistrb(__a);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero(vector signed short __a) {
+  return (vector signed short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero(vector bool short __a) {
+  return (vector bool short)__builtin_s390_vistrh((vector unsigned short)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero(vector unsigned short __a) {
+  return __builtin_s390_vistrh(__a);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero(vector signed int __a) {
+  return (vector signed int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero(vector bool int __a) {
+  return (vector bool int)__builtin_s390_vistrf((vector unsigned int)__a);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero(vector unsigned int __a) {
+  return __builtin_s390_vistrf(__a);
+}
+
+/*-- vec_cp_until_zero_cc ---------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cp_until_zero_cc(vector signed char __a, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_cp_until_zero_cc(vector bool char __a, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vistrbs((vector unsigned char)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cp_until_zero_cc(vector unsigned char __a, int *__cc) {
+  return __builtin_s390_vistrbs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cp_until_zero_cc(vector signed short __a, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cp_until_zero_cc(vector bool short __a, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vistrhs((vector unsigned short)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cp_until_zero_cc(vector unsigned short __a, int *__cc) {
+  return __builtin_s390_vistrhs(__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cp_until_zero_cc(vector signed int __a, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vistrfs((vector unsigned int)__a, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cp_until_zero_cc(vector bool int __a, int *__cc) {
+  return (vector bool int)__builtin_s390_vistrfs((vector unsigned int)__a,
+                                                 __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cp_until_zero_cc(vector unsigned int __a, int *__cc) {
+  return __builtin_s390_vistrfs(__a, __cc);
+}
+
+/*-- vec_cmpeq_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeeb((vector unsigned char)__a,
+                         (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeeb((vector unsigned char)__a,
+                              (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeeb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeeh((vector unsigned short)__a,
+                         (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeeh((vector unsigned short)__a,
+                              (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeeh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeef((vector unsigned int)__a,
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeef((vector unsigned int)__a,
+                              (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeef(__a, __b);
+}
+
+/*-- vec_cmpeq_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfeebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfeehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfeefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpeq_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeezb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeezb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeezh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeezh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfeezf((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfeezf((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfeezf(__a, __b);
+}
+
+/*-- vec_cmpeq_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpeq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfeezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfeezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpeq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpeq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfeezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfeezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpeq_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpeq_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfeezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfeezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpeq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfeezfs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfeneb((vector unsigned char)__a,
+                          (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfeneb((vector unsigned char)__a,
+                               (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfeneb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfeneh((vector unsigned short)__a,
+                          (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfeneh((vector unsigned short)__a,
+                               (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfeneh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenef((vector unsigned int)__a,
+                          (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenef((vector unsigned int)__a,
+                               (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenef(__a, __b);
+}
+
+/*-- vec_cmpne_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_idx_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenebs((vector unsigned char)__a,
+                           (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenebs((vector unsigned char)__a,
+                                (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 int *__cc) {
+  return __builtin_s390_vfenebs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_idx_cc(vector signed short __a, vector signed short __b, int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenehs((vector unsigned short)__a,
+                           (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenehs((vector unsigned short)__a,
+                                (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 int *__cc) {
+  return __builtin_s390_vfenehs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenefs((vector unsigned int)__a,
+                           (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenefs((vector unsigned int)__a,
+                                (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_idx_cc(vector unsigned int __a, vector unsigned int __b, int *__cc) {
+  return __builtin_s390_vfenefs(__a, __b, __cc);
+}
+
+/*-- vec_cmpne_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfenezb((vector unsigned char)__a,
+                           (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfenezb((vector unsigned char)__a,
+                                (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfenezb(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfenezh((vector unsigned short)__a,
+                           (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfenezh((vector unsigned short)__a,
+                                (vector unsigned short)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfenezh(__a, __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfenezf((vector unsigned int)__a,
+                           (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfenezf((vector unsigned int)__a,
+                                (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfenezf(__a, __b);
+}
+
+/*-- vec_cmpne_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_cmpne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                      int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfenezbs((vector unsigned char)__a,
+                            (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfenezbs((vector unsigned char)__a,
+                                 (vector unsigned char)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezbs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_cmpne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                      int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfenezhs((vector unsigned short)__a,
+                            (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return __builtin_s390_vfenezhs((vector unsigned short)__a,
+                                 (vector unsigned short)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpne_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezhs(__a, __b, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_cmpne_or_0_idx_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfenezfs((vector unsigned int)__a,
+                            (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfenezfs((vector unsigned int)__a,
+                                 (vector unsigned int)__b, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      int *__cc) {
+  return __builtin_s390_vfenezfs(__a, __b, __cc);
+}
+
+/*-- vec_cmprg --------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg(vector unsigned char __a, vector unsigned char __b,
+          vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg(vector unsigned short __a, vector unsigned short __b,
+          vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg(vector unsigned int __a, vector unsigned int __b,
+          vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 4);
+}
+
+/*-- vec_cmprg_cc -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmprg_cc(vector unsigned char __a, vector unsigned char __b,
+             vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmprg_cc(vector unsigned short __a, vector unsigned short __b,
+             vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmprg_cc(vector unsigned int __a, vector unsigned int __b,
+             vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 4, __cc);
+}
+
+/*-- vec_cmprg_idx ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_idx_cc -------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                 vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                 vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                 vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmprg_or_0_idx -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                   vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                   vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                   vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 0);
+}
+
+/*-- vec_cmprg_or_0_idx_cc --------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmprg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                      vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmprg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                      vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmprg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                      vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 0, __cc);
+}
+
+/*-- vec_cmpnrg -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg(vector unsigned char __a, vector unsigned char __b,
+           vector unsigned char __c) {
+  return (vector bool char)__builtin_s390_vstrcb(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg(vector unsigned short __a, vector unsigned short __b,
+           vector unsigned short __c) {
+  return (vector bool short)__builtin_s390_vstrch(__a, __b, __c, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg(vector unsigned int __a, vector unsigned int __b,
+           vector unsigned int __c) {
+  return (vector bool int)__builtin_s390_vstrcf(__a, __b, __c, 12);
+}
+
+/*-- vec_cmpnrg_cc ----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_cmpnrg_cc(vector unsigned char __a, vector unsigned char __b,
+              vector unsigned char __c, int *__cc) {
+  return (vector bool char)__builtin_s390_vstrcbs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_cmpnrg_cc(vector unsigned short __a, vector unsigned short __b,
+              vector unsigned short __c, int *__cc) {
+  return (vector bool short)__builtin_s390_vstrchs(__a, __b, __c, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_cmpnrg_cc(vector unsigned int __a, vector unsigned int __b,
+              vector unsigned int __c, int *__cc) {
+  return (vector bool int)__builtin_s390_vstrcfs(__a, __b, __c, 12, __cc);
+}
+
+/*-- vec_cmpnrg_idx ---------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx(vector unsigned char __a, vector unsigned char __b,
+               vector unsigned char __c) {
+  return __builtin_s390_vstrcb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx(vector unsigned short __a, vector unsigned short __b,
+               vector unsigned short __c) {
+  return __builtin_s390_vstrch(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx(vector unsigned int __a, vector unsigned int __b,
+               vector unsigned int __c) {
+  return __builtin_s390_vstrcf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_idx_cc ------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                  vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrcbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                  vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrchs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                  vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrcfs(__a, __b, __c, 8, __cc);
+}
+
+/*-- vec_cmpnrg_or_0_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx(vector unsigned char __a, vector unsigned char __b,
+                    vector unsigned char __c) {
+  return __builtin_s390_vstrczb(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx(vector unsigned short __a, vector unsigned short __b,
+                    vector unsigned short __c) {
+  return __builtin_s390_vstrczh(__a, __b, __c, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx(vector unsigned int __a, vector unsigned int __b,
+                    vector unsigned int __c) {
+  return __builtin_s390_vstrczf(__a, __b, __c, 8);
+}
+
+/*-- vec_cmpnrg_or_0_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_cmpnrg_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       vector unsigned char __c, int *__cc) {
+  return __builtin_s390_vstrczbs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_cmpnrg_or_0_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       vector unsigned short __c, int *__cc) {
+  return __builtin_s390_vstrczhs(__a, __b, __c, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_cmpnrg_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       vector unsigned int __c, int *__cc) {
+  return __builtin_s390_vstrczfs(__a, __b, __c, 8, __cc);
+}
+
+/*-- vec_find_any_eq --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 4);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 4);
+}
+
+/*-- vec_find_any_eq_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_eq_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_eq_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 4, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_eq_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 4, __cc);
+}
+
+/*-- vec_find_any_eq_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_eq_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 0);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 0);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 0);
+}
+
+/*-- vec_find_any_eq_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_eq_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_eq_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_eq_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_eq_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_eq_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 0, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_eq_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 0, __cc);
+}
+
+/*-- vec_find_any_ne --------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector signed char __a, vector signed char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector bool char __a, vector bool char __b) {
+  return (vector bool char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne(vector unsigned char __a, vector unsigned char __b) {
+  return (vector bool char)__builtin_s390_vfaeb(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector signed short __a, vector signed short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector bool short __a, vector bool short __b) {
+  return (vector bool short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne(vector unsigned short __a, vector unsigned short __b) {
+  return (vector bool short)__builtin_s390_vfaeh(__a, __b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector signed int __a, vector signed int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector bool int __a, vector bool int __b) {
+  return (vector bool int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 12);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne(vector unsigned int __a, vector unsigned int __b) {
+  return (vector bool int)__builtin_s390_vfaef(__a, __b, 12);
+}
+
+/*-- vec_find_any_ne_cc -----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector signed char __a, vector signed char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return (vector bool char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool char
+vec_find_any_ne_cc(vector unsigned char __a, vector unsigned char __b,
+                   int *__cc) {
+  return (vector bool char)__builtin_s390_vfaebs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector signed short __a, vector signed short __b,
+                   int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector bool short __a, vector bool short __b, int *__cc) {
+  return (vector bool short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_find_any_ne_cc(vector unsigned short __a, vector unsigned short __b,
+                   int *__cc) {
+  return (vector bool short)__builtin_s390_vfaehs(__a, __b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector signed int __a, vector signed int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return (vector bool int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 12, __cc);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_find_any_ne_cc(vector unsigned int __a, vector unsigned int __b,
+                   int *__cc) {
+  return (vector bool int)__builtin_s390_vfaefs(__a, __b, 12, __cc);
+}
+
+/*-- vec_find_any_ne_idx ----------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaeb((vector unsigned char)__a,
+                         (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaeb((vector unsigned char)__a,
+                              (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaeb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaeh((vector unsigned short)__a,
+                         (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaeh((vector unsigned short)__a,
+                              (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaeh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaef((vector unsigned int)__a,
+                         (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaef((vector unsigned int)__a,
+                              (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaef(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_idx_cc -------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_idx_cc(vector signed char __a, vector signed char __b,
+                       int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaebs((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector bool char __a, vector bool char __b, int *__cc) {
+  return __builtin_s390_vfaebs((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                       int *__cc) {
+  return __builtin_s390_vfaebs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_idx_cc(vector signed short __a, vector signed short __b,
+                       int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaehs((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector bool short __a, vector bool short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_idx_cc(vector unsigned short __a, vector unsigned short __b,
+                       int *__cc) {
+  return __builtin_s390_vfaehs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_idx_cc(vector signed int __a, vector signed int __b,
+                       int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaefs((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector bool int __a, vector bool int __b, int *__cc) {
+  return __builtin_s390_vfaefs((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                       int *__cc) {
+  return __builtin_s390_vfaefs(__a, __b, 8, __cc);
+}
+
+/*-- vec_find_any_ne_or_0_idx -----------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx(vector signed char __a, vector signed char __b) {
+  return (vector signed char)
+    __builtin_s390_vfaezb((vector unsigned char)__a,
+                          (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector bool char __a, vector bool char __b) {
+  return __builtin_s390_vfaezb((vector unsigned char)__a,
+                               (vector unsigned char)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vfaezb(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx(vector signed short __a, vector signed short __b) {
+  return (vector signed short)
+    __builtin_s390_vfaezh((vector unsigned short)__a,
+                          (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector bool short __a, vector bool short __b) {
+  return __builtin_s390_vfaezh((vector unsigned short)__a,
+                               (vector unsigned short)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx(vector unsigned short __a, vector unsigned short __b) {
+  return __builtin_s390_vfaezh(__a, __b, 8);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx(vector signed int __a, vector signed int __b) {
+  return (vector signed int)
+    __builtin_s390_vfaezf((vector unsigned int)__a,
+                          (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector bool int __a, vector bool int __b) {
+  return __builtin_s390_vfaezf((vector unsigned int)__a,
+                               (vector unsigned int)__b, 8);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx(vector unsigned int __a, vector unsigned int __b) {
+  return __builtin_s390_vfaezf(__a, __b, 8);
+}
+
+/*-- vec_find_any_ne_or_0_idx_cc --------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_find_any_ne_or_0_idx_cc(vector signed char __a, vector signed char __b,
+                            int *__cc) {
+  return (vector signed char)
+    __builtin_s390_vfaezbs((vector unsigned char)__a,
+                           (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector bool char __a, vector bool char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs((vector unsigned char)__a,
+                                (vector unsigned char)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_find_any_ne_or_0_idx_cc(vector unsigned char __a, vector unsigned char __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezbs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_find_any_ne_or_0_idx_cc(vector signed short __a, vector signed short __b,
+                            int *__cc) {
+  return (vector signed short)
+    __builtin_s390_vfaezhs((vector unsigned short)__a,
+                           (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector bool short __a, vector bool short __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezhs((vector unsigned short)__a,
+                                (vector unsigned short)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_find_any_ne_or_0_idx_cc(vector unsigned short __a,
+                            vector unsigned short __b, int *__cc) {
+  return __builtin_s390_vfaezhs(__a, __b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_find_any_ne_or_0_idx_cc(vector signed int __a, vector signed int __b,
+                            int *__cc) {
+  return (vector signed int)
+    __builtin_s390_vfaezfs((vector unsigned int)__a,
+                           (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector bool int __a, vector bool int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs((vector unsigned int)__a,
+                                (vector unsigned int)__b, 8, __cc);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_find_any_ne_or_0_idx_cc(vector unsigned int __a, vector unsigned int __b,
+                            int *__cc) {
+  return __builtin_s390_vfaezfs(__a, __b, 8, __cc);
+}
+
+#undef __constant_pow2_range
+#undef __constant_range
+#undef __constant
+#undef __ATTRS_o
+#undef __ATTRS_o_ai
+#undef __ATTRS_ai
+
+#else
+
+#error "Use -fzvector to enable vector extensions"
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/wmmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/wmmintrin.h
new file mode 100644
index 000000000..a2d931010
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/wmmintrin.h
@@ -0,0 +1,33 @@
+/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _WMMINTRIN_H
+#define _WMMINTRIN_H
+
+#include <emmintrin.h>
+
+#include <__wmmintrin_aes.h>
+
+#include <__wmmintrin_pclmul.h>
+
+#endif /* _WMMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/x86intrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/x86intrin.h
new file mode 100644
index 000000000..81a404f55
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/x86intrin.h
@@ -0,0 +1,85 @@
+/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#define __X86INTRIN_H
+
+#include <ia32intrin.h>
+
+#include <immintrin.h>
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__3dNOW__)
+#include <mm3dnow.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
+#include <bmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
+#include <bmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
+#include <lzcntintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
+#include <popcntintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
+#include <rdseedintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
+#include <prfchwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE4A__)
+#include <ammintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA4__)
+#include <fma4intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XOP__)
+#include <xopintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__TBM__)
+#include <tbmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
+#include <f16cintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
+#include <mwaitxintrin.h>
+#endif
+
+/* FIXME: LWP */
+
+#endif /* __X86INTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xmmintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xmmintrin.h
new file mode 100644
index 000000000..dc31b85cf
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xmmintrin.h
@@ -0,0 +1,2972 @@
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __XMMINTRIN_H
+#define __XMMINTRIN_H
+
+#include <mmintrin.h>
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef float __m128 __attribute__((__vector_size__(16)));
+
+/* Unsigned types */
+typedef unsigned int __v4su __attribute__((__vector_size__(16)));
+
+/* This header should only be included in a hosted environment as it depends on
+ * a standard library to provide allocation routines. */
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
+
+/// \brief Adds the 32-bit float values in the low-order bits of the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
+///    of the lower 32 bits of both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ss(__m128 __a, __m128 __b)
+{
+  __a[0] += __b[0];
+  return __a;
+}
+
+/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of
+///    the addition.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the sums of both
+///    operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4sf)__a + (__v4sf)__b);
+}
+
+/// \brief Subtracts the 32-bit float value in the low-order bits of the second
+///    operand from the corresponding value in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
+///    of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
+///    bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    difference of the lower 32 bits of both operands. The upper 96 bits are
+///    copied from the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ss(__m128 __a, __m128 __b)
+{
+  __a[0] -= __b[0];
+  return __a;
+}
+
+/// \brief Subtracts each of the values of the second operand from the first
+///    operand, both of which are 128-bit vectors of [4 x float] and returns
+///    the results of the subtraction.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the differences between
+///    both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4sf)__a - (__v4sf)__b);
+}
+
+/// \brief Multiplies two 32-bit float values in the low-order bits of the
+///    operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The lower 32 bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the product of the lower
+///    32 bits of both operands. The upper 96 bits are copied from the upper 96
+///    bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ss(__m128 __a, __m128 __b)
+{
+  __a[0] *= __b[0];
+  return __a;
+}
+
+/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the
+///    results of the multiplication.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the products of both
+///    operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4sf)__a * (__v4sf)__b);
+}
+
+/// \brief Divides the value in the low-order 32 bits of the first operand by
+///    the corresponding value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
+///    bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
+///    of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the quotients of the
+///    lower 32 bits of both operands. The upper 96 bits are copied from the
+///    upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ss(__m128 __a, __m128 __b)
+{
+  __a[0] /= __b[0];
+  return __a;
+}
+
+/// \brief Divides two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the divisor.
+/// \returns A 128-bit vector of [4 x float] containing the quotients of both
+///    operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4sf)__a / (__v4sf)__b);
+}
+
+/// \brief Calculates the square root of the value stored in the low-order bits
+///    of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the square root of the
+///    value in the low-order bits of the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+/// \brief Calculates the square roots of the values stored in a 128-bit vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the square roots of the
+///    values in the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_sqrtps((__v4sf)__a);
+}
+
+/// \brief Calculates the approximate reciprocal of the value stored in the
+///    low-order bits of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocal of the value in the low-order bits of the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rcpss((__v4sf)__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+/// \brief Calculates the approximate reciprocals of the values stored in a
+///    128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocals of the values in the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ps(__m128 __a)
+{
+  return __builtin_ia32_rcpps((__v4sf)__a);
+}
+
+/// \brief Calculates the approximate reciprocal of the square root of the value
+///    stored in the low-order bits of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocal of the square root of the value in the low-order bits of the
+///    operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ss(__m128 __a)
+{
+  __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a);
+  return (__m128) { __c[0], __a[1], __a[2], __a[3] };
+}
+
+/// \brief Calculates the approximate reciprocals of the square roots of the
+///    values stored in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+///    reciprocals of the square roots of the values in the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ps(__m128 __a)
+{
+  return __builtin_ia32_rsqrtps((__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands and returns the lesser value in the low-order bits of the
+///    vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    minimum value between both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 128-bit vectors of [4 x float] and returns the lesser
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \returns A 128-bit vector of [4 x float] containing the minimum values
+///    between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands and returns the greater value in the low-order bits of a 128-bit
+///    vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    maximum value between both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater
+///    of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands.
+/// \returns A 128-bit vector of [4 x float] containing the maximum values
+///    between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector containing one of the source operands.
+/// \param __b
+///    A 128-bit vector containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
+///    values between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_and_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4su)__a & (__v4su)__b);
+}
+
+/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using
+///    the one's complement of the values contained in the first source
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the first source operand. The
+///    one's complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the second source operand.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
+///    one's complement of the first operand and the values in the second
+///    operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_andnot_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)(~(__v4su)__a & (__v4su)__b);
+}
+
+/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
+///    values between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_or_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4su)__a | (__v4su)__b);
+}
+
+/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
+///    of the values between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_xor_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)((__v4su)__a ^ (__v4su)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for equality and returns the result of the comparison in the
+///    low-order bits of a vector [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is less than the
+///    corresponding value in the second operand and returns the result of the
+///    comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is less than or
+///    equal to the corresponding value in the second operand and returns the
+///    result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is greater than
+///    the corresponding value in the second operand and returns the result of
+///    the comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
+                                         4, 1, 2, 3);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is greater than
+///    or equal to the corresponding value in the second operand and returns
+///    the result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
+                                         4, 1, 2, 3);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for inequality and returns the result of the comparison in the
+///    low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] for inequality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not less than
+///    the corresponding value in the second operand and returns the result of
+///    the comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not less than
+///    or equal to the corresponding value in the second operand and returns
+///    the result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not greater
+///    than the corresponding value in the second operand and returns the
+///    result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
+                                         4, 1, 2, 3);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not greater
+///    than or equal to the corresponding value in the second operand and
+///    returns the result of the comparison in the low-order bits of a vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_shufflevector((__v4sf)__a,
+                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
+                                         4, 1, 2, 3);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is ordered with
+///    respect to the corresponding value in the second operand and returns the
+///    result of the comparison in the low-order bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are ordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is unordered
+///    with respect to the corresponding value in the second operand and
+///    returns the result of the comparison in the low-order bits of a vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the operands. The lower
+///    32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+///    in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ss(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares each of the corresponding 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are unordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ps(__m128 __a, __m128 __b)
+{
+  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for equality and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is less than the second
+///    operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is less than or equal to the
+///    second operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is greater than the second
+///    operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is greater than or equal to
+///    the second operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is not equal to the second
+///    operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine equality and returns
+///    the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    less than the second operand and returns the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    less than or equal to the second operand and returns the result of the
+///    comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    greater than the second operand and returns the result of the
+///    comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine if the first operand is
+///    greater than or equal to the second operand and returns the result of
+///    the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Performs an unordered comparison of two 32-bit float values using
+///    the low-order bits of both operands to determine inequality and returns
+///    the result of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the comparison.
+/// \returns An integer containing the comparison results.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtss_si32(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si((__v4sf)__a);
+}
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvt_ss2si(__m128 __a)
+{
+  return _mm_cvtss_si32(__a);
+}
+
+#ifdef __x86_64__
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 64-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtss_si64(__m128 __a)
+{
+  return __builtin_ia32_cvtss2si64((__v4sf)__a);
+}
+
+#endif
+
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
+}
+
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvt_ps2pi(__m128 __a)
+{
+  return _mm_cvtps_pi32(__a);
+}
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer, truncating the result when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttss_si32(__m128 __a)
+{
+  return __builtin_ia32_cvttss2si((__v4sf)__a);
+}
+
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer, truncating the result when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtt_ss2si(__m128 __a)
+{
+  return _mm_cvttss_si32(__a);
+}
+
+#ifdef __x86_64__
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 64-bit integer, truncating the result when it is
+///    inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttss_si64(__m128 __a)
+{
+  return __builtin_ia32_cvttss2si64((__v4sf)__a);
+}
+#endif
+
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
+///    when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
+///   instructions.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvttps_pi32(__m128 __a)
+{
+  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
+}
+
+/// \brief Converts two low-order float values in a 128-bit vector of [4 x
+///    float] into a 64-bit vector of [2 x i32], truncating the result when it
+///    is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtt_ps2pi(__m128 __a)
+{
+  return _mm_cvttps_pi32(__a);
+}
+
+/// \brief Converts a 32-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination vector are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 32-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied from
+///    the upper 96 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_ss(__m128 __a, int __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+/// \brief Converts a 32-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 32-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied from
+///    the upper 96 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvt_si2ss(__m128 __a, int __b)
+{
+  return _mm_cvtsi32_ss(__a, __b);
+}
+
+#ifdef __x86_64__
+
+/// \brief Converts a 64-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied from
+///    the upper 96 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_ss(__m128 __a, long long __b)
+{
+  __a[0] = __b;
+  return __a;
+}
+
+#endif
+
+/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
+///    floating point values and writes them to the lower 64-bits of the
+///    destination. The remaining higher order elements of the destination are
+///    copied from the corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
+///    and written to the corresponding low-order elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted value of the second operand. The upper 64 bits are copied from
+///    the upper 64 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi32_ps(__m128 __a, __m64 __b)
+{
+  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
+}
+
+/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two
+///    floating point values and writes them to the lower 64-bits of the
+///    destination. The remaining higher order elements of the destination are
+///    copied from the corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
+///    and written to the corresponding low-order elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted value from the second operand. The upper 64 bits are copied
+///    from the upper 64 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvt_pi2ps(__m128 __a, __m64 __b)
+{
+  return _mm_cvtpi32_ps(__a, __b);
+}
+
+/// \brief Extracts a float value contained in the lower 32 bits of a vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+///    used in the extraction.
+/// \returns A 32-bit float containing the extracted value.
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm_cvtss_f32(__m128 __a)
+{
+  return __a[0];
+}
+
+/// \brief Loads two packed float values from the address \a __p into the
+///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
+///     are copied from the low-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
+///    of the destination.
+/// \param __p
+///    A pointer to two packed float values. Bits [63:0] are written to bits
+///    [127:64] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadh_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadh_pi_struct {
+    __mm_loadh_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
+}
+
+/// \brief Loads two packed float values from the address \a __p into the
+///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
+///    are copied from the high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
+///    [127:64] of the destination.
+/// \param __p
+///    A pointer to two packed float values. Bits [63:0] are written to bits
+///    [63:0] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadl_pi(__m128 __a, const __m64 *__p)
+{
+  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+  struct __mm_loadl_pi_struct {
+    __mm_loadl_pi_v2f32 __u;
+  } __attribute__((__packed__, __may_alias__));
+  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
+  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    32 bits of the vector are initialized with the single-precision
+///    floating-point value loaded from a specified memory location. The upper
+///    96 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location containing a single-precision
+///    floating-point value.
+/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
+///    lower 32 bits contain the value loaded from the memory location. The
+///    upper 96 bits are set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ss(const float *__p)
+{
+  struct __mm_load_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
+  return (__m128){ __u, 0, 0, 0 };
+}
+
+/// \brief Loads a 32-bit float value and duplicates it to all four vector
+///    elements of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a float value to be loaded and duplicated.
+/// \returns A 128-bit vector of [4 x float] containing the loaded and
+///    duplicated values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load1_ps(const float *__p)
+{
+  struct __mm_load1_ps_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
+  return (__m128){ __u, __u, __u, __u };
+}
+
+#define        _mm_load_ps1(p) _mm_load1_ps(p)
+
+/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \returns A 128-bit vector of [4 x float] containing the loaded valus.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ps(const float *__p)
+{
+  return *(__m128*)__p;
+}
+
+/// \brief Loads a 128-bit floating-point vector of [4 x float] from an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadu_ps(const float *__p)
+{
+  struct __loadu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  return ((struct __loadu_ps*)__p)->__v;
+}
+
+/// \brief Loads four packed float values, in reverse order, from an aligned
+///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
+///    in reverse order.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadr_ps(const float *__p)
+{
+  __m128 __a = _mm_load_ps(__p);
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
+}
+
+/// \brief Create a 128-bit vector of [4 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit vector of [4 x float] containing undefined values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_undefined_ps(void)
+{
+  return (__m128)__builtin_ia32_undef128();
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    32 bits of the vector are initialized with the specified single-precision
+///    floating-point value. The upper 96 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize the lower 32
+///    bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
+///    lower 32 bits contain the value provided in the source operand. The
+///    upper 96 bits are set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ss(float __w)
+{
+  return (__m128){ __w, 0, 0, 0 };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
+///    of the four single-precision floating-point vector elements set to the
+///    specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set1_ps(float __w)
+{
+  return (__m128){ __w, __w, __w, __w };
+}
+
+/* Microsoft specific. */
+/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each
+///    of the four single-precision floating-point vector elements set to the
+///    specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
+///
+/// \param __w
+///    A single-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps1(float __w)
+{
+    return _mm_set1_ps(__w);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]
+///    initialized with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __z
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \param __y
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __x
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __w
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __w, __x, __y, __z };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float],
+///    initialized in reverse order with the specified 32-bit single-precision
+///    float-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __z
+///    A single-precision floating-point value used to initialize bits [31:0]
+///    of the result.
+/// \param __y
+///    A single-precision floating-point value used to initialize bits [63:32]
+///    of the result.
+/// \param __x
+///    A single-precision floating-point value used to initialize bits [95:64]
+///    of the result.
+/// \param __w
+///    A single-precision floating-point value used to initialize bits [127:96]
+///    of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+  return (__m128){ __z, __y, __x, __w };
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [4 x float] with
+///    all elements set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0, 0, 0, 0 };
+}
+
+/// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
+}
+
+/// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
+///     memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float values.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pi(__m64 *__p, __m128 __a)
+{
+  __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
+}
+
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
+///     memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ss(float *__p, __m128 __a)
+{
+  struct __mm_store_ss_struct {
+    float __u;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
+}
+
+/// \brief Stores a 128-bit vector of [4 x float] to an unaligned memory
+///    location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_ps(float *__p, __m128 __a)
+{
+  struct __storeu_ps {
+    __m128 __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_ps*)__p)->__v = __a;
+}
+
+/// \brief Stores a 128-bit vector of [4 x float] into an aligned memory
+///    location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 16-byte aligned.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps(float *__p, __m128 __a)
+{
+  *(__m128*)__p = __a;
+}
+
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+///    four contiguous elements in an aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+///    of the four contiguous elements pointed by \a __p.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
+  _mm_store_ps(__p, __a);
+}
+
+/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+///    four contiguous elements in an aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location.
+/// \param __a
+///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+///    of the four contiguous elements pointed by \a __p.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps1(float *__p, __m128 __a)
+{
+  return _mm_store1_ps(__p, __a);
+}
+
+/// \brief Stores float values from a 128-bit vector of [4 x float] to an
+///    aligned memory location in reverse order.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
+///    instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit memory location. The address of the memory
+///    location has to be 128-bit aligned.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_ps(float *__p, __m128 __a)
+{
+  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
+  _mm_store_ps(__p, __a);
+}
+
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+#ifndef _MSC_VER
+/* FIXME: We have to #define this because "sel" must be a constant integer, and
+   Sema doesn't do any form of constant propagation yet. */
+
+/// \brief Loads one cache line of data from the specified address to a location
+///    closer to the processor.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _mm_prefetch(const void * a, const int sel);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
+///
+/// \param a
+///    A pointer to a memory location containing a cache line of data.
+/// \param sel
+///    A predefined integer constant specifying the type of prefetch
+///    operation: \n
+///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
+///    PREFETCHNTA instruction will be generated. \n
+///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
+///    be generated. \n
+///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
+///    be generated. \n
+///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
+///    be generated.                                   
+#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
+#endif
+
+/// \brief Stores a 64-bit integer in the specified aligned memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location used to store the register value.
+/// \param __a
+///    A 64-bit integer containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_pi(__m64 *__p, __m64 __a)
+{
+  __builtin_ia32_movntq(__p, __a);
+}
+
+/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
+///    128-bit aligned memory location. To minimize caching, the data is flagged
+///    as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit aligned memory location that will receive the
+///    integer values.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ps(float *__p, __m128 __a)
+{
+  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
+}
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// \brief Forces strong memory ordering (serialization) between store
+///    instructions preceding this instruction and store instructions following
+///    this instruction, ensuring the system completes all previous stores
+///    before executing subsequent stores.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
+///
+void _mm_sfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and
+///    returns it, as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _mm_extract_pi(__m64 a, int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param a
+///    A 64-bit vector of [4 x i16].
+/// \param n
+///    An immediate integer operand that determines which bits are extracted: \n
+///    0: Bits [15:0] are copied to the destination. \n
+///    1: Bits [31:16] are copied to the destination. \n
+///    2: Bits [47:32] are copied to the destination. \n
+///    3: Bits [63:48] are copied to the destination.
+/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
+#define _mm_extract_pi16(a, n) __extension__ ({ \
+  (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n); })
+
+/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
+///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
+///    specified by the immediate operand \a n.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _mm_insert_pi(__m64 a, int d, int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param a
+///    A 64-bit vector of [4 x i16].
+/// \param d
+///    An integer. The lower 16-bit value from this operand is written to the
+///    destination at the offset specified by operand \a n.
+/// \param n
+///    An immediate integer operant that determines which the bits to be used
+///    in the destination. \n
+///    0: Bits [15:0] are copied to the destination. \n
+///    1: Bits [31:16] are copied to the destination. \n
+///    2: Bits [47:32] are copied to the destination. \n
+///    3: Bits [63:48] are copied to the destination.  \n
+///    The remaining bits in the destination are copied from the corresponding
+///    bits in operand \a a.
+/// \returns A 64-bit integer vector containing the copied packed data from the
+///    operands.
+#define _mm_insert_pi16(a, d, n) __extension__ ({ \
+  (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n); })
+
+/// \brief Compares each of the corresponding packed 16-bit integer values of
+///    the 64-bit integer vectors, and writes the greater value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_max_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 64-bit integer vectors, and writes the greater value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_max_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief Compares each of the corresponding packed 16-bit integer values of
+///    the 64-bit integer vectors, and writes the lesser value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_min_pi16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 64-bit integer vectors, and writes the lesser value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_min_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief Takes the most significant bit from each 8-bit element in a 64-bit
+///    integer vector to create a 16-bit mask value. Zero-extends the value to
+///    32-bit integer and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bit from each 8-bit element in the operand,
+///    written to bits [15:0].
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_pi8(__m64 __a)
+{
+  return __builtin_ia32_pmovmskb((__v8qi)__a);
+}
+
+/// \brief Multiplies packed 16-bit unsigned integer values and writes the
+///    high-order 16 bits of each 32-bit product to the corresponding bits in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the products of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhi_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+///    destination, as specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
+///
+/// \param a
+///    A 64-bit integer vector containing the values to be shuffled.
+/// \param n
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from \a a. The destinations within the 64-bit destination are
+///    assigned values as follows: \n
+///    Bits [1:0] are used to assign values to bits [15:0] in the
+///    destination. \n
+///    Bits [3:2] are used to assign values to bits [31:16] in the
+///    destination. \n
+///    Bits [5:4] are used to assign values to bits [47:32] in the
+///    destination. \n
+///    Bits [7:6] are used to assign values to bits [63:48] in the
+///    destination. \n
+///    Bit value assignments: \n
+///    00: assigned from bits [15:0] of \a a. \n
+///    01: assigned from bits [31:16] of \a a. \n
+///    10: assigned from bits [47:32] of \a a. \n
+///    11: assigned from bits [63:48] of \a a.
+/// \returns A 64-bit integer vector containing the shuffled values.
+#define _mm_shuffle_pi16(a, n) __extension__ ({ \
+  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
+
+/// \brief Conditionally copies the values from each 8-bit element in the first
+///    64-bit integer vector operand to the specified memory location, as
+///    specified by the most significant bit in the corresponding element in the
+///    second 64-bit integer vector operand. To minimize caching, the data is
+///    flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
+///
+/// \param __d
+///    A 64-bit integer vector containing the values with elements to be copied.
+/// \param __n
+///    A 64-bit integer vector operand. The most significant bit from each 8-bit
+///    element determines whether the corresponding element in operand \a __d
+///    is copied. If the most significant bit of a given element is 1, the
+///    corresponding element in operand \a __d is copied.
+/// \param __p
+///    A pointer to a 64-bit memory location that will receive the conditionally
+///    copied integer values. The address of the memory location does not have
+///    to be aligned.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
+{
+  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+}
+
+/// \brief Computes the rounded averages of the packed unsigned 8-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_avg_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+}
+
+/// \brief Computes the rounded averages of the packed unsigned 16-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_avg_pu16(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// \brief Subtracts the corresponding 8-bit unsigned integer values of the two
+///    64-bit vector operands and computes the absolute value for each of the
+///    difference. Then sum of the 8 absolute differences is written to the
+///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
+///    sets of absolute differences between both operands. The upper bits are
+///    cleared.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sad_pu8(__m64 __a, __m64 __b)
+{
+  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+}
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned
+///    integer value.
+///
+///    There are several groups of macros associated with this
+///    intrinsic, including:
+///    <ul>
+///    <li>
+///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
+///      _MM_GET_EXCEPTION_STATE().
+///    </li>
+///    <li>
+///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
+///    </li>            
+///    <li>
+///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+///      _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
+///    </li>
+///    <li> 
+///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
+///    </li>
+///    <li> 
+///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+///      _MM_GET_DENORMALS_ZERO_MODE().
+///    </li>
+///    </ul>
+///
+///    For example, the expression below checks if an overflow exception has
+///    occurred:
+///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
+///
+///    The following example gets the current rounding mode:
+///      _MM_GET_ROUNDING_MODE()
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
+///
+/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
+///    register.
+unsigned int _mm_getcsr(void);
+
+/// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
+///   
+///    There are several groups of macros associated with this intrinsic,
+///    including:
+///    <ul>
+///    <li> 
+///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
+///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
+///    </li>
+///    <li>
+///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
+///      of these macros.
+///    </li>
+///    <li>
+///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
+///    </li>
+///    <li>
+///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
+///      one of these macros.
+///    </li>
+///    <li>
+///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
+///    </li>
+///    </ul>
+///
+///    For example, the following expression causes subsequent floating-point
+///    operations to round up:
+///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
+///
+///    The following example sets the DAZ and FTZ flags:
+///      void setFlags() {
+///        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
+///        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)
+///      }
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
+///
+/// \param __i
+///    A 32-bit unsigned integer value to be written to the MXCSR register.
+void _mm_setcsr(unsigned int);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
+///    specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param mask
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from \ a and \a b. \n
+///    Bits [3:0] specify the values copied from operand \a a. \n
+///    Bits [7:4] specify the values copied from operand \a b. \n
+///    The destinations within the 128-bit destination are assigned values as
+///    follows: \n
+///    Bits [1:0] are used to assign values to bits [31:0] in the
+///    destination. \n
+///    Bits [3:2] are used to assign values to bits [63:32] in the
+///    destination. \n
+///    Bits [5:4] are used to assign values to bits [95:64] in the
+///    destination. \n
+///    Bits [7:6] are used to assign values to bits [127:96] in the
+///    destination. \n
+///    Bit value assignments: \n
+///    00: Bits [31:0] copied from the specified operand. \n
+///    01: Bits [63:32] copied from the specified operand. \n
+///    10: Bits [95:64] copied from the specified operand. \n
+///    11: Bits [127:96] copied from the specified operand.
+/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
+#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
+  (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
+                                  0 + (((mask) >> 0) & 0x3), \
+                                  0 + (((mask) >> 2) & 0x3), \
+                                  4 + (((mask) >> 4) & 0x3), \
+                                  4 + (((mask) >> 6) & 0x3)); })
+
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [95:64] are written to bits [31:0] of the destination. \n
+///    Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float].
+///    Bits [95:64] are written to bits [63:32] of the destination. \n
+///    Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpackhi_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
+}
+
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [31:0] are written to bits [31:0] of the destination.  \n
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [31:0] are written to bits [63:32] of the destination. \n
+///    Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpacklo_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    32 bits are set to the lower 32 bits of the second parameter. The upper
+///    96 bits are set to the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
+///    written to the upper 96 bits of the result.
+/// \param __b
+///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
+///    written to the lower 32 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_move_ss(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    64 bits are set to the upper 64 bits of the second parameter. The upper
+///    64 bits are set to the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
+///    written to the upper 64 bits of the result.
+/// \param __b
+///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
+///    written to the lower 64 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehl_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
+}
+
+/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower
+///    64 bits are set to the lower 64 bits of the first parameter. The upper
+///    64 bits are set to the lower 64 bits of the second parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
+///    written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
+///    written to the upper 64 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movelh_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
+}
+
+/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
+///    from the corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi16(__b, __a);
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a
+///    128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
+///    destination are copied from the corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpu16_ps(__m64 __a)
+{
+  __m64 __b, __c;
+  __m128 __r;
+
+  __b = _mm_setzero_si64();
+  __c = _mm_unpackhi_pi16(__a, __b);
+  __r = _mm_setzero_ps();
+  __r = _mm_cvtpi32_ps(__r, __c);
+  __r = _mm_movelh_ps(__r, __r);
+  __c = _mm_unpacklo_pi16(__a, __b);
+  __r = _mm_cvtpi32_ps(__r, __c);
+
+  return __r;
+}
+
+/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
+///    into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
+///    from the corresponding lower 4 elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi8_ps(__m64 __a)
+{
+  __m64 __b;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_cmpgt_pi8(__b, __a);
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
+///    vector of [8 x u8] into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
+///    destination are copied from the corresponding lower 4 elements in this
+///    operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+///    values from the source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpu8_ps(__m64 __a)
+{
+  __m64 __b;
+
+  __b = _mm_setzero_si64();
+  __b = _mm_unpacklo_pi8(__a, __b);
+
+  return _mm_cvtpi16_ps(__b);
+}
+
+/// \brief Converts the two 32-bit signed integer values from each 64-bit vector
+///    operand of [2 x i32] into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
+///    copied from the elements in this operand.
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
+///    copied from the elements in this operand.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    copied and converted values from the first operand. The upper 64 bits
+///    contain the copied and converted values from the second operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
+{
+  __m128 __c;
+
+  __c = _mm_setzero_ps();
+  __c = _mm_cvtpi32_ps(__c, __b);
+  __c = _mm_movelh_ps(__c, __c);
+
+  return _mm_cvtpi32_ps(__c, __a);
+}
+
+/// \brief Converts each single-precision floating-point element of a 128-bit
+///    floating-point vector of [4 x float] into a 16-bit signed integer, and
+///    packs the results into a 64-bit integer vector of [4 x i16]. If the
+///    floating-point element is NaN or infinity, or if the floating-point
+///    element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
+///    to 0x8000. Otherwise if the floating-point element is greater than
+///    0x7FFF, it is converted to 0x7FFF.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 64-bit integer vector of [4 x i16] containing the converted
+///    values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi16(__m128 __a)
+{
+  __m64 __b, __c;
+
+  __b = _mm_cvtps_pi32(__a);
+  __a = _mm_movehl_ps(__a, __a);
+  __c = _mm_cvtps_pi32(__a);
+
+  return _mm_packs_pi32(__b, __c);
+}
+
+/// \brief Converts each single-precision floating-point element of a 128-bit
+///    floating-point vector of [4 x float] into an 8-bit signed integer, and
+///    packs the results into the lower 32 bits of a 64-bit integer vector of
+///    [8 x i8]. The upper 32 bits of the vector are set to 0. If the
+///    floating-point element is NaN or infinity, or if the floating-point
+///    element is greater than 0x7FFFFFFF or less than -0x80, it is converted
+///    to 0x80. Otherwise if the floating-point element is greater than 0x7F,
+///    it is converted to 0x7F.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
+///   instruction.
+///
+/// \param __a
+///    128-bit floating-point vector of [4 x float].
+/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
+///    converted values and the uppper 32 bits are set to zero.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtps_pi8(__m128 __a)
+{
+  __m64 __b, __c;
+
+  __b = _mm_cvtps_pi16(__a);
+  __c = _mm_setzero_si64();
+
+  return _mm_packs_pi16(__b, __c);
+}
+
+/// \brief Extracts the sign bits from each single-precision floating-point
+///    element of a 128-bit floating-point vector of [4 x float] and returns the
+///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
+///    single-precision floating-point element of the parameter. Bits [31:4] are
+///    set to zero.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_ps(__m128 __a)
+{
+  return __builtin_ia32_movmskps((__v4sf)__a);
+}
+
+
+#define _MM_ALIGN16 __attribute__((aligned(16)))
+
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define _MM_EXCEPT_INVALID    (0x0001)
+#define _MM_EXCEPT_DENORM     (0x0002)
+#define _MM_EXCEPT_DIV_ZERO   (0x0004)
+#define _MM_EXCEPT_OVERFLOW   (0x0008)
+#define _MM_EXCEPT_UNDERFLOW  (0x0010)
+#define _MM_EXCEPT_INEXACT    (0x0020)
+#define _MM_EXCEPT_MASK       (0x003f)
+
+#define _MM_MASK_INVALID      (0x0080)
+#define _MM_MASK_DENORM       (0x0100)
+#define _MM_MASK_DIV_ZERO     (0x0200)
+#define _MM_MASK_OVERFLOW     (0x0400)
+#define _MM_MASK_UNDERFLOW    (0x0800)
+#define _MM_MASK_INEXACT      (0x1000)
+#define _MM_MASK_MASK         (0x1f80)
+
+#define _MM_ROUND_NEAREST     (0x0000)
+#define _MM_ROUND_DOWN        (0x2000)
+#define _MM_ROUND_UP          (0x4000)
+#define _MM_ROUND_TOWARD_ZERO (0x6000)
+#define _MM_ROUND_MASK        (0x6000)
+
+#define _MM_FLUSH_ZERO_MASK   (0x8000)
+#define _MM_FLUSH_ZERO_ON     (0x8000)
+#define _MM_FLUSH_ZERO_OFF    (0x0000)
+
+#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
+#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
+#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
+#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
+
+#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
+#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
+#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
+#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
+
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __m128 tmp3, tmp2, tmp1, tmp0; \
+  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
+  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
+  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
+  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
+  (row0) = _mm_movelh_ps(tmp0, tmp2); \
+  (row1) = _mm_movehl_ps(tmp2, tmp0); \
+  (row2) = _mm_movelh_ps(tmp1, tmp3); \
+  (row3) = _mm_movehl_ps(tmp3, tmp1); \
+} while (0)
+
+/* Aliases for compatibility. */
+#define _m_pextrw _mm_extract_pi16
+#define _m_pinsrw _mm_insert_pi16
+#define _m_pmaxsw _mm_max_pi16
+#define _m_pmaxub _mm_max_pu8
+#define _m_pminsw _mm_min_pi16
+#define _m_pminub _mm_min_pu8
+#define _m_pmovmskb _mm_movemask_pi8
+#define _m_pmulhuw _mm_mulhi_pu16
+#define _m_pshufw _mm_shuffle_pi16
+#define _m_maskmovq _mm_maskmove_si64
+#define _m_pavgb _mm_avg_pu8
+#define _m_pavgw _mm_avg_pu16
+#define _m_psadbw _mm_sad_pu8
+#define _m_ _mm_
+#define _m_ _mm_
+
+#undef __DEFAULT_FN_ATTRS
+
+/* Ugly hack for backwards-compatibility (compatible with gcc) */
+#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
+#include <emmintrin.h>
+#endif
+
+#endif /* __XMMINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xopintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xopintrin.h
new file mode 100644
index 000000000..bdf0cec32
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xopintrin.h
@@ -0,0 +1,782 @@
+/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __XOPINTRIN_H
+#define __XOPINTRIN_H
+
+#include <fma4intrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubw_epi8(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubd_epi16(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubq_epi32(__m128i __A)
+{
+  return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_roti_epi8(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi16(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi32(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
+
+#define _mm_roti_epi64(A, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_com_epu8(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
+                                  (__v16qi)(__m128i)(B), (N)); })
+
+#define _mm_com_epu16(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
+                                  (__v8hi)(__m128i)(B), (N)); })
+
+#define _mm_com_epu32(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
+                                  (__v4si)(__m128i)(B), (N)); })
+
+#define _mm_com_epu64(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
+                                  (__v2di)(__m128i)(B), (N)); })
+
+#define _mm_com_epi8(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
+                                 (__v16qi)(__m128i)(B), (N)); })
+
+#define _mm_com_epi16(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
+                                 (__v8hi)(__m128i)(B), (N)); })
+
+#define _mm_com_epi32(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
+                                 (__v4si)(__m128i)(B), (N)); })
+
+#define _mm_com_epi64(A, B, N) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
+                                 (__v2di)(__m128i)(B), (N)); })
+
+#define _MM_PCOMCTRL_LT    0
+#define _MM_PCOMCTRL_LE    1
+#define _MM_PCOMCTRL_GT    2
+#define _MM_PCOMCTRL_GE    3
+#define _MM_PCOMCTRL_EQ    4
+#define _MM_PCOMCTRL_NEQ   5
+#define _MM_PCOMCTRL_FALSE 6
+#define _MM_PCOMCTRL_TRUE  7
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
+  (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
+                                     (__v2df)(__m128d)(Y), \
+                                     (__v2di)(__m128i)(C), (I)); })
+
+#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
+  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
+                                        (__v4df)(__m256d)(Y), \
+                                        (__v4di)(__m256i)(C), (I)); })
+
+#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
+  (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
+                                    (__v4si)(__m128i)(C), (I)); })
+
+#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
+  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
+                                       (__v8sf)(__m256)(Y), \
+                                       (__v8si)(__m256i)(C), (I)); })
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ss(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_sd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ps(__m128 __A)
+{
+  return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_pd(__m128d __A)
+{
+  return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_frcz_ps(__m256 __A)
+{
+  return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_frcz_pd(__m256d __A)
+{
+  return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __XOPINTRIN_H */
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsavecintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsavecintrin.h
new file mode 100644
index 000000000..598470a68
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsavecintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVECINTRIN_H
+#define __XSAVECINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsavec")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsavec64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsaveintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsaveintrin.h
new file mode 100644
index 000000000..a2e6b2e74
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsaveintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEINTRIN_H
+#define __XSAVEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsave")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsave(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xrstor(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsave64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xrstor64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsaveoptintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsaveoptintrin.h
new file mode 100644
index 000000000..d3faae78b
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsaveoptintrin.h
@@ -0,0 +1,48 @@
+/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEOPTINTRIN_H
+#define __XSAVEOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaveopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsaveopt(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt64(void *__p, unsigned long long __m) {
+  return __builtin_ia32_xsaveopt64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsavesintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsavesintrin.h
new file mode 100644
index 000000000..c5e540a86
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xsavesintrin.h
@@ -0,0 +1,58 @@
+/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVESINTRIN_H
+#define __XSAVESINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("xsaves")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xrstors64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves64(void *__p, unsigned long long __m) {
+  __builtin_ia32_xsaves64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xtestintrin.h b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xtestintrin.h
new file mode 100644
index 000000000..9d3378fd1
--- /dev/null
+++ b/externals/clang/4.0.0/osx/lib/clang/4.0.0/include/xtestintrin.h
@@ -0,0 +1,41 @@
+/*===---- xtestintrin.h - XTEST intrinsic ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XTESTINTRIN_H
+#define __XTESTINTRIN_H
+
+/* xtest returns non-zero if the instruction is executed within an RTM or active
+ * HLE region. */
+/* FIXME: This can be an either or for RTM/HLE. Deal with this when HLE is
+ * supported. */
+static __inline__ int
+    __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+    _xtest(void) {
+  return __builtin_ia32_xtest();
+}
+
+#endif
diff --git a/externals/clang/4.0.0/win32/bin/clang.exe b/externals/clang/4.0.0/win32/bin/clang.exe
new file mode 100644
index 000000000..ac2283576
Binary files /dev/null and b/externals/clang/4.0.0/win32/bin/clang.exe differ
diff --git a/kaplademo/source/compiler/vc14win32-PhysX_4.0/DemoFramework.vcxproj b/kaplademo/source/compiler/vc14win32-PhysX_4.0/DemoFramework.vcxproj
index add35da86..a182f10ed 100644
--- a/kaplademo/source/compiler/vc14win32-PhysX_4.0/DemoFramework.vcxproj
+++ b/kaplademo/source/compiler/vc14win32-PhysX_4.0/DemoFramework.vcxproj
@@ -24,7 +24,7 @@
 	</PropertyGroup>
 	<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
 	<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='debug|Win32'" Label="Configuration">
-		<ConfigurationType>Application</ConfigurationType>
+		<ConfigurationType>StaticLibrary</ConfigurationType>
 		<PlatformToolset>v140</PlatformToolset>
 	</PropertyGroup>
 	<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='release|Win32'" Label="Configuration">
@@ -60,7 +60,7 @@
 	<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='debug|Win32'">
 		<OutDir>./../../../bin/VC14WIN32/DEBUG\</OutDir>
 		<IntDir>./Win32/DemoFramework/debug\</IntDir>
-		<TargetExt>.exe</TargetExt>
+		<TargetExt>.lib</TargetExt>
 		<TargetName>$(ProjectName)DEBUG</TargetName>
 		<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
 		<CodeAnalysisRules />
@@ -83,19 +83,17 @@
 			<PrecompiledHeaderFile></PrecompiledHeaderFile>
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
-		<Link>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_32.lib;PhysXCooking_32.lib;PhysXCommon_32.lib;PhysXCharacterKinematic_static_32.lib;PhysXVehicle_static_32.lib;PhysXExtensions_static_32.lib;PhysXFoundation_32.lib;PhysXPvdSDK_static_32.lib;PhysXTask_static_32.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-			<OutputFile>$(OutDir)$(ProjectName)DEBUG.exe</OutputFile>
+		<Lib>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+			<OutputFile>$(OutDir)$(ProjectName)DEBUG.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_32.VC140.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN32;./../../../externals/glut-3.7.6/lib/WIN32;./../../../externals/hbao+3.0/lib/WIN32;./../../../externals/cg/2.2/lib;./../../../lib/VC14WIN32;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)DEBUG.exe.pdb</ProgramDatabaseFile>
-			<SubSystem>Console</SubSystem>
-			<ImportLibrary>./../../../lib/VC14WIN32/DEBUG/$(TargetName).lib</ImportLibrary>
-			<GenerateDebugInformation>true</GenerateDebugInformation>
+			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)DEBUG.lib.pdb</ProgramDatabaseFile>
 			<TargetMachine>MachineX86</TargetMachine>
-		</Link>
+		</Lib>
 		<ResourceCompile>
 		</ResourceCompile>
 		<ProjectReference>
+			<LinkLibraryDependencies>true</LinkLibraryDependencies>
 		</ProjectReference>
 	</ItemDefinitionGroup>
 	<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='release|Win32'">
@@ -126,7 +124,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_32.lib;PhysXCooking_32.lib;PhysXCommon_32.lib;PhysXCharacterKinematic_static_32.lib;PhysXVehicle_static_32.lib;PhysXExtensions_static_32.lib;PhysXFoundation_32.lib;PhysXPvdSDK_static_32.lib;PhysXTask_static_32.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName).lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_32.VC140.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN32;./../../../externals/glut-3.7.6/lib/WIN32;./../../../externals/hbao+3.0/lib/WIN32;./../../../externals/cg/2.2/lib;./../../../lib/VC14WIN32;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName).lib.pdb</ProgramDatabaseFile>
@@ -166,7 +164,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_32.lib;PhysXCooking_32.lib;PhysXCommon_32.lib;PhysXCharacterKinematic_static_32.lib;PhysXVehicle_static_32.lib;PhysXExtensions_static_32.lib;PhysXFoundation_32.lib;PhysXPvdSDK_static_32.lib;PhysXTask_static_32.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)CHECKED.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_32.VC140.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN32;./../../../externals/glut-3.7.6/lib/WIN32;./../../../externals/hbao+3.0/lib/WIN32;./../../../externals/cg/2.2/lib;./../../../lib/VC14WIN32;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)CHECKED.lib.pdb</ProgramDatabaseFile>
@@ -206,7 +204,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_32.lib;PhysXCooking_32.lib;PhysXCommon_32.lib;PhysXCharacterKinematic_static_32.lib;PhysXVehicle_static_32.lib;PhysXExtensions_static_32.lib;PhysXFoundation_32.lib;PhysXPvdSDK_static_32.lib;PhysXTask_static_32.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)PROFILE.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_32.VC140.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN32;./../../../externals/glut-3.7.6/lib/WIN32;./../../../externals/hbao+3.0/lib/WIN32;./../../../externals/cg/2.2/lib;./../../../lib/VC14WIN32;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)PROFILE.lib.pdb</ProgramDatabaseFile>
diff --git a/kaplademo/source/compiler/vc14win64-PhysX_4.0/DemoFramework.vcxproj b/kaplademo/source/compiler/vc14win64-PhysX_4.0/DemoFramework.vcxproj
index 668c493cc..e116f97a8 100644
--- a/kaplademo/source/compiler/vc14win64-PhysX_4.0/DemoFramework.vcxproj
+++ b/kaplademo/source/compiler/vc14win64-PhysX_4.0/DemoFramework.vcxproj
@@ -84,7 +84,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_64.lib;PhysXCooking_64.lib;PhysXCommon_64.lib;PhysXCharacterKinematic_static_64.lib;PhysXVehicle_static_64.lib;PhysXExtensions_static_64.lib;PhysXFoundation_64.lib;PhysXPvdSDK_static_64.lib;PhysXTask_static_64.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN64.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)DEBUG.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_64.VC140.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN64;./../../../externals/glut-3.7.6/lib/WIN64;./../../../externals/hbao+3.0/lib/WIN64;./../../../externals/cg/2.2/lib.x64;./../../../lib/VC14WIN64;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)DEBUG.lib.pdb</ProgramDatabaseFile>
@@ -124,7 +124,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_64.lib;PhysXCooking_64.lib;PhysXCommon_64.lib;PhysXCharacterKinematic_static_64.lib;PhysXVehicle_static_64.lib;PhysXExtensions_static_64.lib;PhysXFoundation_64.lib;PhysXPvdSDK_static_64.lib;PhysXTask_static_64.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN64.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName).lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_64.VC140.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN64;./../../../externals/glut-3.7.6/lib/WIN64;./../../../externals/hbao+3.0/lib/WIN64;./../../../externals/cg/2.2/lib.x64;./../../../lib/VC14WIN64;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName).lib.pdb</ProgramDatabaseFile>
@@ -164,7 +164,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_64.lib;PhysXCooking_64.lib;PhysXCommon_64.lib;PhysXCharacterKinematic_static_64.lib;PhysXVehicle_static_64.lib;PhysXExtensions_static_64.lib;PhysXFoundation_64.lib;PhysXPvdSDK_static_64.lib;PhysXTask_static_64.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN64.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)CHECKED.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_64.VC140.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN64;./../../../externals/glut-3.7.6/lib/WIN64;./../../../externals/hbao+3.0/lib/WIN64;./../../../externals/cg/2.2/lib.x64;./../../../lib/VC14WIN64;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)CHECKED.lib.pdb</ProgramDatabaseFile>
@@ -204,7 +204,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_64.lib;PhysXCooking_64.lib;PhysXCommon_64.lib;PhysXCharacterKinematic_static_64.lib;PhysXVehicle_static_64.lib;PhysXExtensions_static_64.lib;PhysXFoundation_64.lib;PhysXPvdSDK_static_64.lib;PhysXTask_static_64.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN64.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)PROFILE.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_64.VC140.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN64;./../../../externals/glut-3.7.6/lib/WIN64;./../../../externals/hbao+3.0/lib/WIN64;./../../../externals/cg/2.2/lib.x64;./../../../lib/VC14WIN64;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)PROFILE.lib.pdb</ProgramDatabaseFile>
diff --git a/kaplademo/source/compiler/vc15win32-PhysX_4.0/DemoFramework.vcxproj b/kaplademo/source/compiler/vc15win32-PhysX_4.0/DemoFramework.vcxproj
index 9b1f9b56f..046f8636e 100644
--- a/kaplademo/source/compiler/vc15win32-PhysX_4.0/DemoFramework.vcxproj
+++ b/kaplademo/source/compiler/vc15win32-PhysX_4.0/DemoFramework.vcxproj
@@ -24,7 +24,7 @@
 	</PropertyGroup>
 	<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
 	<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='debug|Win32'" Label="Configuration">
-		<ConfigurationType>Application</ConfigurationType>
+		<ConfigurationType>StaticLibrary</ConfigurationType>
 		<PlatformToolset>v141</PlatformToolset>
 	</PropertyGroup>
 	<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='release|Win32'" Label="Configuration">
@@ -57,7 +57,7 @@
 	<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='debug|Win32'">
 		<OutDir>./../../../bin/VC15WIN32/DEBUG\</OutDir>
 		<IntDir>./Win32/DemoFramework/debug\</IntDir>
-		<TargetExt>.exe</TargetExt>
+		<TargetExt>.lib</TargetExt>
 		<TargetName>$(ProjectName)DEBUG</TargetName>
 		<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
 		<CodeAnalysisRules />
@@ -77,19 +77,17 @@
 			<PrecompiledHeaderFile></PrecompiledHeaderFile>
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
-		<Link>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_32.lib;PhysXCooking_32.lib;PhysXCommon_32.lib;PhysXCharacterKinematic_static_32.lib;PhysXVehicle_static_32.lib;PhysXExtensions_static_32.lib;PhysXFoundation_32.lib;PhysXPvdSDK_static_32.lib;PhysXTask_static_32.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-			<OutputFile>$(OutDir)$(ProjectName)DEBUG.exe</OutputFile>
+		<Lib>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+			<OutputFile>$(OutDir)$(ProjectName)DEBUG.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_32.vc141.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN32;./../../../externals/glut-3.7.6/lib/WIN32;./../../../externals/hbao+3.0/lib/WIN32;./../../../externals/cg/2.2/lib;./../../../lib/VC15WIN32;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)DEBUG.exe.pdb</ProgramDatabaseFile>
-			<SubSystem>Console</SubSystem>
-			<ImportLibrary>./../../../lib/VC15WIN32/DEBUG/$(TargetName).lib</ImportLibrary>
-			<GenerateDebugInformation>true</GenerateDebugInformation>
+			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)DEBUG.lib.pdb</ProgramDatabaseFile>
 			<TargetMachine>MachineX86</TargetMachine>
-		</Link>
+		</Lib>
 		<ResourceCompile>
 		</ResourceCompile>
 		<ProjectReference>
+			<LinkLibraryDependencies>true</LinkLibraryDependencies>
 		</ProjectReference>
 	</ItemDefinitionGroup>
 	<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='release|Win32'">
@@ -116,7 +114,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_32.lib;PhysXCooking_32.lib;PhysXCommon_32.lib;PhysXCharacterKinematic_static_32.lib;PhysXVehicle_static_32.lib;PhysXExtensions_static_32.lib;PhysXFoundation_32.lib;PhysXPvdSDK_static_32.lib;PhysXTask_static_32.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName).lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_32.vc141.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN32;./../../../externals/glut-3.7.6/lib/WIN32;./../../../externals/hbao+3.0/lib/WIN32;./../../../externals/cg/2.2/lib;./../../../lib/VC15WIN32;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName).lib.pdb</ProgramDatabaseFile>
@@ -152,7 +150,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_32.lib;PhysXCooking_32.lib;PhysXCommon_32.lib;PhysXCharacterKinematic_static_32.lib;PhysXVehicle_static_32.lib;PhysXExtensions_static_32.lib;PhysXFoundation_32.lib;PhysXPvdSDK_static_32.lib;PhysXTask_static_32.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)CHECKED.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_32.vc141.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN32;./../../../externals/glut-3.7.6/lib/WIN32;./../../../externals/hbao+3.0/lib/WIN32;./../../../externals/cg/2.2/lib;./../../../lib/VC15WIN32;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)CHECKED.lib.pdb</ProgramDatabaseFile>
@@ -188,7 +186,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_32.lib;PhysXCooking_32.lib;PhysXCommon_32.lib;PhysXCharacterKinematic_static_32.lib;PhysXVehicle_static_32.lib;PhysXExtensions_static_32.lib;PhysXFoundation_32.lib;PhysXPvdSDK_static_32.lib;PhysXTask_static_32.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)PROFILE.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_32.vc141.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN32;./../../../externals/glut-3.7.6/lib/WIN32;./../../../externals/hbao+3.0/lib/WIN32;./../../../externals/cg/2.2/lib;./../../../lib/VC15WIN32;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)PROFILE.lib.pdb</ProgramDatabaseFile>
diff --git a/kaplademo/source/compiler/vc15win64-PhysX_4.0/DemoFramework.vcxproj b/kaplademo/source/compiler/vc15win64-PhysX_4.0/DemoFramework.vcxproj
index d8c35378a..e22eefa79 100644
--- a/kaplademo/source/compiler/vc15win64-PhysX_4.0/DemoFramework.vcxproj
+++ b/kaplademo/source/compiler/vc15win64-PhysX_4.0/DemoFramework.vcxproj
@@ -78,7 +78,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_64.lib;PhysXCooking_64.lib;PhysXCommon_64.lib;PhysXCharacterKinematic_static_64.lib;PhysXVehicle_static_64.lib;PhysXExtensions_static_64.lib;PhysXFoundation_64.lib;PhysXPvdSDK_static_64.lib;PhysXTask_static_64.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN64.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)DEBUG.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_64.vc141.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN64;./../../../externals/glut-3.7.6/lib/WIN64;./../../../externals/hbao+3.0/lib/WIN64;./../../../externals/cg/2.2/lib.x64;./../../../lib/VC15WIN64;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)DEBUG.lib.pdb</ProgramDatabaseFile>
@@ -114,7 +114,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_64.lib;PhysXCooking_64.lib;PhysXCommon_64.lib;PhysXCharacterKinematic_static_64.lib;PhysXVehicle_static_64.lib;PhysXExtensions_static_64.lib;PhysXFoundation_64.lib;PhysXPvdSDK_static_64.lib;PhysXTask_static_64.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN64.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName).lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_64.vc141.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN64;./../../../externals/glut-3.7.6/lib/WIN64;./../../../externals/hbao+3.0/lib/WIN64;./../../../externals/cg/2.2/lib.x64;./../../../lib/VC15WIN64;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName).lib.pdb</ProgramDatabaseFile>
@@ -150,7 +150,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_64.lib;PhysXCooking_64.lib;PhysXCommon_64.lib;PhysXCharacterKinematic_static_64.lib;PhysXVehicle_static_64.lib;PhysXExtensions_static_64.lib;PhysXFoundation_64.lib;PhysXPvdSDK_static_64.lib;PhysXTask_static_64.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN64.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)CHECKED.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_64.vc141.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN64;./../../../externals/glut-3.7.6/lib/WIN64;./../../../externals/hbao+3.0/lib/WIN64;./../../../externals/cg/2.2/lib.x64;./../../../lib/VC15WIN64;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)CHECKED.lib.pdb</ProgramDatabaseFile>
@@ -186,7 +186,7 @@
 			<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 		</ClCompile>
 		<Lib>
-			<AdditionalDependencies>OpenGL32.lib;glut32.lib;PhysX_64.lib;PhysXCooking_64.lib;PhysXCommon_64.lib;PhysXCharacterKinematic_static_64.lib;PhysXVehicle_static_64.lib;PhysXExtensions_static_64.lib;PhysXFoundation_64.lib;PhysXPvdSDK_static_64.lib;PhysXTask_static_64.lib;cgGL.lib;cg.lib;GFSDK_SSAO_GL.WIN64.lib;%(AdditionalDependencies)</AdditionalDependencies>
+			<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
 			<OutputFile>$(OutDir)$(ProjectName)PROFILE.lib</OutputFile>
 			<AdditionalLibraryDirectories>./../../../../physx/bin/win.x86_64.vc141.mt/$(ConfigurationName);./../../../externals/glew-1.13.0/lib/WIN64;./../../../externals/glut-3.7.6/lib/WIN64;./../../../externals/hbao+3.0/lib/WIN64;./../../../externals/cg/2.2/lib.x64;./../../../lib/VC15WIN64;./../../../lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
 			<ProgramDatabaseFile>$(OutDir)/$(ProjectName)PROFILE.lib.pdb</ProgramDatabaseFile>
diff --git a/physx/bin/linux.clang/checked/libPhysXGpu_64.so b/physx/bin/linux.clang/checked/libPhysXGpu_64.so
index 251c30da1..b057a2a3d 100644
Binary files a/physx/bin/linux.clang/checked/libPhysXGpu_64.so and b/physx/bin/linux.clang/checked/libPhysXGpu_64.so differ
diff --git a/physx/bin/linux.clang/debug/libPhysXGpu_64.so b/physx/bin/linux.clang/debug/libPhysXGpu_64.so
index 251c30da1..b057a2a3d 100644
Binary files a/physx/bin/linux.clang/debug/libPhysXGpu_64.so and b/physx/bin/linux.clang/debug/libPhysXGpu_64.so differ
diff --git a/physx/bin/linux.clang/profile/libPhysXGpu_64.so b/physx/bin/linux.clang/profile/libPhysXGpu_64.so
index e691ac375..22aecc0c5 100644
Binary files a/physx/bin/linux.clang/profile/libPhysXGpu_64.so and b/physx/bin/linux.clang/profile/libPhysXGpu_64.so differ
diff --git a/physx/bin/linux.clang/release/libPhysXGpu_64.so b/physx/bin/linux.clang/release/libPhysXGpu_64.so
index 5e6ac5296..b50034dea 100644
Binary files a/physx/bin/linux.clang/release/libPhysXGpu_64.so and b/physx/bin/linux.clang/release/libPhysXGpu_64.so differ
diff --git a/physx/bin/win.x86_32.vc120.mt/checked/PhysXGpu_32.dll b/physx/bin/win.x86_32.vc120.mt/checked/PhysXGpu_32.dll
index 292e79ae5..23056bdd7 100644
Binary files a/physx/bin/win.x86_32.vc120.mt/checked/PhysXGpu_32.dll and b/physx/bin/win.x86_32.vc120.mt/checked/PhysXGpu_32.dll differ
diff --git a/physx/bin/win.x86_32.vc120.mt/debug/PhysXGpu_32.dll b/physx/bin/win.x86_32.vc120.mt/debug/PhysXGpu_32.dll
index a310aa020..23056bdd7 100644
Binary files a/physx/bin/win.x86_32.vc120.mt/debug/PhysXGpu_32.dll and b/physx/bin/win.x86_32.vc120.mt/debug/PhysXGpu_32.dll differ
diff --git a/physx/bin/win.x86_32.vc120.mt/profile/PhysXGpu_32.dll b/physx/bin/win.x86_32.vc120.mt/profile/PhysXGpu_32.dll
index dafaa1839..a0744cc84 100644
Binary files a/physx/bin/win.x86_32.vc120.mt/profile/PhysXGpu_32.dll and b/physx/bin/win.x86_32.vc120.mt/profile/PhysXGpu_32.dll differ
diff --git a/physx/bin/win.x86_32.vc120.mt/release/PhysXGpu_32.dll b/physx/bin/win.x86_32.vc120.mt/release/PhysXGpu_32.dll
index 3c20953c3..759b51268 100644
Binary files a/physx/bin/win.x86_32.vc120.mt/release/PhysXGpu_32.dll and b/physx/bin/win.x86_32.vc120.mt/release/PhysXGpu_32.dll differ
diff --git a/physx/bin/win.x86_64.vc140.mt/checked/PhysXGpu_64.dll b/physx/bin/win.x86_64.vc140.mt/checked/PhysXGpu_64.dll
index 61081d28c..9c474a524 100644
Binary files a/physx/bin/win.x86_64.vc140.mt/checked/PhysXGpu_64.dll and b/physx/bin/win.x86_64.vc140.mt/checked/PhysXGpu_64.dll differ
diff --git a/physx/bin/win.x86_64.vc140.mt/debug/PhysXGpu_64.dll b/physx/bin/win.x86_64.vc140.mt/debug/PhysXGpu_64.dll
index 17d8d76bd..17bb9e61b 100644
Binary files a/physx/bin/win.x86_64.vc140.mt/debug/PhysXGpu_64.dll and b/physx/bin/win.x86_64.vc140.mt/debug/PhysXGpu_64.dll differ
diff --git a/physx/bin/win.x86_64.vc140.mt/profile/PhysXGpu_64.dll b/physx/bin/win.x86_64.vc140.mt/profile/PhysXGpu_64.dll
index e6b7d7a9f..23ad04300 100644
Binary files a/physx/bin/win.x86_64.vc140.mt/profile/PhysXGpu_64.dll and b/physx/bin/win.x86_64.vc140.mt/profile/PhysXGpu_64.dll differ
diff --git a/physx/bin/win.x86_64.vc140.mt/release/PhysXGpu_64.dll b/physx/bin/win.x86_64.vc140.mt/release/PhysXGpu_64.dll
index c63bb233e..cb141e5d2 100644
Binary files a/physx/bin/win.x86_64.vc140.mt/release/PhysXGpu_64.dll and b/physx/bin/win.x86_64.vc140.mt/release/PhysXGpu_64.dll differ
diff --git a/physx/buildtools/cmake_generate_projects.py b/physx/buildtools/cmake_generate_projects.py
index ae06a4659..babb6ed38 100644
--- a/physx/buildtools/cmake_generate_projects.py
+++ b/physx/buildtools/cmake_generate_projects.py
@@ -30,6 +30,7 @@ def filterPreset(presetName):
     return False
 
 def noPresetProvided():
+    global input
     print('Preset parameter required, available presets:')
     presetfiles = []
     for file in glob.glob("buildtools/presets/*.xml"):
@@ -53,7 +54,12 @@ def noPresetProvided():
                     '.user <--- ' + presetXml.get('comment'))
                 presetList.append(presetXml.get('name') + '.user')
             counter = counter + 1            
-    mode = int(raw_input('Enter preset number: '))
+    # Fix Python 2.x.
+    try: 
+    	input = raw_input
+    except NameError: 
+    	pass    
+    mode = int(input('Enter preset number: '))
     print('Running generate_projects.bat ' + presetList[mode])
     return presetList[mode]
 
diff --git a/physx/buildtools/cmake_generate_projects.sh b/physx/buildtools/cmake_generate_projects.sh
deleted file mode 100644
index aae3c0184..000000000
--- a/physx/buildtools/cmake_generate_projects.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash +x
-
-python ./buildtools/cmake_generate_projects.py $targetPlatform
diff --git a/physx/documentation/PhysXAPI/files/PxArticulationBase_8h_source.html b/physx/documentation/PhysXAPI/files/PxArticulationBase_8h_source.html
index 500605df3..b79716ad9 100644
--- a/physx/documentation/PhysXAPI/files/PxArticulationBase_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxArticulationBase_8h_source.html
@@ -99,7 +99,7 @@
 <div class="ttc" id="classPxArticulationLink_html"><div class="ttname"><a href="classPxArticulationLink.html">PxArticulationLink</a></div><div class="ttdoc">a component of an articulation that represents a rigid body </div><div class="ttdef"><b>Definition:</b> PxArticulationLink.h:57</div></div>
 <div class="ttc" id="classPxArticulationBase_html_ae8ee210b8b3867a8a0803875c6e53014"><div class="ttname"><a href="classPxArticulationBase.html#ae8ee210b8b3867a8a0803875c6e53014">PxArticulationBase::userData</a></div><div class="ttdeci">void * userData</div><div class="ttdoc">user can assign this to whatever, usually to create a 1:1 relationship with a user object...</div><div class="ttdef"><b>Definition:</b> PxArticulationBase.h:317</div></div>
 <div class="ttc" id="PxGeometryHelpers_8h_html_a323277f755604ba403ae122d11c9b5e1"><div class="ttname"><a href="PxGeometryHelpers_8h.html#a323277f755604ba403ae122d11c9b5e1">getType</a></div><div class="ttdeci">PX_FORCE_INLINE PxGeometryType::Enum getType() const</div><div class="ttdef"><b>Definition:</b> PxGeometryHelpers.h:88</div></div>
-<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:112</div></div>
+<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:110</div></div>
 <div class="ttc" id="classPxTransform_html"><div class="ttname"><a href="classPxTransform.html">PxTransform</a></div><div class="ttdoc">class representing a rigid euclidean transform as a quaternion and a vector </div><div class="ttdef"><b>Definition:</b> PxTransform.h:48</div></div>
 <div class="ttc" id="classPxBounds3_html"><div class="ttname"><a href="classPxBounds3.html">PxBounds3</a></div><div class="ttdoc">Class representing 3D range or axis aligned bounding box. </div><div class="ttdef"><b>Definition:</b> PxBounds3.h:58</div></div>
 <div class="ttc" id="classPxArticulationBase_html"><div class="ttname"><a href="classPxArticulationBase.html">PxArticulationBase</a></div><div class="ttdoc">a tree structure of bodies connected by joints that is treated as a unit by the dynamics solver ...</div><div class="ttdef"><b>Definition:</b> PxArticulationBase.h:57</div></div>
diff --git a/physx/documentation/PhysXAPI/files/PxArticulationJointReducedCoordinate_8h_source.html b/physx/documentation/PhysXAPI/files/PxArticulationJointReducedCoordinate_8h_source.html
index 35807eb05..d118ce623 100644
--- a/physx/documentation/PhysXAPI/files/PxArticulationJointReducedCoordinate_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxArticulationJointReducedCoordinate_8h_source.html
@@ -93,18 +93,18 @@
 <div class="ttc" id="namespacephysx_html_a727d2d8426e2a21ebbc522fa65c3f97a"><div class="ttname"><a href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">physx::PxReal</a></div><div class="ttdeci">float PxReal</div><div class="ttdef"><b>Definition:</b> PxSimpleTypes.h:78</div></div>
 <div class="ttc" id="classPxBase_html_addfc067afca2f67bda6c497b14424962"><div class="ttname"><a href="classPxBase.html#addfc067afca2f67bda6c497b14424962">PxBase::isKindOf</a></div><div class="ttdeci">virtual bool isKindOf(const char *superClass) const</div><div class="ttdoc">Returns whether a given type name matches with the type of this instance. </div><div class="ttdef"><b>Definition:</b> PxBase.h:178</div></div>
 <div class="ttc" id="classPxArticulationJointReducedCoordinate_html_a5b2618395498d66a0edaf9324a1ae5cb"><div class="ttname"><a href="classPxArticulationJointReducedCoordinate.html#a5b2618395498d66a0edaf9324a1ae5cb">PxArticulationJointReducedCoordinate::PxArticulationJointReducedCoordinate</a></div><div class="ttdeci">PX_INLINE PxArticulationJointReducedCoordinate(PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationJointReducedCoordinate.h:83</div></div>
-<div class="ttc" id="structPxArticulationMotion_html_af04dcaf188047e88858bb7352bdfe879"><div class="ttname"><a href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">PxArticulationMotion::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:88</div></div>
+<div class="ttc" id="structPxArticulationMotion_html_af04dcaf188047e88858bb7352bdfe879"><div class="ttname"><a href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">PxArticulationMotion::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:86</div></div>
 <div class="ttc" id="PxBase_8h_html"><div class="ttname"><a href="PxBase_8h.html">PxBase.h</a></div></div>
 <div class="ttc" id="PxPhysXConfig_8h_html"><div class="ttname"><a href="PxPhysXConfig_8h.html">PxPhysXConfig.h</a></div></div>
 <div class="ttc" id="group__common_html_gac1fb4b256a5d900d394e89db170a2b79"><div class="ttname"><a href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a></div><div class="ttdeci">PxU16 PxType</div><div class="ttdef"><b>Definition:</b> PxBase.h:49</div></div>
 <div class="ttc" id="PxConstraintDesc_8h_html_a9cf8670b7dde57314a7d3cfc5799c277"><div class="ttname"><a href="PxConstraintDesc_8h.html#a9cf8670b7dde57314a7d3cfc5799c277">stiffness</a></div><div class="ttdeci">PxReal stiffness</div><div class="ttdoc">spring parameter, for spring constraints </div><div class="ttdef"><b>Definition:</b> PxConstraintDesc.h:99</div></div>
 <div class="ttc" id="classPxArticulationJointReducedCoordinate_html_aaa894898e551037b25cc61eb93010911"><div class="ttname"><a href="classPxArticulationJointReducedCoordinate.html#aaa894898e551037b25cc61eb93010911">PxArticulationJointReducedCoordinate::~PxArticulationJointReducedCoordinate</a></div><div class="ttdeci">virtual ~PxArticulationJointReducedCoordinate()</div><div class="ttdef"><b>Definition:</b> PxArticulationJointReducedCoordinate.h:84</div></div>
-<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:112</div></div>
+<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:110</div></div>
 <div class="ttc" id="PxConstraintDesc_8h_html_adc081240b94af5fc9f010e0583263581"><div class="ttname"><a href="PxConstraintDesc_8h.html#adc081240b94af5fc9f010e0583263581">damping</a></div><div class="ttdeci">PxReal damping</div><div class="ttdoc">damping parameter, for spring constraints </div><div class="ttdef"><b>Definition:</b> PxConstraintDesc.h:100</div></div>
 <div class="ttc" id="PxArticulationJoint_8h_html"><div class="ttname"><a href="PxArticulationJoint_8h.html">PxArticulationJoint.h</a></div></div>
-<div class="ttc" id="structPxArticulationJointType_html_a0df5bb8217acae97e63421bedbfe0a0b"><div class="ttname"><a href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0b">PxArticulationJointType::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:101</div></div>
+<div class="ttc" id="structPxArticulationJointType_html_a0df5bb8217acae97e63421bedbfe0a0b"><div class="ttname"><a href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0b">PxArticulationJointType::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:99</div></div>
 <div class="ttc" id="classPxFlags_html"><div class="ttname"><a href="classPxFlags.html">PxFlags&lt; PxBaseFlag::Enum, PxU16 &gt;</a></div></div>
-<div class="ttc" id="structPxArticulationAxis_html_a5c835f1d727bfc738d1fc7a7595d7395"><div class="ttname"><a href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">PxArticulationAxis::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:72</div></div>
+<div class="ttc" id="structPxArticulationAxis_html_a5c835f1d727bfc738d1fc7a7595d7395"><div class="ttname"><a href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">PxArticulationAxis::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:70</div></div>
 <div class="ttc" id="classPxArticulationJointReducedCoordinate_html"><div class="ttname"><a href="classPxArticulationJointReducedCoordinate.html">PxArticulationJointReducedCoordinate</a></div><div class="ttdoc">a joint between two links in an articulation. </div><div class="ttdef"><b>Definition:</b> PxArticulationJointReducedCoordinate.h:55</div></div>
 <div class="ttc" id="group__foundation_html_gacb03347b642a2a5bdea1f9b305a6fbec"><div class="ttname"><a href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a></div><div class="ttdeci">#define PX_INLINE</div><div class="ttdef"><b>Definition:</b> PxPreprocessor.h:349</div></div>
 </div><!-- fragment --></div><!-- contents -->
diff --git a/physx/documentation/PhysXAPI/files/PxArticulationJoint_8h_source.html b/physx/documentation/PhysXAPI/files/PxArticulationJoint_8h_source.html
index 80ea82beb..db5a8977b 100644
--- a/physx/documentation/PhysXAPI/files/PxArticulationJoint_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxArticulationJoint_8h_source.html
@@ -86,42 +86,42 @@
 <div class="title">PxArticulationJoint.h</div>  </div>
 </div><!--header-->
 <div class="contents">
-<a href="PxArticulationJoint_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  </span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;</div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#ifndef PX_PHYSICS_NX_ARTICULATION_JOINT</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;<span class="preprocessor">#define PX_PHYSICS_NX_ARTICULATION_JOINT</span></div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00036"></a><span class="lineno">   36</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxPhysXConfig_8h.html">PxPhysXConfig.h</a>&quot;</span></div><div class="line"><a name="l00037"></a><span class="lineno">   37</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxBase_8h.html">common/PxBase.h</a>&quot;</span></div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;</div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;{</div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;</div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;<span class="keyword">class </span>PxArticulationJointImpl;</div><div class="line"><a name="l00045"></a><span class="lineno">   45</span>&#160;</div><div class="line"><a name="l00059"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html">   59</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></div><div class="line"><a name="l00060"></a><span class="lineno">   60</span>&#160;{</div><div class="line"><a name="l00061"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">   61</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">Enum</a></div><div class="line"><a name="l00062"></a><span class="lineno">   62</span>&#160;    {</div><div class="line"><a name="l00063"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6">   63</a></span>&#160;        eNONE   = 0,</div><div class="line"><a name="l00064"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e">   64</a></span>&#160;        eTARGET = 1,            <span class="comment">// use the quaternion as the drive target</span></div><div class="line"><a name="l00065"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372">   65</a></span>&#160;        eERROR  = 2,                <span class="comment">// use the vector part of the quaternion as the drive error.</span></div><div class="line"><a name="l00066"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e">   66</a></span>&#160;        eTARGETVELOCITY = 3</div><div class="line"><a name="l00067"></a><span class="lineno">   67</span>&#160;    };</div><div class="line"><a name="l00068"></a><span class="lineno">   68</span>&#160;};</div><div class="line"><a name="l00069"></a><span class="lineno">   69</span>&#160;</div><div class="line"><a name="l00070"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html">   70</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxArticulationAxis.html">PxArticulationAxis</a></div><div class="line"><a name="l00071"></a><span class="lineno">   71</span>&#160;{</div><div class="line"><a name="l00072"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">   72</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">Enum</a></div><div class="line"><a name="l00073"></a><span class="lineno">   73</span>&#160;    {</div><div class="line"><a name="l00074"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a380a97841323f13390ef6cdc57f6265d">   74</a></span>&#160;        eTWIST = 0,</div><div class="line"><a name="l00075"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a2e1a274c036f8b7846476dbfe2015fec">   75</a></span>&#160;        eSWING1 = 1,</div><div class="line"><a name="l00076"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a33f154c8d454ab3f08f03f512cc03972">   76</a></span>&#160;        eSWING2 = 2,</div><div class="line"><a name="l00077"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a83f4dcfb4a4a87644225475a9e16c67e">   77</a></span>&#160;        eX = 3,</div><div class="line"><a name="l00078"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a582892dc65379497febf0d6c08dddc02">   78</a></span>&#160;        eY = 4,</div><div class="line"><a name="l00079"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a70a41042911d91bf76f507000adc09a4">   79</a></span>&#160;        eZ = 5,</div><div class="line"><a name="l00080"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a09384f4a095731741d93530048614de7">   80</a></span>&#160;        eCOUNT = 6</div><div class="line"><a name="l00081"></a><span class="lineno">   81</span>&#160;    };</div><div class="line"><a name="l00082"></a><span class="lineno">   82</span>&#160;};</div><div class="line"><a name="l00083"></a><span class="lineno">   83</span>&#160;</div><div class="line"><a name="l00084"></a><span class="lineno">   84</span>&#160;PX_FLAGS_OPERATORS(<a class="code" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">PxArticulationAxis::Enum</a>, <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>)</div><div class="line"><a name="l00085"></a><span class="lineno">   85</span>&#160;</div><div class="line"><a name="l00086"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html">   86</a></span>&#160;struct <a class="code" href="structPxArticulationMotion.html">PxArticulationMotion</a></div><div class="line"><a name="l00087"></a><span class="lineno">   87</span>&#160;{</div><div class="line"><a name="l00088"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">   88</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">Enum</a></div><div class="line"><a name="l00089"></a><span class="lineno">   89</span>&#160;    {</div><div class="line"><a name="l00090"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879af640457f343a3cd57412d5aedf2b6808">   90</a></span>&#160;        eLOCKED = 0,</div><div class="line"><a name="l00091"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879a93e73f43c42672207a2abf9904d47a50">   91</a></span>&#160;        eLIMITED = 1,</div><div class="line"><a name="l00092"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879aa856908c1ad1720774753eee4ef7acae">   92</a></span>&#160;        eFREE = 2</div><div class="line"><a name="l00093"></a><span class="lineno">   93</span>&#160;    };</div><div class="line"><a name="l00094"></a><span class="lineno">   94</span>&#160;};</div><div class="line"><a name="l00095"></a><span class="lineno">   95</span>&#160;</div><div class="line"><a name="l00096"></a><span class="lineno"><a class="line" href="group__physics.html#ga4d2907efdef5e5b7b1becd1ab59c2405">   96</a></span>&#160;<span class="keyword">typedef</span> <a class="code" href="classPxFlags.html">PxFlags&lt;PxArticulationMotion::Enum, PxU8&gt;</a> <a class="code" href="group__physics.html#ga4d2907efdef5e5b7b1becd1ab59c2405">PxArticulationMotions</a>;</div><div class="line"><a name="l00097"></a><span class="lineno">   97</span>&#160;PX_FLAGS_OPERATORS(<a class="code" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">PxArticulationMotion::Enum</a>, <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>)</div><div class="line"><a name="l00098"></a><span class="lineno">   98</span>&#160;</div><div class="line"><a name="l00099"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html">   99</a></span>&#160;struct <a class="code" href="structPxArticulationJointType.html">PxArticulationJointType</a></div><div class="line"><a name="l00100"></a><span class="lineno">  100</span>&#160;{</div><div class="line"><a name="l00101"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0b">  101</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0b">Enum</a></div><div class="line"><a name="l00102"></a><span class="lineno">  102</span>&#160;    {</div><div class="line"><a name="l00103"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0ba20eb6c657a0b29898c2ab308b2e68ba3">  103</a></span>&#160;        ePRISMATIC = 0,</div><div class="line"><a name="l00104"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0ba9bb388357d6c57179a26508f00e7d2c7">  104</a></span>&#160;        eREVOLUTE = 1,</div><div class="line"><a name="l00105"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0ba6e0b21355b6023f41e8e5be799feee91">  105</a></span>&#160;        eSPHERICAL = 2,</div><div class="line"><a name="l00106"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0ba3a5aad862ed0a4ad0ce289d55ce08531">  106</a></span>&#160;        eFIX = 3,</div><div class="line"><a name="l00107"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0baa3b41bcb6b6b318f64d2dfb3073f277e">  107</a></span>&#160;        eUNDEFINED = 4</div><div class="line"><a name="l00108"></a><span class="lineno">  108</span>&#160;    };</div><div class="line"><a name="l00109"></a><span class="lineno">  109</span>&#160;};</div><div class="line"><a name="l00110"></a><span class="lineno">  110</span>&#160;</div><div class="line"><a name="l00111"></a><span class="lineno">  111</span>&#160;</div><div class="line"><a name="l00112"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html">  112</a></span>&#160;<span class="keyword">class </span><a class="code" href="classPxArticulationJointBase.html">PxArticulationJointBase</a> : <span class="keyword">public</span> <a class="code" href="classPxBase.html">PxBase</a></div><div class="line"><a name="l00113"></a><span class="lineno">  113</span>&#160;{</div><div class="line"><a name="l00114"></a><span class="lineno">  114</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00120"></a><span class="lineno">  120</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxArticulationLink.html">PxArticulationLink</a>&amp; getParentArticulationLink() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00121"></a><span class="lineno">  121</span>&#160;</div><div class="line"><a name="l00131"></a><span class="lineno">  131</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setParentPose(<span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; pose) = 0;</div><div class="line"><a name="l00132"></a><span class="lineno">  132</span>&#160;</div><div class="line"><a name="l00141"></a><span class="lineno">  141</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxTransform.html">PxTransform</a>     getParentPose() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00142"></a><span class="lineno">  142</span>&#160;</div><div class="line"><a name="l00148"></a><span class="lineno">  148</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxArticulationLink.html">PxArticulationLink</a>&amp; getChildArticulationLink() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00149"></a><span class="lineno">  149</span>&#160;</div><div class="line"><a name="l00150"></a><span class="lineno">  150</span>&#160;</div><div class="line"><a name="l00160"></a><span class="lineno">  160</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setChildPose(<span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; pose) = 0;</div><div class="line"><a name="l00161"></a><span class="lineno">  161</span>&#160;</div><div class="line"><a name="l00169"></a><span class="lineno">  169</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxTransform.html">PxTransform</a>     getChildPose() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00170"></a><span class="lineno">  170</span>&#160;</div><div class="line"><a name="l00171"></a><span class="lineno">  171</span>&#160;    <span class="keyword">virtual</span>     PxArticulationJointImpl* getImpl() = 0;</div><div class="line"><a name="l00172"></a><span class="lineno">  172</span>&#160;    <span class="keyword">virtual</span>     <span class="keyword">const</span> PxArticulationJointImpl* getImpl() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00173"></a><span class="lineno">  173</span>&#160;</div><div class="line"><a name="l00174"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html#afefce115db74c3bcfbc3f3052e18f14d">  174</a></span>&#160;    <span class="keyword">virtual</span>                     <a class="code" href="classPxArticulationJointBase.html#afefce115db74c3bcfbc3f3052e18f14d">~PxArticulationJointBase</a>() {}</div><div class="line"><a name="l00175"></a><span class="lineno">  175</span>&#160;</div><div class="line"><a name="l00176"></a><span class="lineno">  176</span>&#160;<span class="keyword">private</span>:</div><div class="line"><a name="l00177"></a><span class="lineno">  177</span>&#160;<span class="keyword">protected</span>:</div><div class="line"><a name="l00178"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html#aac7d969924423782bca5c78f9b10ca04">  178</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationJointBase.html#aac7d969924423782bca5c78f9b10ca04">PxArticulationJointBase</a>(<a class="code" href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a> concreteType, <a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxBase.html">PxBase</a>(concreteType, baseFlags) {}</div><div class="line"><a name="l00179"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html#ac393f6100547e633015e60a0113d7dbf">  179</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationJointBase.html#ac393f6100547e633015e60a0113d7dbf">PxArticulationJointBase</a>(<a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxBase.html">PxBase</a>(baseFlags) {}</div><div class="line"><a name="l00180"></a><span class="lineno">  180</span>&#160;    </div><div class="line"><a name="l00181"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html#ae0437f39b24665d7f8c49d258eea8d0d">  181</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>            <a class="code" href="classPxArticulationJointBase.html#ae0437f39b24665d7f8c49d258eea8d0d">isKindOf</a>(<span class="keyword">const</span> <span class="keywordtype">char</span>* name)<span class="keyword">  const </span>{ <span class="keywordflow">return</span> !::strcmp(<span class="stringliteral">&quot;PxArticulationJointBase&quot;</span>, name) || <a class="code" href="classPxBase.html#addfc067afca2f67bda6c497b14424962">PxBase::isKindOf</a>(name); }</div><div class="line"><a name="l00182"></a><span class="lineno">  182</span>&#160;};</div><div class="line"><a name="l00183"></a><span class="lineno">  183</span>&#160;</div><div class="line"><a name="l00184"></a><span class="lineno">  184</span>&#160;</div><div class="line"><a name="l00185"></a><span class="lineno">  185</span>&#160;</div><div class="line"><a name="l00195"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html">  195</a></span>&#160;<span class="keyword">class </span><a class="code" href="classPxArticulationJoint.html">PxArticulationJoint</a> : <span class="keyword">public</span> <a class="code" href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="line"><a name="l00196"></a><span class="lineno">  196</span>&#160;{</div><div class="line"><a name="l00197"></a><span class="lineno">  197</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00198"></a><span class="lineno">  198</span>&#160;</div><div class="line"><a name="l00211"></a><span class="lineno">  211</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTargetOrientation(<span class="keyword">const</span> <a class="code" href="classPxQuat.html">PxQuat</a>&amp; orientation) = 0;</div><div class="line"><a name="l00212"></a><span class="lineno">  212</span>&#160;</div><div class="line"><a name="l00220"></a><span class="lineno">  220</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxQuat.html">PxQuat</a>          getTargetOrientation() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00221"></a><span class="lineno">  221</span>&#160;</div><div class="line"><a name="l00232"></a><span class="lineno">  232</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTargetVelocity(<span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; velocity) = 0;</div><div class="line"><a name="l00233"></a><span class="lineno">  233</span>&#160;</div><div class="line"><a name="l00241"></a><span class="lineno">  241</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxVec3.html">PxVec3</a>          getTargetVelocity() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00242"></a><span class="lineno">  242</span>&#160;</div><div class="line"><a name="l00243"></a><span class="lineno">  243</span>&#160;</div><div class="line"><a name="l00252"></a><span class="lineno">  252</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setDriveType(<a class="code" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">PxArticulationJointDriveType::Enum</a> driveType) = 0;</div><div class="line"><a name="l00253"></a><span class="lineno">  253</span>&#160;</div><div class="line"><a name="l00261"></a><span class="lineno">  261</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">PxArticulationJointDriveType::Enum</a></div><div class="line"><a name="l00262"></a><span class="lineno">  262</span>&#160;                                getDriveType() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00263"></a><span class="lineno">  263</span>&#160;</div><div class="line"><a name="l00264"></a><span class="lineno">  264</span>&#160;</div><div class="line"><a name="l00265"></a><span class="lineno">  265</span>&#160;    </div><div class="line"><a name="l00279"></a><span class="lineno">  279</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setStiffness(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="PxConstraintDesc_8h.html#ab545718d2685b45ef4df86c181ca0baa">spring</a>) = 0;</div><div class="line"><a name="l00280"></a><span class="lineno">  280</span>&#160;</div><div class="line"><a name="l00288"></a><span class="lineno">  288</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getStiffness() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00289"></a><span class="lineno">  289</span>&#160;</div><div class="line"><a name="l00290"></a><span class="lineno">  290</span>&#160;</div><div class="line"><a name="l00304"></a><span class="lineno">  304</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setDamping(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="PxConstraintDesc_8h.html#adc081240b94af5fc9f010e0583263581">damping</a>) = 0;</div><div class="line"><a name="l00305"></a><span class="lineno">  305</span>&#160;</div><div class="line"><a name="l00312"></a><span class="lineno">  312</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getDamping() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00313"></a><span class="lineno">  313</span>&#160;</div><div class="line"><a name="l00333"></a><span class="lineno">  333</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setInternalCompliance(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> compliance) = 0;</div><div class="line"><a name="l00334"></a><span class="lineno">  334</span>&#160;</div><div class="line"><a name="l00335"></a><span class="lineno">  335</span>&#160;</div><div class="line"><a name="l00343"></a><span class="lineno">  343</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getInternalCompliance() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00344"></a><span class="lineno">  344</span>&#160;</div><div class="line"><a name="l00364"></a><span class="lineno">  364</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setExternalCompliance(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> compliance) = 0;</div><div class="line"><a name="l00365"></a><span class="lineno">  365</span>&#160;</div><div class="line"><a name="l00373"></a><span class="lineno">  373</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getExternalCompliance() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00374"></a><span class="lineno">  374</span>&#160;</div><div class="line"><a name="l00375"></a><span class="lineno">  375</span>&#160;</div><div class="line"><a name="l00376"></a><span class="lineno">  376</span>&#160;</div><div class="line"><a name="l00390"></a><span class="lineno">  390</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setSwingLimit(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> zLimit, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> yLimit) = 0;</div><div class="line"><a name="l00391"></a><span class="lineno">  391</span>&#160;</div><div class="line"><a name="l00392"></a><span class="lineno">  392</span>&#160;</div><div class="line"><a name="l00403"></a><span class="lineno">  403</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            getSwingLimit(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>&amp; zLimit, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>&amp; yLimit) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00404"></a><span class="lineno">  404</span>&#160;</div><div class="line"><a name="l00405"></a><span class="lineno">  405</span>&#160;</div><div class="line"><a name="l00406"></a><span class="lineno">  406</span>&#160;</div><div class="line"><a name="l00413"></a><span class="lineno">  413</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTangentialStiffness(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="PxConstraintDesc_8h.html#ab545718d2685b45ef4df86c181ca0baa">spring</a>) = 0;</div><div class="line"><a name="l00414"></a><span class="lineno">  414</span>&#160;</div><div class="line"><a name="l00415"></a><span class="lineno">  415</span>&#160;</div><div class="line"><a name="l00423"></a><span class="lineno">  423</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getTangentialStiffness() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00424"></a><span class="lineno">  424</span>&#160;</div><div class="line"><a name="l00425"></a><span class="lineno">  425</span>&#160;</div><div class="line"><a name="l00432"></a><span class="lineno">  432</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTangentialDamping(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="PxConstraintDesc_8h.html#adc081240b94af5fc9f010e0583263581">damping</a>) = 0;</div><div class="line"><a name="l00433"></a><span class="lineno">  433</span>&#160;</div><div class="line"><a name="l00434"></a><span class="lineno">  434</span>&#160;</div><div class="line"><a name="l00442"></a><span class="lineno">  442</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getTangentialDamping() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00443"></a><span class="lineno">  443</span>&#160;</div><div class="line"><a name="l00444"></a><span class="lineno">  444</span>&#160;</div><div class="line"><a name="l00456"></a><span class="lineno">  456</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setSwingLimitContactDistance(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> contactDistance) = 0;</div><div class="line"><a name="l00457"></a><span class="lineno">  457</span>&#160;</div><div class="line"><a name="l00458"></a><span class="lineno">  458</span>&#160;</div><div class="line"><a name="l00466"></a><span class="lineno">  466</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getSwingLimitContactDistance() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00467"></a><span class="lineno">  467</span>&#160;</div><div class="line"><a name="l00468"></a><span class="lineno">  468</span>&#160;</div><div class="line"><a name="l00469"></a><span class="lineno">  469</span>&#160;</div><div class="line"><a name="l00478"></a><span class="lineno">  478</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setSwingLimitEnabled(<span class="keywordtype">bool</span> enabled) = 0;</div><div class="line"><a name="l00479"></a><span class="lineno">  479</span>&#160;</div><div class="line"><a name="l00488"></a><span class="lineno">  488</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>            getSwingLimitEnabled() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00489"></a><span class="lineno">  489</span>&#160;</div><div class="line"><a name="l00490"></a><span class="lineno">  490</span>&#160;</div><div class="line"><a name="l00503"></a><span class="lineno">  503</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTwistLimit(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> lower, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> upper) = 0;</div><div class="line"><a name="l00504"></a><span class="lineno">  504</span>&#160;</div><div class="line"><a name="l00514"></a><span class="lineno">  514</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            getTwistLimit(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> &amp;lower, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> &amp;upper) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00515"></a><span class="lineno">  515</span>&#160;</div><div class="line"><a name="l00524"></a><span class="lineno">  524</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTwistLimitEnabled(<span class="keywordtype">bool</span> enabled) = 0;</div><div class="line"><a name="l00525"></a><span class="lineno">  525</span>&#160;</div><div class="line"><a name="l00534"></a><span class="lineno">  534</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>            getTwistLimitEnabled() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00535"></a><span class="lineno">  535</span>&#160;</div><div class="line"><a name="l00536"></a><span class="lineno">  536</span>&#160;</div><div class="line"><a name="l00548"></a><span class="lineno">  548</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTwistLimitContactDistance(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> contactDistance) = 0;</div><div class="line"><a name="l00549"></a><span class="lineno">  549</span>&#160;</div><div class="line"><a name="l00550"></a><span class="lineno">  550</span>&#160;</div><div class="line"><a name="l00558"></a><span class="lineno">  558</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getTwistLimitContactDistance() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00559"></a><span class="lineno">  559</span>&#160;</div><div class="line"><a name="l00560"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a8516e8495e829c17c190bf1a013d6f7a">  560</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keyword">const</span> <span class="keywordtype">char</span>*     <a class="code" href="classPxArticulationJoint.html#a8516e8495e829c17c190bf1a013d6f7a">getConcreteTypeName</a>()<span class="keyword"> const                 </span>{   <span class="keywordflow">return</span> <span class="stringliteral">&quot;PxArticulationJoint&quot;</span>; }</div><div class="line"><a name="l00561"></a><span class="lineno">  561</span>&#160;</div><div class="line"><a name="l00562"></a><span class="lineno">  562</span>&#160;<span class="keyword">protected</span>:</div><div class="line"><a name="l00563"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a8ecb6fb89ec7e18314cfc9c6b255bd41">  563</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationJoint.html#a8ecb6fb89ec7e18314cfc9c6b255bd41">PxArticulationJoint</a>(<a class="code" href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a> concreteType, <a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxArticulationJointBase.html">PxArticulationJointBase</a>(concreteType, baseFlags) {}</div><div class="line"><a name="l00564"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a830694ba6d6b8bbe5dfdc71f446bac54">  564</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationJoint.html#a830694ba6d6b8bbe5dfdc71f446bac54">PxArticulationJoint</a>(<a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxArticulationJointBase.html">PxArticulationJointBase</a>(baseFlags) {}</div><div class="line"><a name="l00565"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a2a73babb0e7aa037d07132980281f302">  565</a></span>&#160;    <span class="keyword">virtual</span>                     <a class="code" href="classPxArticulationJoint.html#a2a73babb0e7aa037d07132980281f302">~PxArticulationJoint</a>() {}</div><div class="line"><a name="l00566"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a9c6e996a3fbd7abf27ab8e3cabe65d38">  566</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>            <a class="code" href="classPxArticulationJoint.html#a9c6e996a3fbd7abf27ab8e3cabe65d38">isKindOf</a>(<span class="keyword">const</span> <span class="keywordtype">char</span>* name)<span class="keyword">  const       </span>{   <span class="keywordflow">return</span> !::strcmp(<span class="stringliteral">&quot;PxArticulationJoint&quot;</span>, name) || <a class="code" href="classPxArticulationJointBase.html#ae0437f39b24665d7f8c49d258eea8d0d">PxArticulationJointBase::isKindOf</a>(name); }</div><div class="line"><a name="l00567"></a><span class="lineno">  567</span>&#160;};</div><div class="line"><a name="l00568"></a><span class="lineno">  568</span>&#160;</div><div class="line"><a name="l00569"></a><span class="lineno">  569</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00570"></a><span class="lineno">  570</span>&#160;} <span class="comment">// namespace physx</span></div><div class="line"><a name="l00571"></a><span class="lineno">  571</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00572"></a><span class="lineno">  572</span>&#160;</div><div class="line"><a name="l00574"></a><span class="lineno">  574</span>&#160;<span class="preprocessor">#endif</span></div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
-<div class="ttc" id="classPxArticulationJoint_html_a9c6e996a3fbd7abf27ab8e3cabe65d38"><div class="ttname"><a href="classPxArticulationJoint.html#a9c6e996a3fbd7abf27ab8e3cabe65d38">PxArticulationJoint::isKindOf</a></div><div class="ttdeci">virtual bool isKindOf(const char *name) const</div><div class="ttdoc">Returns whether a given type name matches with the type of this instance. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:566</div></div>
-<div class="ttc" id="classPxArticulationJoint_html_a2a73babb0e7aa037d07132980281f302"><div class="ttname"><a href="classPxArticulationJoint.html#a2a73babb0e7aa037d07132980281f302">PxArticulationJoint::~PxArticulationJoint</a></div><div class="ttdeci">virtual ~PxArticulationJoint()</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:565</div></div>
-<div class="ttc" id="classPxArticulationJoint_html"><div class="ttname"><a href="classPxArticulationJoint.html">PxArticulationJoint</a></div><div class="ttdoc">a joint between two links in an articulation. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:195</div></div>
-<div class="ttc" id="structPxArticulationAxis_html"><div class="ttname"><a href="structPxArticulationAxis.html">PxArticulationAxis</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:70</div></div>
+<a href="PxArticulationJoint_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  </span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;</div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#ifndef PX_PHYSICS_NX_ARTICULATION_JOINT</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;<span class="preprocessor">#define PX_PHYSICS_NX_ARTICULATION_JOINT</span></div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00036"></a><span class="lineno">   36</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxPhysXConfig_8h.html">PxPhysXConfig.h</a>&quot;</span></div><div class="line"><a name="l00037"></a><span class="lineno">   37</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxBase_8h.html">common/PxBase.h</a>&quot;</span></div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;</div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;{</div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;</div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;<span class="keyword">class </span>PxArticulationJointImpl;</div><div class="line"><a name="l00045"></a><span class="lineno">   45</span>&#160;</div><div class="line"><a name="l00059"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html">   59</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></div><div class="line"><a name="l00060"></a><span class="lineno">   60</span>&#160;{</div><div class="line"><a name="l00061"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">   61</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">Enum</a></div><div class="line"><a name="l00062"></a><span class="lineno">   62</span>&#160;    {</div><div class="line"><a name="l00063"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e">   63</a></span>&#160;        eTARGET = 0,            <span class="comment">// use the quaternion as the drive target</span></div><div class="line"><a name="l00064"></a><span class="lineno"><a class="line" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372">   64</a></span>&#160;        eERROR  = 1             <span class="comment">// use the vector part of the quaternion as the drive error.</span></div><div class="line"><a name="l00065"></a><span class="lineno">   65</span>&#160;    };</div><div class="line"><a name="l00066"></a><span class="lineno">   66</span>&#160;};</div><div class="line"><a name="l00067"></a><span class="lineno">   67</span>&#160;</div><div class="line"><a name="l00068"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html">   68</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxArticulationAxis.html">PxArticulationAxis</a></div><div class="line"><a name="l00069"></a><span class="lineno">   69</span>&#160;{</div><div class="line"><a name="l00070"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">   70</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">Enum</a></div><div class="line"><a name="l00071"></a><span class="lineno">   71</span>&#160;    {</div><div class="line"><a name="l00072"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a380a97841323f13390ef6cdc57f6265d">   72</a></span>&#160;        eTWIST = 0,</div><div class="line"><a name="l00073"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a2e1a274c036f8b7846476dbfe2015fec">   73</a></span>&#160;        eSWING1 = 1,</div><div class="line"><a name="l00074"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a33f154c8d454ab3f08f03f512cc03972">   74</a></span>&#160;        eSWING2 = 2,</div><div class="line"><a name="l00075"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a83f4dcfb4a4a87644225475a9e16c67e">   75</a></span>&#160;        eX = 3,</div><div class="line"><a name="l00076"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a582892dc65379497febf0d6c08dddc02">   76</a></span>&#160;        eY = 4,</div><div class="line"><a name="l00077"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a70a41042911d91bf76f507000adc09a4">   77</a></span>&#160;        eZ = 5,</div><div class="line"><a name="l00078"></a><span class="lineno"><a class="line" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a09384f4a095731741d93530048614de7">   78</a></span>&#160;        eCOUNT = 6</div><div class="line"><a name="l00079"></a><span class="lineno">   79</span>&#160;    };</div><div class="line"><a name="l00080"></a><span class="lineno">   80</span>&#160;};</div><div class="line"><a name="l00081"></a><span class="lineno">   81</span>&#160;</div><div class="line"><a name="l00082"></a><span class="lineno">   82</span>&#160;PX_FLAGS_OPERATORS(<a class="code" href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">PxArticulationAxis::Enum</a>, <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>)</div><div class="line"><a name="l00083"></a><span class="lineno">   83</span>&#160;</div><div class="line"><a name="l00084"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html">   84</a></span>&#160;struct <a class="code" href="structPxArticulationMotion.html">PxArticulationMotion</a></div><div class="line"><a name="l00085"></a><span class="lineno">   85</span>&#160;{</div><div class="line"><a name="l00086"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">   86</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">Enum</a></div><div class="line"><a name="l00087"></a><span class="lineno">   87</span>&#160;    {</div><div class="line"><a name="l00088"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879af640457f343a3cd57412d5aedf2b6808">   88</a></span>&#160;        eLOCKED = 0,</div><div class="line"><a name="l00089"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879a93e73f43c42672207a2abf9904d47a50">   89</a></span>&#160;        eLIMITED = 1,</div><div class="line"><a name="l00090"></a><span class="lineno"><a class="line" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879aa856908c1ad1720774753eee4ef7acae">   90</a></span>&#160;        eFREE = 2</div><div class="line"><a name="l00091"></a><span class="lineno">   91</span>&#160;    };</div><div class="line"><a name="l00092"></a><span class="lineno">   92</span>&#160;};</div><div class="line"><a name="l00093"></a><span class="lineno">   93</span>&#160;</div><div class="line"><a name="l00094"></a><span class="lineno"><a class="line" href="group__physics.html#ga4d2907efdef5e5b7b1becd1ab59c2405">   94</a></span>&#160;<span class="keyword">typedef</span> <a class="code" href="classPxFlags.html">PxFlags&lt;PxArticulationMotion::Enum, PxU8&gt;</a> <a class="code" href="group__physics.html#ga4d2907efdef5e5b7b1becd1ab59c2405">PxArticulationMotions</a>;</div><div class="line"><a name="l00095"></a><span class="lineno">   95</span>&#160;PX_FLAGS_OPERATORS(<a class="code" href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">PxArticulationMotion::Enum</a>, <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>)</div><div class="line"><a name="l00096"></a><span class="lineno">   96</span>&#160;</div><div class="line"><a name="l00097"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html">   97</a></span>&#160;struct <a class="code" href="structPxArticulationJointType.html">PxArticulationJointType</a></div><div class="line"><a name="l00098"></a><span class="lineno">   98</span>&#160;{</div><div class="line"><a name="l00099"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0b">   99</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0b">Enum</a></div><div class="line"><a name="l00100"></a><span class="lineno">  100</span>&#160;    {</div><div class="line"><a name="l00101"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0ba20eb6c657a0b29898c2ab308b2e68ba3">  101</a></span>&#160;        ePRISMATIC = 0,</div><div class="line"><a name="l00102"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0ba9bb388357d6c57179a26508f00e7d2c7">  102</a></span>&#160;        eREVOLUTE = 1,</div><div class="line"><a name="l00103"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0ba6e0b21355b6023f41e8e5be799feee91">  103</a></span>&#160;        eSPHERICAL = 2,</div><div class="line"><a name="l00104"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0ba3a5aad862ed0a4ad0ce289d55ce08531">  104</a></span>&#160;        eFIX = 3,</div><div class="line"><a name="l00105"></a><span class="lineno"><a class="line" href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0baa3b41bcb6b6b318f64d2dfb3073f277e">  105</a></span>&#160;        eUNDEFINED = 4</div><div class="line"><a name="l00106"></a><span class="lineno">  106</span>&#160;    };</div><div class="line"><a name="l00107"></a><span class="lineno">  107</span>&#160;};</div><div class="line"><a name="l00108"></a><span class="lineno">  108</span>&#160;</div><div class="line"><a name="l00109"></a><span class="lineno">  109</span>&#160;</div><div class="line"><a name="l00110"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html">  110</a></span>&#160;<span class="keyword">class </span><a class="code" href="classPxArticulationJointBase.html">PxArticulationJointBase</a> : <span class="keyword">public</span> <a class="code" href="classPxBase.html">PxBase</a></div><div class="line"><a name="l00111"></a><span class="lineno">  111</span>&#160;{</div><div class="line"><a name="l00112"></a><span class="lineno">  112</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00118"></a><span class="lineno">  118</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxArticulationLink.html">PxArticulationLink</a>&amp; getParentArticulationLink() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00119"></a><span class="lineno">  119</span>&#160;</div><div class="line"><a name="l00129"></a><span class="lineno">  129</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setParentPose(<span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; pose) = 0;</div><div class="line"><a name="l00130"></a><span class="lineno">  130</span>&#160;</div><div class="line"><a name="l00139"></a><span class="lineno">  139</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxTransform.html">PxTransform</a>     getParentPose() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00140"></a><span class="lineno">  140</span>&#160;</div><div class="line"><a name="l00146"></a><span class="lineno">  146</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxArticulationLink.html">PxArticulationLink</a>&amp; getChildArticulationLink() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00147"></a><span class="lineno">  147</span>&#160;</div><div class="line"><a name="l00148"></a><span class="lineno">  148</span>&#160;</div><div class="line"><a name="l00158"></a><span class="lineno">  158</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setChildPose(<span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; pose) = 0;</div><div class="line"><a name="l00159"></a><span class="lineno">  159</span>&#160;</div><div class="line"><a name="l00167"></a><span class="lineno">  167</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxTransform.html">PxTransform</a>     getChildPose() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00168"></a><span class="lineno">  168</span>&#160;</div><div class="line"><a name="l00169"></a><span class="lineno">  169</span>&#160;    <span class="keyword">virtual</span>     PxArticulationJointImpl* getImpl() = 0;</div><div class="line"><a name="l00170"></a><span class="lineno">  170</span>&#160;    <span class="keyword">virtual</span>     <span class="keyword">const</span> PxArticulationJointImpl* getImpl() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00171"></a><span class="lineno">  171</span>&#160;</div><div class="line"><a name="l00172"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html#afefce115db74c3bcfbc3f3052e18f14d">  172</a></span>&#160;    <span class="keyword">virtual</span>                     <a class="code" href="classPxArticulationJointBase.html#afefce115db74c3bcfbc3f3052e18f14d">~PxArticulationJointBase</a>() {}</div><div class="line"><a name="l00173"></a><span class="lineno">  173</span>&#160;</div><div class="line"><a name="l00174"></a><span class="lineno">  174</span>&#160;<span class="keyword">private</span>:</div><div class="line"><a name="l00175"></a><span class="lineno">  175</span>&#160;<span class="keyword">protected</span>:</div><div class="line"><a name="l00176"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html#aac7d969924423782bca5c78f9b10ca04">  176</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationJointBase.html#aac7d969924423782bca5c78f9b10ca04">PxArticulationJointBase</a>(<a class="code" href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a> concreteType, <a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxBase.html">PxBase</a>(concreteType, baseFlags) {}</div><div class="line"><a name="l00177"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html#ac393f6100547e633015e60a0113d7dbf">  177</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationJointBase.html#ac393f6100547e633015e60a0113d7dbf">PxArticulationJointBase</a>(<a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxBase.html">PxBase</a>(baseFlags) {}</div><div class="line"><a name="l00178"></a><span class="lineno">  178</span>&#160;    </div><div class="line"><a name="l00179"></a><span class="lineno"><a class="line" href="classPxArticulationJointBase.html#ae0437f39b24665d7f8c49d258eea8d0d">  179</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>            <a class="code" href="classPxArticulationJointBase.html#ae0437f39b24665d7f8c49d258eea8d0d">isKindOf</a>(<span class="keyword">const</span> <span class="keywordtype">char</span>* name)<span class="keyword">  const </span>{ <span class="keywordflow">return</span> !::strcmp(<span class="stringliteral">&quot;PxArticulationJointBase&quot;</span>, name) || <a class="code" href="classPxBase.html#addfc067afca2f67bda6c497b14424962">PxBase::isKindOf</a>(name); }</div><div class="line"><a name="l00180"></a><span class="lineno">  180</span>&#160;};</div><div class="line"><a name="l00181"></a><span class="lineno">  181</span>&#160;</div><div class="line"><a name="l00182"></a><span class="lineno">  182</span>&#160;</div><div class="line"><a name="l00183"></a><span class="lineno">  183</span>&#160;</div><div class="line"><a name="l00193"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html">  193</a></span>&#160;<span class="keyword">class </span><a class="code" href="classPxArticulationJoint.html">PxArticulationJoint</a> : <span class="keyword">public</span> <a class="code" href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="line"><a name="l00194"></a><span class="lineno">  194</span>&#160;{</div><div class="line"><a name="l00195"></a><span class="lineno">  195</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00196"></a><span class="lineno">  196</span>&#160;</div><div class="line"><a name="l00209"></a><span class="lineno">  209</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTargetOrientation(<span class="keyword">const</span> <a class="code" href="classPxQuat.html">PxQuat</a>&amp; orientation) = 0;</div><div class="line"><a name="l00210"></a><span class="lineno">  210</span>&#160;</div><div class="line"><a name="l00218"></a><span class="lineno">  218</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxQuat.html">PxQuat</a>          getTargetOrientation() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00219"></a><span class="lineno">  219</span>&#160;</div><div class="line"><a name="l00230"></a><span class="lineno">  230</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTargetVelocity(<span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; velocity) = 0;</div><div class="line"><a name="l00231"></a><span class="lineno">  231</span>&#160;</div><div class="line"><a name="l00239"></a><span class="lineno">  239</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxVec3.html">PxVec3</a>          getTargetVelocity() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00240"></a><span class="lineno">  240</span>&#160;</div><div class="line"><a name="l00241"></a><span class="lineno">  241</span>&#160;</div><div class="line"><a name="l00250"></a><span class="lineno">  250</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setDriveType(<a class="code" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">PxArticulationJointDriveType::Enum</a> driveType) = 0;</div><div class="line"><a name="l00251"></a><span class="lineno">  251</span>&#160;</div><div class="line"><a name="l00259"></a><span class="lineno">  259</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">PxArticulationJointDriveType::Enum</a></div><div class="line"><a name="l00260"></a><span class="lineno">  260</span>&#160;                                getDriveType() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00261"></a><span class="lineno">  261</span>&#160;</div><div class="line"><a name="l00262"></a><span class="lineno">  262</span>&#160;</div><div class="line"><a name="l00263"></a><span class="lineno">  263</span>&#160;    </div><div class="line"><a name="l00277"></a><span class="lineno">  277</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setStiffness(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="PxConstraintDesc_8h.html#ab545718d2685b45ef4df86c181ca0baa">spring</a>) = 0;</div><div class="line"><a name="l00278"></a><span class="lineno">  278</span>&#160;</div><div class="line"><a name="l00286"></a><span class="lineno">  286</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getStiffness() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00287"></a><span class="lineno">  287</span>&#160;</div><div class="line"><a name="l00288"></a><span class="lineno">  288</span>&#160;</div><div class="line"><a name="l00302"></a><span class="lineno">  302</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setDamping(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="PxConstraintDesc_8h.html#adc081240b94af5fc9f010e0583263581">damping</a>) = 0;</div><div class="line"><a name="l00303"></a><span class="lineno">  303</span>&#160;</div><div class="line"><a name="l00310"></a><span class="lineno">  310</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getDamping() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00311"></a><span class="lineno">  311</span>&#160;</div><div class="line"><a name="l00331"></a><span class="lineno">  331</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setInternalCompliance(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> compliance) = 0;</div><div class="line"><a name="l00332"></a><span class="lineno">  332</span>&#160;</div><div class="line"><a name="l00333"></a><span class="lineno">  333</span>&#160;</div><div class="line"><a name="l00341"></a><span class="lineno">  341</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getInternalCompliance() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00342"></a><span class="lineno">  342</span>&#160;</div><div class="line"><a name="l00362"></a><span class="lineno">  362</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setExternalCompliance(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> compliance) = 0;</div><div class="line"><a name="l00363"></a><span class="lineno">  363</span>&#160;</div><div class="line"><a name="l00371"></a><span class="lineno">  371</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getExternalCompliance() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00372"></a><span class="lineno">  372</span>&#160;</div><div class="line"><a name="l00373"></a><span class="lineno">  373</span>&#160;</div><div class="line"><a name="l00374"></a><span class="lineno">  374</span>&#160;</div><div class="line"><a name="l00388"></a><span class="lineno">  388</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setSwingLimit(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> zLimit, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> yLimit) = 0;</div><div class="line"><a name="l00389"></a><span class="lineno">  389</span>&#160;</div><div class="line"><a name="l00390"></a><span class="lineno">  390</span>&#160;</div><div class="line"><a name="l00401"></a><span class="lineno">  401</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            getSwingLimit(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>&amp; zLimit, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>&amp; yLimit) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00402"></a><span class="lineno">  402</span>&#160;</div><div class="line"><a name="l00403"></a><span class="lineno">  403</span>&#160;</div><div class="line"><a name="l00404"></a><span class="lineno">  404</span>&#160;</div><div class="line"><a name="l00411"></a><span class="lineno">  411</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTangentialStiffness(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="PxConstraintDesc_8h.html#ab545718d2685b45ef4df86c181ca0baa">spring</a>) = 0;</div><div class="line"><a name="l00412"></a><span class="lineno">  412</span>&#160;</div><div class="line"><a name="l00413"></a><span class="lineno">  413</span>&#160;</div><div class="line"><a name="l00421"></a><span class="lineno">  421</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getTangentialStiffness() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00422"></a><span class="lineno">  422</span>&#160;</div><div class="line"><a name="l00423"></a><span class="lineno">  423</span>&#160;</div><div class="line"><a name="l00430"></a><span class="lineno">  430</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTangentialDamping(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="PxConstraintDesc_8h.html#adc081240b94af5fc9f010e0583263581">damping</a>) = 0;</div><div class="line"><a name="l00431"></a><span class="lineno">  431</span>&#160;</div><div class="line"><a name="l00432"></a><span class="lineno">  432</span>&#160;</div><div class="line"><a name="l00440"></a><span class="lineno">  440</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getTangentialDamping() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00441"></a><span class="lineno">  441</span>&#160;</div><div class="line"><a name="l00442"></a><span class="lineno">  442</span>&#160;</div><div class="line"><a name="l00454"></a><span class="lineno">  454</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setSwingLimitContactDistance(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> contactDistance) = 0;</div><div class="line"><a name="l00455"></a><span class="lineno">  455</span>&#160;</div><div class="line"><a name="l00456"></a><span class="lineno">  456</span>&#160;</div><div class="line"><a name="l00464"></a><span class="lineno">  464</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getSwingLimitContactDistance() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00465"></a><span class="lineno">  465</span>&#160;</div><div class="line"><a name="l00466"></a><span class="lineno">  466</span>&#160;</div><div class="line"><a name="l00467"></a><span class="lineno">  467</span>&#160;</div><div class="line"><a name="l00476"></a><span class="lineno">  476</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setSwingLimitEnabled(<span class="keywordtype">bool</span> enabled) = 0;</div><div class="line"><a name="l00477"></a><span class="lineno">  477</span>&#160;</div><div class="line"><a name="l00486"></a><span class="lineno">  486</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>            getSwingLimitEnabled() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00487"></a><span class="lineno">  487</span>&#160;</div><div class="line"><a name="l00488"></a><span class="lineno">  488</span>&#160;</div><div class="line"><a name="l00501"></a><span class="lineno">  501</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTwistLimit(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> lower, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> upper) = 0;</div><div class="line"><a name="l00502"></a><span class="lineno">  502</span>&#160;</div><div class="line"><a name="l00512"></a><span class="lineno">  512</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            getTwistLimit(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> &amp;lower, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> &amp;upper) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00513"></a><span class="lineno">  513</span>&#160;</div><div class="line"><a name="l00522"></a><span class="lineno">  522</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTwistLimitEnabled(<span class="keywordtype">bool</span> enabled) = 0;</div><div class="line"><a name="l00523"></a><span class="lineno">  523</span>&#160;</div><div class="line"><a name="l00532"></a><span class="lineno">  532</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>            getTwistLimitEnabled() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00533"></a><span class="lineno">  533</span>&#160;</div><div class="line"><a name="l00534"></a><span class="lineno">  534</span>&#160;</div><div class="line"><a name="l00546"></a><span class="lineno">  546</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>            setTwistLimitContactDistance(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> contactDistance) = 0;</div><div class="line"><a name="l00547"></a><span class="lineno">  547</span>&#160;</div><div class="line"><a name="l00548"></a><span class="lineno">  548</span>&#160;</div><div class="line"><a name="l00556"></a><span class="lineno">  556</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          getTwistLimitContactDistance() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00557"></a><span class="lineno">  557</span>&#160;</div><div class="line"><a name="l00558"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a8516e8495e829c17c190bf1a013d6f7a">  558</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keyword">const</span> <span class="keywordtype">char</span>*     <a class="code" href="classPxArticulationJoint.html#a8516e8495e829c17c190bf1a013d6f7a">getConcreteTypeName</a>()<span class="keyword"> const                 </span>{   <span class="keywordflow">return</span> <span class="stringliteral">&quot;PxArticulationJoint&quot;</span>; }</div><div class="line"><a name="l00559"></a><span class="lineno">  559</span>&#160;</div><div class="line"><a name="l00560"></a><span class="lineno">  560</span>&#160;<span class="keyword">protected</span>:</div><div class="line"><a name="l00561"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a8ecb6fb89ec7e18314cfc9c6b255bd41">  561</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationJoint.html#a8ecb6fb89ec7e18314cfc9c6b255bd41">PxArticulationJoint</a>(<a class="code" href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a> concreteType, <a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxArticulationJointBase.html">PxArticulationJointBase</a>(concreteType, baseFlags) {}</div><div class="line"><a name="l00562"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a830694ba6d6b8bbe5dfdc71f446bac54">  562</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationJoint.html#a830694ba6d6b8bbe5dfdc71f446bac54">PxArticulationJoint</a>(<a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxArticulationJointBase.html">PxArticulationJointBase</a>(baseFlags) {}</div><div class="line"><a name="l00563"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a2a73babb0e7aa037d07132980281f302">  563</a></span>&#160;    <span class="keyword">virtual</span>                     <a class="code" href="classPxArticulationJoint.html#a2a73babb0e7aa037d07132980281f302">~PxArticulationJoint</a>() {}</div><div class="line"><a name="l00564"></a><span class="lineno"><a class="line" href="classPxArticulationJoint.html#a9c6e996a3fbd7abf27ab8e3cabe65d38">  564</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>            <a class="code" href="classPxArticulationJoint.html#a9c6e996a3fbd7abf27ab8e3cabe65d38">isKindOf</a>(<span class="keyword">const</span> <span class="keywordtype">char</span>* name)<span class="keyword">  const       </span>{   <span class="keywordflow">return</span> !::strcmp(<span class="stringliteral">&quot;PxArticulationJoint&quot;</span>, name) || <a class="code" href="classPxArticulationJointBase.html#ae0437f39b24665d7f8c49d258eea8d0d">PxArticulationJointBase::isKindOf</a>(name); }</div><div class="line"><a name="l00565"></a><span class="lineno">  565</span>&#160;};</div><div class="line"><a name="l00566"></a><span class="lineno">  566</span>&#160;</div><div class="line"><a name="l00567"></a><span class="lineno">  567</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00568"></a><span class="lineno">  568</span>&#160;} <span class="comment">// namespace physx</span></div><div class="line"><a name="l00569"></a><span class="lineno">  569</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00570"></a><span class="lineno">  570</span>&#160;</div><div class="line"><a name="l00572"></a><span class="lineno">  572</span>&#160;<span class="preprocessor">#endif</span></div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
+<div class="ttc" id="classPxArticulationJoint_html_a9c6e996a3fbd7abf27ab8e3cabe65d38"><div class="ttname"><a href="classPxArticulationJoint.html#a9c6e996a3fbd7abf27ab8e3cabe65d38">PxArticulationJoint::isKindOf</a></div><div class="ttdeci">virtual bool isKindOf(const char *name) const</div><div class="ttdoc">Returns whether a given type name matches with the type of this instance. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:564</div></div>
+<div class="ttc" id="classPxArticulationJoint_html_a2a73babb0e7aa037d07132980281f302"><div class="ttname"><a href="classPxArticulationJoint.html#a2a73babb0e7aa037d07132980281f302">PxArticulationJoint::~PxArticulationJoint</a></div><div class="ttdeci">virtual ~PxArticulationJoint()</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:563</div></div>
+<div class="ttc" id="classPxArticulationJoint_html"><div class="ttname"><a href="classPxArticulationJoint.html">PxArticulationJoint</a></div><div class="ttdoc">a joint between two links in an articulation. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:193</div></div>
+<div class="ttc" id="structPxArticulationAxis_html"><div class="ttname"><a href="structPxArticulationAxis.html">PxArticulationAxis</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:68</div></div>
 <div class="ttc" id="classPxQuat_html"><div class="ttname"><a href="classPxQuat.html">PxQuat</a></div><div class="ttdoc">This is a quaternion class. For more information on quaternion mathematics consult a mathematics sour...</div><div class="ttdef"><b>Definition:</b> PxQuat.h:49</div></div>
 <div class="ttc" id="namespacephysx_html_a727d2d8426e2a21ebbc522fa65c3f97a"><div class="ttname"><a href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">physx::PxReal</a></div><div class="ttdeci">float PxReal</div><div class="ttdef"><b>Definition:</b> PxSimpleTypes.h:78</div></div>
 <div class="ttc" id="PxConstraintDesc_8h_html_ab545718d2685b45ef4df86c181ca0baa"><div class="ttname"><a href="PxConstraintDesc_8h.html#ab545718d2685b45ef4df86c181ca0baa">spring</a></div><div class="ttdeci">struct @6::SpringModifiers spring</div></div>
 <div class="ttc" id="classPxBase_html_addfc067afca2f67bda6c497b14424962"><div class="ttname"><a href="classPxBase.html#addfc067afca2f67bda6c497b14424962">PxBase::isKindOf</a></div><div class="ttdeci">virtual bool isKindOf(const char *superClass) const</div><div class="ttdoc">Returns whether a given type name matches with the type of this instance. </div><div class="ttdef"><b>Definition:</b> PxBase.h:178</div></div>
 <div class="ttc" id="structPxArticulationJointDriveType_html"><div class="ttname"><a href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></div><div class="ttdoc">The type of joint drive to use for the articulation joint. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:59</div></div>
-<div class="ttc" id="structPxArticulationMotion_html_af04dcaf188047e88858bb7352bdfe879"><div class="ttname"><a href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">PxArticulationMotion::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:88</div></div>
+<div class="ttc" id="structPxArticulationMotion_html_af04dcaf188047e88858bb7352bdfe879"><div class="ttname"><a href="structPxArticulationMotion.html#af04dcaf188047e88858bb7352bdfe879">PxArticulationMotion::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:86</div></div>
 <div class="ttc" id="PxBase_8h_html"><div class="ttname"><a href="PxBase_8h.html">PxBase.h</a></div></div>
 <div class="ttc" id="PxPhysXConfig_8h_html"><div class="ttname"><a href="PxPhysXConfig_8h.html">PxPhysXConfig.h</a></div></div>
 <div class="ttc" id="group__common_html_gac1fb4b256a5d900d394e89db170a2b79"><div class="ttname"><a href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a></div><div class="ttdeci">PxU16 PxType</div><div class="ttdef"><b>Definition:</b> PxBase.h:49</div></div>
-<div class="ttc" id="structPxArticulationJointType_html"><div class="ttname"><a href="structPxArticulationJointType.html">PxArticulationJointType</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:99</div></div>
-<div class="ttc" id="classPxArticulationJointBase_html_ae0437f39b24665d7f8c49d258eea8d0d"><div class="ttname"><a href="classPxArticulationJointBase.html#ae0437f39b24665d7f8c49d258eea8d0d">PxArticulationJointBase::isKindOf</a></div><div class="ttdeci">virtual bool isKindOf(const char *name) const</div><div class="ttdoc">Returns whether a given type name matches with the type of this instance. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:181</div></div>
-<div class="ttc" id="classPxArticulationJoint_html_a8516e8495e829c17c190bf1a013d6f7a"><div class="ttname"><a href="classPxArticulationJoint.html#a8516e8495e829c17c190bf1a013d6f7a">PxArticulationJoint::getConcreteTypeName</a></div><div class="ttdeci">virtual const char * getConcreteTypeName() const</div><div class="ttdoc">Returns string name of dynamic type. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:560</div></div>
+<div class="ttc" id="structPxArticulationJointType_html"><div class="ttname"><a href="structPxArticulationJointType.html">PxArticulationJointType</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:97</div></div>
+<div class="ttc" id="classPxArticulationJointBase_html_ae0437f39b24665d7f8c49d258eea8d0d"><div class="ttname"><a href="classPxArticulationJointBase.html#ae0437f39b24665d7f8c49d258eea8d0d">PxArticulationJointBase::isKindOf</a></div><div class="ttdeci">virtual bool isKindOf(const char *name) const</div><div class="ttdoc">Returns whether a given type name matches with the type of this instance. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:179</div></div>
+<div class="ttc" id="classPxArticulationJoint_html_a8516e8495e829c17c190bf1a013d6f7a"><div class="ttname"><a href="classPxArticulationJoint.html#a8516e8495e829c17c190bf1a013d6f7a">PxArticulationJoint::getConcreteTypeName</a></div><div class="ttdeci">virtual const char * getConcreteTypeName() const</div><div class="ttdoc">Returns string name of dynamic type. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:558</div></div>
 <div class="ttc" id="classPxArticulationLink_html"><div class="ttname"><a href="classPxArticulationLink.html">PxArticulationLink</a></div><div class="ttdoc">a component of an articulation that represents a rigid body </div><div class="ttdef"><b>Definition:</b> PxArticulationLink.h:57</div></div>
-<div class="ttc" id="group__physics_html_ga4d2907efdef5e5b7b1becd1ab59c2405"><div class="ttname"><a href="group__physics.html#ga4d2907efdef5e5b7b1becd1ab59c2405">PxArticulationMotions</a></div><div class="ttdeci">PxFlags&lt; PxArticulationMotion::Enum, PxU8 &gt; PxArticulationMotions</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:96</div></div>
+<div class="ttc" id="group__physics_html_ga4d2907efdef5e5b7b1becd1ab59c2405"><div class="ttname"><a href="group__physics.html#ga4d2907efdef5e5b7b1becd1ab59c2405">PxArticulationMotions</a></div><div class="ttdeci">PxFlags&lt; PxArticulationMotion::Enum, PxU8 &gt; PxArticulationMotions</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:94</div></div>
 <div class="ttc" id="structPxArticulationJointDriveType_html_ac6be2c7afe87dd7755dae4aa7a0fdfe2"><div class="ttname"><a href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">PxArticulationJointDriveType::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:61</div></div>
-<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:112</div></div>
+<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:110</div></div>
 <div class="ttc" id="namespacephysx_html_a3849f86abc21d3a58949481603fe8309"><div class="ttname"><a href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">physx::PxU8</a></div><div class="ttdeci">uint8_t PxU8</div><div class="ttdef"><b>Definition:</b> PxSimpleTypes.h:75</div></div>
 <div class="ttc" id="PxConstraintDesc_8h_html_adc081240b94af5fc9f010e0583263581"><div class="ttname"><a href="PxConstraintDesc_8h.html#adc081240b94af5fc9f010e0583263581">damping</a></div><div class="ttdeci">PxReal damping</div><div class="ttdoc">damping parameter, for spring constraints </div><div class="ttdef"><b>Definition:</b> PxConstraintDesc.h:100</div></div>
 <div class="ttc" id="classPxTransform_html"><div class="ttname"><a href="classPxTransform.html">PxTransform</a></div><div class="ttdoc">class representing a rigid euclidean transform as a quaternion and a vector </div><div class="ttdef"><b>Definition:</b> PxTransform.h:48</div></div>
-<div class="ttc" id="classPxArticulationJointBase_html_ac393f6100547e633015e60a0113d7dbf"><div class="ttname"><a href="classPxArticulationJointBase.html#ac393f6100547e633015e60a0113d7dbf">PxArticulationJointBase::PxArticulationJointBase</a></div><div class="ttdeci">PX_INLINE PxArticulationJointBase(PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:179</div></div>
-<div class="ttc" id="classPxArticulationJoint_html_a8ecb6fb89ec7e18314cfc9c6b255bd41"><div class="ttname"><a href="classPxArticulationJoint.html#a8ecb6fb89ec7e18314cfc9c6b255bd41">PxArticulationJoint::PxArticulationJoint</a></div><div class="ttdeci">PX_INLINE PxArticulationJoint(PxType concreteType, PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:563</div></div>
-<div class="ttc" id="classPxArticulationJointBase_html_afefce115db74c3bcfbc3f3052e18f14d"><div class="ttname"><a href="classPxArticulationJointBase.html#afefce115db74c3bcfbc3f3052e18f14d">PxArticulationJointBase::~PxArticulationJointBase</a></div><div class="ttdeci">virtual ~PxArticulationJointBase()</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:174</div></div>
-<div class="ttc" id="structPxArticulationJointType_html_a0df5bb8217acae97e63421bedbfe0a0b"><div class="ttname"><a href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0b">PxArticulationJointType::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:101</div></div>
+<div class="ttc" id="classPxArticulationJointBase_html_ac393f6100547e633015e60a0113d7dbf"><div class="ttname"><a href="classPxArticulationJointBase.html#ac393f6100547e633015e60a0113d7dbf">PxArticulationJointBase::PxArticulationJointBase</a></div><div class="ttdeci">PX_INLINE PxArticulationJointBase(PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:177</div></div>
+<div class="ttc" id="classPxArticulationJoint_html_a8ecb6fb89ec7e18314cfc9c6b255bd41"><div class="ttname"><a href="classPxArticulationJoint.html#a8ecb6fb89ec7e18314cfc9c6b255bd41">PxArticulationJoint::PxArticulationJoint</a></div><div class="ttdeci">PX_INLINE PxArticulationJoint(PxType concreteType, PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:561</div></div>
+<div class="ttc" id="classPxArticulationJointBase_html_afefce115db74c3bcfbc3f3052e18f14d"><div class="ttname"><a href="classPxArticulationJointBase.html#afefce115db74c3bcfbc3f3052e18f14d">PxArticulationJointBase::~PxArticulationJointBase</a></div><div class="ttdeci">virtual ~PxArticulationJointBase()</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:172</div></div>
+<div class="ttc" id="structPxArticulationJointType_html_a0df5bb8217acae97e63421bedbfe0a0b"><div class="ttname"><a href="structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0b">PxArticulationJointType::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:99</div></div>
 <div class="ttc" id="classPxFlags_html"><div class="ttname"><a href="classPxFlags.html">PxFlags</a></div><div class="ttdoc">Container for bitfield flag variables associated with a specific enum type. </div><div class="ttdef"><b>Definition:</b> PxFlags.h:73</div></div>
-<div class="ttc" id="classPxArticulationJointBase_html_aac7d969924423782bca5c78f9b10ca04"><div class="ttname"><a href="classPxArticulationJointBase.html#aac7d969924423782bca5c78f9b10ca04">PxArticulationJointBase::PxArticulationJointBase</a></div><div class="ttdeci">PX_INLINE PxArticulationJointBase(PxType concreteType, PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:178</div></div>
-<div class="ttc" id="structPxArticulationAxis_html_a5c835f1d727bfc738d1fc7a7595d7395"><div class="ttname"><a href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">PxArticulationAxis::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:72</div></div>
+<div class="ttc" id="classPxArticulationJointBase_html_aac7d969924423782bca5c78f9b10ca04"><div class="ttname"><a href="classPxArticulationJointBase.html#aac7d969924423782bca5c78f9b10ca04">PxArticulationJointBase::PxArticulationJointBase</a></div><div class="ttdeci">PX_INLINE PxArticulationJointBase(PxType concreteType, PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:176</div></div>
+<div class="ttc" id="structPxArticulationAxis_html_a5c835f1d727bfc738d1fc7a7595d7395"><div class="ttname"><a href="structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395">PxArticulationAxis::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:70</div></div>
 <div class="ttc" id="classPxBase_html"><div class="ttname"><a href="classPxBase.html">PxBase</a></div><div class="ttdoc">Base class for objects that can be members of a PxCollection. </div><div class="ttdef"><b>Definition:</b> PxBase.h:73</div></div>
 <div class="ttc" id="group__foundation_html_gacb03347b642a2a5bdea1f9b305a6fbec"><div class="ttname"><a href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a></div><div class="ttdeci">#define PX_INLINE</div><div class="ttdef"><b>Definition:</b> PxPreprocessor.h:349</div></div>
-<div class="ttc" id="structPxArticulationMotion_html"><div class="ttname"><a href="structPxArticulationMotion.html">PxArticulationMotion</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:86</div></div>
+<div class="ttc" id="structPxArticulationMotion_html"><div class="ttname"><a href="structPxArticulationMotion.html">PxArticulationMotion</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:84</div></div>
 <div class="ttc" id="classPxVec3_html"><div class="ttname"><a href="classPxVec3.html">PxVec3</a></div><div class="ttdoc">3 Element vector class. </div><div class="ttdef"><b>Definition:</b> PxVec3.h:49</div></div>
-<div class="ttc" id="classPxArticulationJoint_html_a830694ba6d6b8bbe5dfdc71f446bac54"><div class="ttname"><a href="classPxArticulationJoint.html#a830694ba6d6b8bbe5dfdc71f446bac54">PxArticulationJoint::PxArticulationJoint</a></div><div class="ttdeci">PX_INLINE PxArticulationJoint(PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:564</div></div>
+<div class="ttc" id="classPxArticulationJoint_html_a830694ba6d6b8bbe5dfdc71f446bac54"><div class="ttname"><a href="classPxArticulationJoint.html#a830694ba6d6b8bbe5dfdc71f446bac54">PxArticulationJoint::PxArticulationJoint</a></div><div class="ttdeci">PX_INLINE PxArticulationJoint(PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:562</div></div>
 </div><!-- fragment --></div><!-- contents -->
 </div><!-- doc-content -->
 <!-- HTML footer for doxygen 1.8.14-->
diff --git a/physx/documentation/PhysXAPI/files/PxArticulationLink_8h_source.html b/physx/documentation/PhysXAPI/files/PxArticulationLink_8h_source.html
index 9894cce62..c6a4c27e8 100644
--- a/physx/documentation/PhysXAPI/files/PxArticulationLink_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxArticulationLink_8h_source.html
@@ -95,7 +95,7 @@
 <div class="ttc" id="group__common_html_gac1fb4b256a5d900d394e89db170a2b79"><div class="ttname"><a href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a></div><div class="ttdeci">PxU16 PxType</div><div class="ttdef"><b>Definition:</b> PxBase.h:49</div></div>
 <div class="ttc" id="classPxArticulationLink_html"><div class="ttname"><a href="classPxArticulationLink.html">PxArticulationLink</a></div><div class="ttdoc">a component of an articulation that represents a rigid body </div><div class="ttdef"><b>Definition:</b> PxArticulationLink.h:57</div></div>
 <div class="ttc" id="classPxArticulationLink_html_afaa71dfb9bd3348b6cbe58902cfe63c8"><div class="ttname"><a href="classPxArticulationLink.html#afaa71dfb9bd3348b6cbe58902cfe63c8">PxArticulationLink::getConcreteTypeName</a></div><div class="ttdeci">virtual const char * getConcreteTypeName() const</div><div class="ttdoc">Returns string name of dynamic type. </div><div class="ttdef"><b>Definition:</b> PxArticulationLink.h:128</div></div>
-<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:112</div></div>
+<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:110</div></div>
 <div class="ttc" id="PxArticulationJoint_8h_html"><div class="ttname"><a href="PxArticulationJoint_8h.html">PxArticulationJoint.h</a></div></div>
 <div class="ttc" id="classPxArticulationBase_html"><div class="ttname"><a href="classPxArticulationBase.html">PxArticulationBase</a></div><div class="ttdoc">a tree structure of bodies connected by joints that is treated as a unit by the dynamics solver ...</div><div class="ttdef"><b>Definition:</b> PxArticulationBase.h:57</div></div>
 <div class="ttc" id="classPxFlags_html"><div class="ttname"><a href="classPxFlags.html">PxFlags&lt; PxBaseFlag::Enum, PxU16 &gt;</a></div></div>
diff --git a/physx/documentation/PhysXAPI/files/PxArticulationReducedCoordinate_8h_source.html b/physx/documentation/PhysXAPI/files/PxArticulationReducedCoordinate_8h_source.html
index 350658ac5..b43619488 100644
--- a/physx/documentation/PhysXAPI/files/PxArticulationReducedCoordinate_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxArticulationReducedCoordinate_8h_source.html
@@ -86,7 +86,7 @@
 <div class="title">PxArticulationReducedCoordinate.h</div>  </div>
 </div><!--header-->
 <div class="contents">
-<a href="PxArticulationReducedCoordinate_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  </span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;</div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#ifndef PX_PHYSICS_NX_ARTICULATION_RC</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;<span class="preprocessor">#define PX_PHYSICS_NX_ARTICULATION_RC</span></div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00036"></a><span class="lineno">   36</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxArticulationBase_8h.html">PxArticulationBase.h</a>&quot;</span></div><div class="line"><a name="l00037"></a><span class="lineno">   37</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxVec3_8h.html">foundation/PxVec3.h</a>&quot;</span></div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxTransform_8h.html">foundation/PxTransform.h</a>&quot;</span></div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;</div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;{</div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;</div><div class="line"><a name="l00045"></a><span class="lineno"><a class="line" href="namespaceCm.html">   45</a></span>&#160;    <span class="keyword">namespace </span><a class="code" href="namespaceCm.html">Cm</a></div><div class="line"><a name="l00046"></a><span class="lineno">   46</span>&#160;    {</div><div class="line"><a name="l00047"></a><span class="lineno">   47</span>&#160;        <span class="keyword">class </span>SpatialVector;</div><div class="line"><a name="l00048"></a><span class="lineno">   48</span>&#160;    }</div><div class="line"><a name="l00049"></a><span class="lineno">   49</span>&#160;</div><div class="line"><a name="l00050"></a><span class="lineno">   50</span>&#160;    <span class="keyword">class </span><a class="code" href="classPxContactJoint.html">PxContactJoint</a>;</div><div class="line"><a name="l00051"></a><span class="lineno">   51</span>&#160;</div><div class="line"><a name="l00052"></a><span class="lineno"><a class="line" href="structPxArticulationFlag.html">   52</a></span>&#160;    <span class="keyword">struct </span><a class="code" href="structPxArticulationFlag.html">PxArticulationFlag</a></div><div class="line"><a name="l00053"></a><span class="lineno">   53</span>&#160;    {</div><div class="line"><a name="l00054"></a><span class="lineno"><a class="line" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609">   54</a></span>&#160;        <span class="keyword">enum</span> <a class="code" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609">Enum</a></div><div class="line"><a name="l00055"></a><span class="lineno">   55</span>&#160;        {</div><div class="line"><a name="l00056"></a><span class="lineno"><a class="line" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609a625f41920c4a20ea5317fdc939a74674">   56</a></span>&#160;            eFIX_BASE = (1 &lt;&lt; 1)</div><div class="line"><a name="l00057"></a><span class="lineno">   57</span>&#160;        };</div><div class="line"><a name="l00058"></a><span class="lineno">   58</span>&#160;    };</div><div class="line"><a name="l00059"></a><span class="lineno">   59</span>&#160;</div><div class="line"><a name="l00060"></a><span class="lineno">   60</span>&#160;    </div><div class="line"><a name="l00061"></a><span class="lineno"><a class="line" href="group__physics.html#ga40a321fce7bcb9669d32fca769d2958a">   61</a></span>&#160;    <span class="keyword">class </span><a class="code" href="classPxJoint.html">PxJoint</a>;</div><div class="line"><a name="l00062"></a><span class="lineno">   62</span>&#160;</div><div class="line"><a name="l00063"></a><span class="lineno">   63</span>&#160;    <span class="keyword">typedef</span> <a class="code" href="classPxFlags.html">PxFlags&lt;PxArticulationFlag::Enum, PxU8&gt;</a> <a class="code" href="group__physics.html#ga40a321fce7bcb9669d32fca769d2958a">PxArticulationFlags</a>;</div><div class="line"><a name="l00064"></a><span class="lineno">   64</span>&#160;    PX_FLAGS_OPERATORS(<a class="code" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609">PxArticulationFlag::Enum</a>, <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>)</div><div class="line"><a name="l00065"></a><span class="lineno">   65</span>&#160;</div><div class="line"><a name="l00066"></a><span class="lineno">   66</span>&#160;    <span class="comment">//PxKinematicJacobian is in world space 6x6 matrix</span></div><div class="line"><a name="l00067"></a><span class="lineno"><a class="line" href="classPxKinematicJacobian.html">   67</a></span>&#160;    class <a class="code" href="classPxKinematicJacobian.html">PxKinematicJacobian</a></div><div class="line"><a name="l00068"></a><span class="lineno">   68</span>&#160;    {</div><div class="line"><a name="l00069"></a><span class="lineno">   69</span>&#160;    <span class="keyword">public</span>:</div><div class="line"><a name="l00070"></a><span class="lineno">   70</span>&#160;        <span class="comment">//in each single column, top part is angular term and bottom is linear term</span></div><div class="line"><a name="l00071"></a><span class="lineno"><a class="line" href="classPxKinematicJacobian.html#a95c56e4e2501e4ad795b03930cf37aa9">   71</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> j[6][6];<span class="comment">//[column][row]</span></div><div class="line"><a name="l00072"></a><span class="lineno">   72</span>&#160;</div><div class="line"><a name="l00073"></a><span class="lineno"><a class="line" href="classPxKinematicJacobian.html#aaf6d1dd5689d3f82863a1fac901392f7">   73</a></span>&#160;        <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>   <a class="code" href="classPxKinematicJacobian.html#aaf6d1dd5689d3f82863a1fac901392f7">nbColumns</a>;</div><div class="line"><a name="l00074"></a><span class="lineno">   74</span>&#160;    };</div><div class="line"><a name="l00075"></a><span class="lineno">   75</span>&#160;</div><div class="line"><a name="l00076"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html">   76</a></span>&#160;    <span class="keyword">struct </span><a class="code" href="structPxArticulationRootLinkData.html">PxArticulationRootLinkData</a></div><div class="line"><a name="l00077"></a><span class="lineno">   77</span>&#160;    {</div><div class="line"><a name="l00078"></a><span class="lineno">   78</span>&#160;</div><div class="line"><a name="l00079"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#af0dcfa25c769c89fd405eeaed52b3c1f">   79</a></span>&#160;        <a class="code" href="classPxTransform.html">PxTransform</a>             <a class="code" href="structPxArticulationRootLinkData.html#af0dcfa25c769c89fd405eeaed52b3c1f">transform</a>;</div><div class="line"><a name="l00080"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#abfaab0cef7e149ebcc4b89c774885339">   80</a></span>&#160;        <a class="code" href="classPxVec3.html">PxVec3</a>                  <a class="code" href="structPxArticulationRootLinkData.html#abfaab0cef7e149ebcc4b89c774885339">linVel</a>;</div><div class="line"><a name="l00081"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#aceaca88516b5febd8efc9f8086b2508d">   81</a></span>&#160;        <a class="code" href="classPxVec3.html">PxVec3</a>                  <a class="code" href="structPxArticulationRootLinkData.html#aceaca88516b5febd8efc9f8086b2508d">angVel</a>;</div><div class="line"><a name="l00082"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#a71f06d5b88b0dbf4b0cde1afc4e3dca7">   82</a></span>&#160;        <a class="code" href="classPxVec3.html">PxVec3</a>                  <a class="code" href="structPxArticulationRootLinkData.html#a71f06d5b88b0dbf4b0cde1afc4e3dca7">linAcel</a>;</div><div class="line"><a name="l00083"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#a6611210de5054a8eb06745b914fb08db">   83</a></span>&#160;        <a class="code" href="classPxVec3.html">PxVec3</a>                  <a class="code" href="structPxArticulationRootLinkData.html#a6611210de5054a8eb06745b914fb08db">angAcel</a>;</div><div class="line"><a name="l00084"></a><span class="lineno">   84</span>&#160;    };</div><div class="line"><a name="l00085"></a><span class="lineno">   85</span>&#160;</div><div class="line"><a name="l00086"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html">   86</a></span>&#160;    <span class="keyword">class </span><a class="code" href="classPxArticulationCache.html">PxArticulationCache</a></div><div class="line"><a name="l00087"></a><span class="lineno">   87</span>&#160;    {</div><div class="line"><a name="l00088"></a><span class="lineno">   88</span>&#160;    <span class="keyword">public</span>:</div><div class="line"><a name="l00089"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14e">   89</a></span>&#160;        <span class="keyword">enum</span> <a class="code" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14e">Enum</a></div><div class="line"><a name="l00090"></a><span class="lineno">   90</span>&#160;        {</div><div class="line"><a name="l00091"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea2b507b08dbfaa208d407d857295a857b">   91</a></span>&#160;            eVELOCITY       = (1 &lt;&lt; 0),     </div><div class="line"><a name="l00092"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea9bfa940c74c9da6b9ce0a27c9f6bd8f8">   92</a></span>&#160;            eACCELERATION   = (1 &lt;&lt; 1),     </div><div class="line"><a name="l00093"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea74109d30461c72534eb5a36baf352e6a">   93</a></span>&#160;            ePOSITION       = (1 &lt;&lt; 2),     </div><div class="line"><a name="l00094"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea89efd599f1473d4a795b89fb71c5012a">   94</a></span>&#160;            eFORCE          = (1 &lt;&lt; 3),     </div><div class="line"><a name="l00095"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea6066321f08d523d0b73cfbdf2b44041c">   95</a></span>&#160;            eROOT           = (1 &lt;&lt; 4),     </div><div class="line"><a name="l00096"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea86872df915168195ac00a5cb1033ee29">   96</a></span>&#160;            eALL            = (eVELOCITY | eACCELERATION | ePOSITION| eROOT)</div><div class="line"><a name="l00097"></a><span class="lineno">   97</span>&#160;        };</div><div class="line"><a name="l00098"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a750cf6f5d426723b6d6cf731bd290ea5">   98</a></span>&#160;        <a class="code" href="classPxArticulationCache.html#a750cf6f5d426723b6d6cf731bd290ea5">PxArticulationCache</a>() : coefficentMatrix(NULL), lambda(NULL)</div><div class="line"><a name="l00099"></a><span class="lineno">   99</span>&#160;        {}</div><div class="line"><a name="l00100"></a><span class="lineno">  100</span>&#160;</div><div class="line"><a name="l00101"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a429f0639ad5e54c1f9fd365485d6a140">  101</a></span>&#160;        Cm::SpatialVector*      <a class="code" href="classPxArticulationCache.html#a429f0639ad5e54c1f9fd365485d6a140">externalForces</a>; <span class="comment">// N total number of links</span></div><div class="line"><a name="l00102"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a3f61b7c67e854c7191de844a89bb863d">  102</a></span>&#160;        <a class="code" href="classPxKinematicJacobian.html">PxKinematicJacobian</a>*    <a class="code" href="classPxArticulationCache.html#a3f61b7c67e854c7191de844a89bb863d">jacobian</a>; <span class="comment">//this store jacobian matrix</span></div><div class="line"><a name="l00103"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a40849b1b079c9b0e517fb7eaea488ccf">  103</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#a40849b1b079c9b0e517fb7eaea488ccf">massMatrix</a>; <span class="comment">//N X N (dof X dof)</span></div><div class="line"><a name="l00104"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#aab37dabd7db43ef927337da7f6ea9f5a">  104</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#aab37dabd7db43ef927337da7f6ea9f5a">jointVelocity</a>; <span class="comment">//N total Dofs</span></div><div class="line"><a name="l00105"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a515142b078989bf5034c5a97b6978c1b">  105</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#a515142b078989bf5034c5a97b6978c1b">jointAcceleration</a>; <span class="comment">//N total Dofs</span></div><div class="line"><a name="l00106"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a64e8f0b09d6860f91d7a830c11bdddc9">  106</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#a64e8f0b09d6860f91d7a830c11bdddc9">jointPosition</a>; <span class="comment">//N total Dofs</span></div><div class="line"><a name="l00107"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a952f509c174636c1c11e8c61bda30eaa">  107</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#a952f509c174636c1c11e8c61bda30eaa">jointForce</a>; <span class="comment">//N total Dofs</span></div><div class="line"><a name="l00108"></a><span class="lineno">  108</span>&#160;</div><div class="line"><a name="l00109"></a><span class="lineno">  109</span>&#160;        <span class="comment">//application need to allocate those memory and assign them to the cache</span></div><div class="line"><a name="l00110"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#aded15e3e603ff12901cf4fe27a7eddea">  110</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#aded15e3e603ff12901cf4fe27a7eddea">coefficentMatrix</a>; </div><div class="line"><a name="l00111"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#aa90b660a293d6ce7984d49e8be270465">  111</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#aa90b660a293d6ce7984d49e8be270465">lambda</a>;</div><div class="line"><a name="l00112"></a><span class="lineno">  112</span>&#160;</div><div class="line"><a name="l00113"></a><span class="lineno">  113</span>&#160;        <span class="comment">//root link data</span></div><div class="line"><a name="l00114"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#aaf42183cc319e011cd7e2dce9bc6a1ec">  114</a></span>&#160;        <a class="code" href="structPxArticulationRootLinkData.html">PxArticulationRootLinkData</a>  <a class="code" href="classPxArticulationCache.html#aaf42183cc319e011cd7e2dce9bc6a1ec">rootLinkData</a>;</div><div class="line"><a name="l00115"></a><span class="lineno">  115</span>&#160;</div><div class="line"><a name="l00116"></a><span class="lineno">  116</span>&#160;        <span class="comment">//These three members won&#39;t be set to zero when zeroCache get called </span></div><div class="line"><a name="l00117"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a237505e945f16c9afddb3c431f9d6a7b">  117</a></span>&#160;        <span class="keywordtype">void</span>*                   <a class="code" href="classPxArticulationCache.html#a237505e945f16c9afddb3c431f9d6a7b">scratchMemory</a>;      <span class="comment">//this is used for internal calculation</span></div><div class="line"><a name="l00118"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#ac5356bfc4c7b57dec9bdd33c698ae96c">  118</a></span>&#160;        <span class="keywordtype">void</span>*                   <a class="code" href="classPxArticulationCache.html#ac5356bfc4c7b57dec9bdd33c698ae96c">scratchAllocator</a>;</div><div class="line"><a name="l00119"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#af8becefcb89c7a26df4372809e31b343">  119</a></span>&#160;        <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                   <a class="code" href="classPxArticulationCache.html#af8becefcb89c7a26df4372809e31b343">version</a>; <span class="comment">//cache version. If the articulation configulation change, the cache is invalid</span></div><div class="line"><a name="l00120"></a><span class="lineno">  120</span>&#160;</div><div class="line"><a name="l00121"></a><span class="lineno">  121</span>&#160;        </div><div class="line"><a name="l00122"></a><span class="lineno">  122</span>&#160;    };</div><div class="line"><a name="l00123"></a><span class="lineno">  123</span>&#160;</div><div class="line"><a name="l00124"></a><span class="lineno"><a class="line" href="group__physics.html#ga1619e335e650ffbd52cd4961bc0dea9a">  124</a></span>&#160;    <span class="keyword">typedef</span> <a class="code" href="classPxFlags.html">PxFlags&lt;PxArticulationCache::Enum, PxU8&gt;</a> <a class="code" href="group__physics.html#ga1619e335e650ffbd52cd4961bc0dea9a">PxArticulationCacheFlags</a>;</div><div class="line"><a name="l00125"></a><span class="lineno">  125</span>&#160;    PX_FLAGS_OPERATORS(<a class="code" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14e">PxArticulationCache::Enum</a>, <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>)</div><div class="line"><a name="l00126"></a><span class="lineno">  126</span>&#160;</div><div class="line"><a name="l00127"></a><span class="lineno">  127</span>&#160;</div><div class="line"><a name="l00128"></a><span class="lineno">  128</span>&#160;    </div><div class="line"><a name="l00139"></a><span class="lineno">  139</span>&#160;<span class="preprocessor">#if PX_VC</span></div><div class="line"><a name="l00140"></a><span class="lineno">  140</span>&#160;<span class="preprocessor">#pragma warning(push)</span></div><div class="line"><a name="l00141"></a><span class="lineno">  141</span>&#160;<span class="preprocessor">#pragma warning(disable : 4435)</span></div><div class="line"><a name="l00142"></a><span class="lineno">  142</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00143"></a><span class="lineno">  143</span>&#160;</div><div class="line"><a name="l00144"></a><span class="lineno"><a class="line" href="classPxArticulationReducedCoordinate.html">  144</a></span>&#160;    <span class="keyword">class </span><a class="code" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a> : <span class="keyword">public</span> <a class="code" href="classPxArticulationBase.html">PxArticulationBase</a></div><div class="line"><a name="l00145"></a><span class="lineno">  145</span>&#160;    {</div><div class="line"><a name="l00146"></a><span class="lineno">  146</span>&#160;    <span class="keyword">public</span>:</div><div class="line"><a name="l00147"></a><span class="lineno">  147</span>&#160;</div><div class="line"><a name="l00148"></a><span class="lineno">  148</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                            release() = 0;</div><div class="line"><a name="l00149"></a><span class="lineno">  149</span>&#160;</div><div class="line"><a name="l00150"></a><span class="lineno">  150</span>&#160;</div><div class="line"><a name="l00158"></a><span class="lineno">  158</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                            setArticulationFlags(<a class="code" href="classPxFlags.html">PxArticulationFlags</a> <a class="code" href="PxConstraintDesc_8h.html#acfdd81caa30ceb0af5fafb4064b1bc67">flags</a>) = 0;</div><div class="line"><a name="l00159"></a><span class="lineno">  159</span>&#160;</div><div class="line"><a name="l00163"></a><span class="lineno">  163</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="classPxFlags.html">PxArticulationFlags</a>             getArticulationFlags() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00164"></a><span class="lineno">  164</span>&#160;</div><div class="line"><a name="l00168"></a><span class="lineno">  168</span>&#160;        <span class="keyword">virtual</span>         <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                       getDofs() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00169"></a><span class="lineno">  169</span>&#160;</div><div class="line"><a name="l00175"></a><span class="lineno">  175</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>*            createCache() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00176"></a><span class="lineno">  176</span>&#160;</div><div class="line"><a name="l00177"></a><span class="lineno">  177</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                           getCacheDataSize() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00178"></a><span class="lineno">  178</span>&#160;</div><div class="line"><a name="l00184"></a><span class="lineno">  184</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                            zeroCache(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) = 0;</div><div class="line"><a name="l00185"></a><span class="lineno">  185</span>&#160;</div><div class="line"><a name="l00186"></a><span class="lineno">  186</span>&#160;    </div><div class="line"><a name="l00196"></a><span class="lineno">  196</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                            applyCache(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache, <span class="keyword">const</span> <a class="code" href="classPxFlags.html">PxArticulationCacheFlags</a> flag, <span class="keywordtype">bool</span> autowake = <span class="keyword">true</span>) = 0;</div><div class="line"><a name="l00197"></a><span class="lineno">  197</span>&#160;</div><div class="line"><a name="l00206"></a><span class="lineno">  206</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        copyInternalStateToCache(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache, <span class="keyword">const</span> <a class="code" href="classPxFlags.html">PxArticulationCacheFlags</a> flag) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00207"></a><span class="lineno">  207</span>&#160;    </div><div class="line"><a name="l00208"></a><span class="lineno">  208</span>&#160;</div><div class="line"><a name="l00216"></a><span class="lineno">  216</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        releaseCache(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00217"></a><span class="lineno">  217</span>&#160;    </div><div class="line"><a name="l00218"></a><span class="lineno">  218</span>&#160;</div><div class="line"><a name="l00224"></a><span class="lineno">  224</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        packJointData(<span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* maximum, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* reduced) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00225"></a><span class="lineno">  225</span>&#160;</div><div class="line"><a name="l00231"></a><span class="lineno">  231</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        unpackJointData(<span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* reduced, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* maximum) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00232"></a><span class="lineno">  232</span>&#160;</div><div class="line"><a name="l00236"></a><span class="lineno">  236</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        commonInit() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00237"></a><span class="lineno">  237</span>&#160;</div><div class="line"><a name="l00238"></a><span class="lineno">  238</span>&#160;</div><div class="line"><a name="l00247"></a><span class="lineno">  247</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeGeneralizedGravityForce(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00248"></a><span class="lineno">  248</span>&#160;</div><div class="line"><a name="l00257"></a><span class="lineno">  257</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeCoriolisAndCentrifugalForce(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00258"></a><span class="lineno">  258</span>&#160;</div><div class="line"><a name="l00267"></a><span class="lineno">  267</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeGeneralizedExternalForce(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00276"></a><span class="lineno">  276</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeJointAcceleration(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00277"></a><span class="lineno">  277</span>&#160;</div><div class="line"><a name="l00278"></a><span class="lineno">  278</span>&#160;</div><div class="line"><a name="l00288"></a><span class="lineno">  288</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeJointForce(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00289"></a><span class="lineno">  289</span>&#160;</div><div class="line"><a name="l00297"></a><span class="lineno">  297</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeKinematicJacobian(<span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> linkID, <a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00298"></a><span class="lineno">  298</span>&#160;</div><div class="line"><a name="l00299"></a><span class="lineno">  299</span>&#160;</div><div class="line"><a name="l00305"></a><span class="lineno">  305</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeCoefficentMatrix(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00306"></a><span class="lineno">  306</span>&#160;</div><div class="line"><a name="l00307"></a><span class="lineno">  307</span>&#160;        </div><div class="line"><a name="l00316"></a><span class="lineno">  316</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>                        computeLambda(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache, <a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; initialState, <span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* <span class="keyword">const</span> jointTorque, <span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> maxIter) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00317"></a><span class="lineno">  317</span>&#160;        </div><div class="line"><a name="l00324"></a><span class="lineno">  324</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeGeneralizedMassMatrix(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00325"></a><span class="lineno">  325</span>&#160;    </div><div class="line"><a name="l00332"></a><span class="lineno">  332</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        addLoopJoint(<a class="code" href="classPxJoint.html">PxJoint</a>* joint) = 0;</div><div class="line"><a name="l00333"></a><span class="lineno">  333</span>&#160;</div><div class="line"><a name="l00340"></a><span class="lineno">  340</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        removeLoopJoint(<a class="code" href="classPxJoint.html">PxJoint</a>* joint) = 0;</div><div class="line"><a name="l00341"></a><span class="lineno">  341</span>&#160;</div><div class="line"><a name="l00347"></a><span class="lineno">  347</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                       getNbLoopJoints() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00348"></a><span class="lineno">  348</span>&#160;</div><div class="line"><a name="l00362"></a><span class="lineno">  362</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                       getLoopJoints(<a class="code" href="classPxJoint.html">PxJoint</a>** userBuffer, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> bufferSize, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> startIndex = 0) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00363"></a><span class="lineno">  363</span>&#160;</div><div class="line"><a name="l00369"></a><span class="lineno">  369</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                       getCoefficentMatrixSize() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00370"></a><span class="lineno">  370</span>&#160;</div><div class="line"><a name="l00378"></a><span class="lineno">  378</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        teleportRootLink(<span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; pose, <span class="keywordtype">bool</span> autowake) = 0;</div><div class="line"><a name="l00379"></a><span class="lineno">  379</span>&#160;</div><div class="line"><a name="l00380"></a><span class="lineno">  380</span>&#160;    <span class="keyword">protected</span>:</div><div class="line"><a name="l00381"></a><span class="lineno"><a class="line" href="classPxArticulationReducedCoordinate.html#a3202bc9b19b72c88c7ea32421dce3cc9">  381</a></span>&#160;        <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationReducedCoordinate.html#a3202bc9b19b72c88c7ea32421dce3cc9">PxArticulationReducedCoordinate</a>(<a class="code" href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a> concreteType, <a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxArticulationBase.html">PxArticulationBase</a>(concreteType, baseFlags) {}</div><div class="line"><a name="l00382"></a><span class="lineno"><a class="line" href="classPxArticulationReducedCoordinate.html#a51b6d2bcecd81b268219e7a5d1d8f10c">  382</a></span>&#160;        <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationReducedCoordinate.html#a51b6d2bcecd81b268219e7a5d1d8f10c">PxArticulationReducedCoordinate</a>(<a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxArticulationBase.html">PxArticulationBase</a>(baseFlags) {}</div><div class="line"><a name="l00383"></a><span class="lineno"><a class="line" href="classPxArticulationReducedCoordinate.html#a1a6a5ea9fbce5fab1389809cde5ce12a">  383</a></span>&#160;        <span class="keyword">virtual</span>                     <a class="code" href="classPxArticulationReducedCoordinate.html#a1a6a5ea9fbce5fab1389809cde5ce12a">~PxArticulationReducedCoordinate</a>() {}</div><div class="line"><a name="l00384"></a><span class="lineno">  384</span>&#160;    };</div><div class="line"><a name="l00385"></a><span class="lineno">  385</span>&#160;</div><div class="line"><a name="l00386"></a><span class="lineno">  386</span>&#160;<span class="preprocessor">#if PX_VC</span></div><div class="line"><a name="l00387"></a><span class="lineno">  387</span>&#160;<span class="preprocessor">#pragma warning(pop)</span></div><div class="line"><a name="l00388"></a><span class="lineno">  388</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00389"></a><span class="lineno">  389</span>&#160;</div><div class="line"><a name="l00390"></a><span class="lineno">  390</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00391"></a><span class="lineno">  391</span>&#160;} <span class="comment">// namespace physx</span></div><div class="line"><a name="l00392"></a><span class="lineno">  392</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00393"></a><span class="lineno">  393</span>&#160;</div><div class="line"><a name="l00394"></a><span class="lineno">  394</span>&#160;</div><div class="line"><a name="l00396"></a><span class="lineno">  396</span>&#160;<span class="preprocessor">#endif</span></div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
+<a href="PxArticulationReducedCoordinate_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  </span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;</div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#ifndef PX_PHYSICS_NX_ARTICULATION_RC</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;<span class="preprocessor">#define PX_PHYSICS_NX_ARTICULATION_RC</span></div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00036"></a><span class="lineno">   36</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxArticulationBase_8h.html">PxArticulationBase.h</a>&quot;</span></div><div class="line"><a name="l00037"></a><span class="lineno">   37</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxVec3_8h.html">foundation/PxVec3.h</a>&quot;</span></div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxTransform_8h.html">foundation/PxTransform.h</a>&quot;</span></div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;</div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;{</div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;</div><div class="line"><a name="l00045"></a><span class="lineno"><a class="line" href="namespaceCm.html">   45</a></span>&#160;    <span class="keyword">namespace </span><a class="code" href="namespaceCm.html">Cm</a></div><div class="line"><a name="l00046"></a><span class="lineno">   46</span>&#160;    {</div><div class="line"><a name="l00047"></a><span class="lineno">   47</span>&#160;        <span class="keyword">class </span>SpatialVector;</div><div class="line"><a name="l00048"></a><span class="lineno">   48</span>&#160;    }</div><div class="line"><a name="l00049"></a><span class="lineno">   49</span>&#160;</div><div class="line"><a name="l00050"></a><span class="lineno">   50</span>&#160;    <span class="keyword">class </span><a class="code" href="classPxContactJoint.html">PxContactJoint</a>;</div><div class="line"><a name="l00051"></a><span class="lineno">   51</span>&#160;</div><div class="line"><a name="l00052"></a><span class="lineno"><a class="line" href="structPxArticulationFlag.html">   52</a></span>&#160;    <span class="keyword">struct </span><a class="code" href="structPxArticulationFlag.html">PxArticulationFlag</a></div><div class="line"><a name="l00053"></a><span class="lineno">   53</span>&#160;    {</div><div class="line"><a name="l00054"></a><span class="lineno"><a class="line" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609">   54</a></span>&#160;        <span class="keyword">enum</span> <a class="code" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609">Enum</a></div><div class="line"><a name="l00055"></a><span class="lineno">   55</span>&#160;        {</div><div class="line"><a name="l00056"></a><span class="lineno"><a class="line" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609a625f41920c4a20ea5317fdc939a74674">   56</a></span>&#160;            eFIX_BASE = (1 &lt;&lt; 1)</div><div class="line"><a name="l00057"></a><span class="lineno">   57</span>&#160;        };</div><div class="line"><a name="l00058"></a><span class="lineno">   58</span>&#160;    };</div><div class="line"><a name="l00059"></a><span class="lineno">   59</span>&#160;</div><div class="line"><a name="l00060"></a><span class="lineno">   60</span>&#160;    </div><div class="line"><a name="l00061"></a><span class="lineno"><a class="line" href="group__physics.html#ga40a321fce7bcb9669d32fca769d2958a">   61</a></span>&#160;    <span class="keyword">class </span><a class="code" href="classPxJoint.html">PxJoint</a>;</div><div class="line"><a name="l00062"></a><span class="lineno">   62</span>&#160;</div><div class="line"><a name="l00063"></a><span class="lineno">   63</span>&#160;    <span class="keyword">typedef</span> <a class="code" href="classPxFlags.html">PxFlags&lt;PxArticulationFlag::Enum, PxU8&gt;</a> <a class="code" href="group__physics.html#ga40a321fce7bcb9669d32fca769d2958a">PxArticulationFlags</a>;</div><div class="line"><a name="l00064"></a><span class="lineno">   64</span>&#160;    PX_FLAGS_OPERATORS(<a class="code" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609">PxArticulationFlag::Enum</a>, <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>)</div><div class="line"><a name="l00065"></a><span class="lineno">   65</span>&#160;</div><div class="line"><a name="l00066"></a><span class="lineno">   66</span>&#160;    <span class="comment">//PxKinematicJacobian is in world space 6x6 matrix</span></div><div class="line"><a name="l00067"></a><span class="lineno"><a class="line" href="classPxKinematicJacobian.html">   67</a></span>&#160;    class <a class="code" href="classPxKinematicJacobian.html">PxKinematicJacobian</a></div><div class="line"><a name="l00068"></a><span class="lineno">   68</span>&#160;    {</div><div class="line"><a name="l00069"></a><span class="lineno">   69</span>&#160;    <span class="keyword">public</span>:</div><div class="line"><a name="l00070"></a><span class="lineno">   70</span>&#160;        <span class="comment">//in each single column, top part is angular term and bottom is linear term</span></div><div class="line"><a name="l00071"></a><span class="lineno"><a class="line" href="classPxKinematicJacobian.html#a95c56e4e2501e4ad795b03930cf37aa9">   71</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> j[6][6];<span class="comment">//[column][row]</span></div><div class="line"><a name="l00072"></a><span class="lineno">   72</span>&#160;</div><div class="line"><a name="l00073"></a><span class="lineno"><a class="line" href="classPxKinematicJacobian.html#aaf6d1dd5689d3f82863a1fac901392f7">   73</a></span>&#160;        <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>   <a class="code" href="classPxKinematicJacobian.html#aaf6d1dd5689d3f82863a1fac901392f7">nbColumns</a>;</div><div class="line"><a name="l00074"></a><span class="lineno">   74</span>&#160;    };</div><div class="line"><a name="l00075"></a><span class="lineno">   75</span>&#160;</div><div class="line"><a name="l00076"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html">   76</a></span>&#160;    <span class="keyword">struct </span><a class="code" href="structPxArticulationRootLinkData.html">PxArticulationRootLinkData</a></div><div class="line"><a name="l00077"></a><span class="lineno">   77</span>&#160;    {</div><div class="line"><a name="l00078"></a><span class="lineno">   78</span>&#160;</div><div class="line"><a name="l00079"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#af0dcfa25c769c89fd405eeaed52b3c1f">   79</a></span>&#160;        <a class="code" href="classPxTransform.html">PxTransform</a>             <a class="code" href="structPxArticulationRootLinkData.html#af0dcfa25c769c89fd405eeaed52b3c1f">transform</a>;</div><div class="line"><a name="l00080"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#abfaab0cef7e149ebcc4b89c774885339">   80</a></span>&#160;        <a class="code" href="classPxVec3.html">PxVec3</a>                  <a class="code" href="structPxArticulationRootLinkData.html#abfaab0cef7e149ebcc4b89c774885339">linVel</a>;</div><div class="line"><a name="l00081"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#aceaca88516b5febd8efc9f8086b2508d">   81</a></span>&#160;        <a class="code" href="classPxVec3.html">PxVec3</a>                  <a class="code" href="structPxArticulationRootLinkData.html#aceaca88516b5febd8efc9f8086b2508d">angVel</a>;</div><div class="line"><a name="l00082"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#a71f06d5b88b0dbf4b0cde1afc4e3dca7">   82</a></span>&#160;        <a class="code" href="classPxVec3.html">PxVec3</a>                  <a class="code" href="structPxArticulationRootLinkData.html#a71f06d5b88b0dbf4b0cde1afc4e3dca7">linAcel</a>;</div><div class="line"><a name="l00083"></a><span class="lineno"><a class="line" href="structPxArticulationRootLinkData.html#a6611210de5054a8eb06745b914fb08db">   83</a></span>&#160;        <a class="code" href="classPxVec3.html">PxVec3</a>                  <a class="code" href="structPxArticulationRootLinkData.html#a6611210de5054a8eb06745b914fb08db">angAcel</a>;</div><div class="line"><a name="l00084"></a><span class="lineno">   84</span>&#160;    };</div><div class="line"><a name="l00085"></a><span class="lineno">   85</span>&#160;</div><div class="line"><a name="l00086"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html">   86</a></span>&#160;    <span class="keyword">class </span><a class="code" href="classPxArticulationCache.html">PxArticulationCache</a></div><div class="line"><a name="l00087"></a><span class="lineno">   87</span>&#160;    {</div><div class="line"><a name="l00088"></a><span class="lineno">   88</span>&#160;    <span class="keyword">public</span>:</div><div class="line"><a name="l00089"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14e">   89</a></span>&#160;        <span class="keyword">enum</span> <a class="code" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14e">Enum</a></div><div class="line"><a name="l00090"></a><span class="lineno">   90</span>&#160;        {</div><div class="line"><a name="l00091"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea2b507b08dbfaa208d407d857295a857b">   91</a></span>&#160;            eVELOCITY       = (1 &lt;&lt; 0),     </div><div class="line"><a name="l00092"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea9bfa940c74c9da6b9ce0a27c9f6bd8f8">   92</a></span>&#160;            eACCELERATION   = (1 &lt;&lt; 1),     </div><div class="line"><a name="l00093"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea74109d30461c72534eb5a36baf352e6a">   93</a></span>&#160;            ePOSITION       = (1 &lt;&lt; 2),     </div><div class="line"><a name="l00094"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea89efd599f1473d4a795b89fb71c5012a">   94</a></span>&#160;            eFORCE          = (1 &lt;&lt; 3),     </div><div class="line"><a name="l00095"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea6066321f08d523d0b73cfbdf2b44041c">   95</a></span>&#160;            eROOT           = (1 &lt;&lt; 4),     </div><div class="line"><a name="l00096"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14ea86872df915168195ac00a5cb1033ee29">   96</a></span>&#160;            eALL            = (eVELOCITY | eACCELERATION | ePOSITION| eROOT)</div><div class="line"><a name="l00097"></a><span class="lineno">   97</span>&#160;        };</div><div class="line"><a name="l00098"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a750cf6f5d426723b6d6cf731bd290ea5">   98</a></span>&#160;        <a class="code" href="classPxArticulationCache.html#a750cf6f5d426723b6d6cf731bd290ea5">PxArticulationCache</a>() : coefficentMatrix(NULL), lambda(NULL)</div><div class="line"><a name="l00099"></a><span class="lineno">   99</span>&#160;        {}</div><div class="line"><a name="l00100"></a><span class="lineno">  100</span>&#160;</div><div class="line"><a name="l00101"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a429f0639ad5e54c1f9fd365485d6a140">  101</a></span>&#160;        Cm::SpatialVector*      <a class="code" href="classPxArticulationCache.html#a429f0639ad5e54c1f9fd365485d6a140">externalForces</a>; <span class="comment">// N total number of links</span></div><div class="line"><a name="l00102"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a3f61b7c67e854c7191de844a89bb863d">  102</a></span>&#160;        <a class="code" href="classPxKinematicJacobian.html">PxKinematicJacobian</a>*    <a class="code" href="classPxArticulationCache.html#a3f61b7c67e854c7191de844a89bb863d">jacobian</a>; <span class="comment">//this store jacobian matrix</span></div><div class="line"><a name="l00103"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a40849b1b079c9b0e517fb7eaea488ccf">  103</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#a40849b1b079c9b0e517fb7eaea488ccf">massMatrix</a>; <span class="comment">//N X N (dof X dof)</span></div><div class="line"><a name="l00104"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#aab37dabd7db43ef927337da7f6ea9f5a">  104</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#aab37dabd7db43ef927337da7f6ea9f5a">jointVelocity</a>; <span class="comment">//N total Dofs</span></div><div class="line"><a name="l00105"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a515142b078989bf5034c5a97b6978c1b">  105</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#a515142b078989bf5034c5a97b6978c1b">jointAcceleration</a>; <span class="comment">//N total Dofs</span></div><div class="line"><a name="l00106"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a64e8f0b09d6860f91d7a830c11bdddc9">  106</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#a64e8f0b09d6860f91d7a830c11bdddc9">jointPosition</a>; <span class="comment">//N total Dofs</span></div><div class="line"><a name="l00107"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a952f509c174636c1c11e8c61bda30eaa">  107</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#a952f509c174636c1c11e8c61bda30eaa">jointForce</a>; <span class="comment">//N total Dofs</span></div><div class="line"><a name="l00108"></a><span class="lineno">  108</span>&#160;</div><div class="line"><a name="l00109"></a><span class="lineno">  109</span>&#160;        <span class="comment">//application need to allocate those memory and assign them to the cache</span></div><div class="line"><a name="l00110"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#aded15e3e603ff12901cf4fe27a7eddea">  110</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#aded15e3e603ff12901cf4fe27a7eddea">coefficentMatrix</a>; </div><div class="line"><a name="l00111"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#aa90b660a293d6ce7984d49e8be270465">  111</a></span>&#160;        <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>*                 <a class="code" href="classPxArticulationCache.html#aa90b660a293d6ce7984d49e8be270465">lambda</a>;</div><div class="line"><a name="l00112"></a><span class="lineno">  112</span>&#160;</div><div class="line"><a name="l00113"></a><span class="lineno">  113</span>&#160;        <span class="comment">//root link data</span></div><div class="line"><a name="l00114"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#aaf42183cc319e011cd7e2dce9bc6a1ec">  114</a></span>&#160;        <a class="code" href="structPxArticulationRootLinkData.html">PxArticulationRootLinkData</a>  <a class="code" href="classPxArticulationCache.html#aaf42183cc319e011cd7e2dce9bc6a1ec">rootLinkData</a>;</div><div class="line"><a name="l00115"></a><span class="lineno">  115</span>&#160;</div><div class="line"><a name="l00116"></a><span class="lineno">  116</span>&#160;        <span class="comment">//These three members won&#39;t be set to zero when zeroCache get called </span></div><div class="line"><a name="l00117"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#a237505e945f16c9afddb3c431f9d6a7b">  117</a></span>&#160;        <span class="keywordtype">void</span>*                   <a class="code" href="classPxArticulationCache.html#a237505e945f16c9afddb3c431f9d6a7b">scratchMemory</a>;      <span class="comment">//this is used for internal calculation</span></div><div class="line"><a name="l00118"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#ac5356bfc4c7b57dec9bdd33c698ae96c">  118</a></span>&#160;        <span class="keywordtype">void</span>*                   <a class="code" href="classPxArticulationCache.html#ac5356bfc4c7b57dec9bdd33c698ae96c">scratchAllocator</a>;</div><div class="line"><a name="l00119"></a><span class="lineno"><a class="line" href="classPxArticulationCache.html#af8becefcb89c7a26df4372809e31b343">  119</a></span>&#160;        <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                   <a class="code" href="classPxArticulationCache.html#af8becefcb89c7a26df4372809e31b343">version</a>; <span class="comment">//cache version. If the articulation configulation change, the cache is invalid</span></div><div class="line"><a name="l00120"></a><span class="lineno">  120</span>&#160;</div><div class="line"><a name="l00121"></a><span class="lineno">  121</span>&#160;        </div><div class="line"><a name="l00122"></a><span class="lineno">  122</span>&#160;    };</div><div class="line"><a name="l00123"></a><span class="lineno">  123</span>&#160;</div><div class="line"><a name="l00124"></a><span class="lineno"><a class="line" href="group__physics.html#ga1619e335e650ffbd52cd4961bc0dea9a">  124</a></span>&#160;    <span class="keyword">typedef</span> <a class="code" href="classPxFlags.html">PxFlags&lt;PxArticulationCache::Enum, PxU8&gt;</a> <a class="code" href="group__physics.html#ga1619e335e650ffbd52cd4961bc0dea9a">PxArticulationCacheFlags</a>;</div><div class="line"><a name="l00125"></a><span class="lineno">  125</span>&#160;    PX_FLAGS_OPERATORS(<a class="code" href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14e">PxArticulationCache::Enum</a>, <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>)</div><div class="line"><a name="l00126"></a><span class="lineno">  126</span>&#160;</div><div class="line"><a name="l00127"></a><span class="lineno">  127</span>&#160;</div><div class="line"><a name="l00128"></a><span class="lineno">  128</span>&#160;    </div><div class="line"><a name="l00139"></a><span class="lineno">  139</span>&#160;<span class="preprocessor">#if PX_VC</span></div><div class="line"><a name="l00140"></a><span class="lineno">  140</span>&#160;<span class="preprocessor">#pragma warning(push)</span></div><div class="line"><a name="l00141"></a><span class="lineno">  141</span>&#160;<span class="preprocessor">#pragma warning(disable : 4435)</span></div><div class="line"><a name="l00142"></a><span class="lineno">  142</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00143"></a><span class="lineno">  143</span>&#160;</div><div class="line"><a name="l00144"></a><span class="lineno"><a class="line" href="classPxArticulationReducedCoordinate.html">  144</a></span>&#160;    <span class="keyword">class </span><a class="code" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a> : <span class="keyword">public</span> <a class="code" href="classPxArticulationBase.html">PxArticulationBase</a></div><div class="line"><a name="l00145"></a><span class="lineno">  145</span>&#160;    {</div><div class="line"><a name="l00146"></a><span class="lineno">  146</span>&#160;    <span class="keyword">public</span>:</div><div class="line"><a name="l00147"></a><span class="lineno">  147</span>&#160;</div><div class="line"><a name="l00148"></a><span class="lineno">  148</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                            release() = 0;</div><div class="line"><a name="l00149"></a><span class="lineno">  149</span>&#160;</div><div class="line"><a name="l00150"></a><span class="lineno">  150</span>&#160;</div><div class="line"><a name="l00157"></a><span class="lineno">  157</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                            setArticulationFlags(<a class="code" href="classPxFlags.html">PxArticulationFlags</a> <a class="code" href="PxConstraintDesc_8h.html#acfdd81caa30ceb0af5fafb4064b1bc67">flags</a>) = 0;</div><div class="line"><a name="l00158"></a><span class="lineno">  158</span>&#160;</div><div class="line"><a name="l00166"></a><span class="lineno">  166</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                            setArticulationFlag(<a class="code" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609">PxArticulationFlag::Enum</a> flag, <span class="keywordtype">bool</span> value) = 0;</div><div class="line"><a name="l00167"></a><span class="lineno">  167</span>&#160;</div><div class="line"><a name="l00171"></a><span class="lineno">  171</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="classPxFlags.html">PxArticulationFlags</a>             getArticulationFlags() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00172"></a><span class="lineno">  172</span>&#160;</div><div class="line"><a name="l00176"></a><span class="lineno">  176</span>&#160;        <span class="keyword">virtual</span>         <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                       getDofs() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00177"></a><span class="lineno">  177</span>&#160;</div><div class="line"><a name="l00183"></a><span class="lineno">  183</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>*            createCache() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00184"></a><span class="lineno">  184</span>&#160;</div><div class="line"><a name="l00190"></a><span class="lineno">  190</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                           getCacheDataSize() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00191"></a><span class="lineno">  191</span>&#160;</div><div class="line"><a name="l00197"></a><span class="lineno">  197</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                            zeroCache(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) = 0;</div><div class="line"><a name="l00198"></a><span class="lineno">  198</span>&#160;</div><div class="line"><a name="l00199"></a><span class="lineno">  199</span>&#160;    </div><div class="line"><a name="l00209"></a><span class="lineno">  209</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                            applyCache(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache, <span class="keyword">const</span> <a class="code" href="classPxFlags.html">PxArticulationCacheFlags</a> flag, <span class="keywordtype">bool</span> autowake = <span class="keyword">true</span>) = 0;</div><div class="line"><a name="l00210"></a><span class="lineno">  210</span>&#160;</div><div class="line"><a name="l00219"></a><span class="lineno">  219</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        copyInternalStateToCache(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache, <span class="keyword">const</span> <a class="code" href="classPxFlags.html">PxArticulationCacheFlags</a> flag) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00220"></a><span class="lineno">  220</span>&#160;    </div><div class="line"><a name="l00221"></a><span class="lineno">  221</span>&#160;</div><div class="line"><a name="l00229"></a><span class="lineno">  229</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        releaseCache(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00230"></a><span class="lineno">  230</span>&#160;    </div><div class="line"><a name="l00231"></a><span class="lineno">  231</span>&#160;</div><div class="line"><a name="l00237"></a><span class="lineno">  237</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        packJointData(<span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* maximum, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* reduced) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00238"></a><span class="lineno">  238</span>&#160;</div><div class="line"><a name="l00244"></a><span class="lineno">  244</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        unpackJointData(<span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* reduced, <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* maximum) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00245"></a><span class="lineno">  245</span>&#160;</div><div class="line"><a name="l00249"></a><span class="lineno">  249</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        commonInit() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00250"></a><span class="lineno">  250</span>&#160;</div><div class="line"><a name="l00251"></a><span class="lineno">  251</span>&#160;</div><div class="line"><a name="l00260"></a><span class="lineno">  260</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeGeneralizedGravityForce(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00261"></a><span class="lineno">  261</span>&#160;</div><div class="line"><a name="l00270"></a><span class="lineno">  270</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeCoriolisAndCentrifugalForce(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00271"></a><span class="lineno">  271</span>&#160;</div><div class="line"><a name="l00280"></a><span class="lineno">  280</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeGeneralizedExternalForce(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00289"></a><span class="lineno">  289</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeJointAcceleration(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00290"></a><span class="lineno">  290</span>&#160;</div><div class="line"><a name="l00291"></a><span class="lineno">  291</span>&#160;</div><div class="line"><a name="l00301"></a><span class="lineno">  301</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeJointForce(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00302"></a><span class="lineno">  302</span>&#160;</div><div class="line"><a name="l00310"></a><span class="lineno">  310</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeKinematicJacobian(<span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> linkID, <a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00311"></a><span class="lineno">  311</span>&#160;</div><div class="line"><a name="l00312"></a><span class="lineno">  312</span>&#160;</div><div class="line"><a name="l00318"></a><span class="lineno">  318</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeCoefficentMatrix(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00319"></a><span class="lineno">  319</span>&#160;</div><div class="line"><a name="l00320"></a><span class="lineno">  320</span>&#160;        </div><div class="line"><a name="l00329"></a><span class="lineno">  329</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>                        computeLambda(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache, <a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; initialState, <span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* <span class="keyword">const</span> jointTorque, <span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> maxIter) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00330"></a><span class="lineno">  330</span>&#160;        </div><div class="line"><a name="l00337"></a><span class="lineno">  337</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        computeGeneralizedMassMatrix(<a class="code" href="classPxArticulationCache.html">PxArticulationCache</a>&amp; cache) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00338"></a><span class="lineno">  338</span>&#160;    </div><div class="line"><a name="l00345"></a><span class="lineno">  345</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        addLoopJoint(<a class="code" href="classPxJoint.html">PxJoint</a>* joint) = 0;</div><div class="line"><a name="l00346"></a><span class="lineno">  346</span>&#160;</div><div class="line"><a name="l00353"></a><span class="lineno">  353</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        removeLoopJoint(<a class="code" href="classPxJoint.html">PxJoint</a>* joint) = 0;</div><div class="line"><a name="l00354"></a><span class="lineno">  354</span>&#160;</div><div class="line"><a name="l00360"></a><span class="lineno">  360</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                       getNbLoopJoints() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00361"></a><span class="lineno">  361</span>&#160;</div><div class="line"><a name="l00375"></a><span class="lineno">  375</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                       getLoopJoints(<a class="code" href="classPxJoint.html">PxJoint</a>** userBuffer, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> bufferSize, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> startIndex = 0) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00376"></a><span class="lineno">  376</span>&#160;</div><div class="line"><a name="l00382"></a><span class="lineno">  382</span>&#160;        <span class="keyword">virtual</span>     <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>                       getCoefficentMatrixSize() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00383"></a><span class="lineno">  383</span>&#160;</div><div class="line"><a name="l00391"></a><span class="lineno">  391</span>&#160;        <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                        teleportRootLink(<span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; pose, <span class="keywordtype">bool</span> autowake) = 0;</div><div class="line"><a name="l00392"></a><span class="lineno">  392</span>&#160;</div><div class="line"><a name="l00393"></a><span class="lineno">  393</span>&#160;    <span class="keyword">protected</span>:</div><div class="line"><a name="l00394"></a><span class="lineno"><a class="line" href="classPxArticulationReducedCoordinate.html#a3202bc9b19b72c88c7ea32421dce3cc9">  394</a></span>&#160;        <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationReducedCoordinate.html#a3202bc9b19b72c88c7ea32421dce3cc9">PxArticulationReducedCoordinate</a>(<a class="code" href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a> concreteType, <a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxArticulationBase.html">PxArticulationBase</a>(concreteType, baseFlags) {}</div><div class="line"><a name="l00395"></a><span class="lineno"><a class="line" href="classPxArticulationReducedCoordinate.html#a51b6d2bcecd81b268219e7a5d1d8f10c">  395</a></span>&#160;        <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                   <a class="code" href="classPxArticulationReducedCoordinate.html#a51b6d2bcecd81b268219e7a5d1d8f10c">PxArticulationReducedCoordinate</a>(<a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxArticulationBase.html">PxArticulationBase</a>(baseFlags) {}</div><div class="line"><a name="l00396"></a><span class="lineno"><a class="line" href="classPxArticulationReducedCoordinate.html#a1a6a5ea9fbce5fab1389809cde5ce12a">  396</a></span>&#160;        <span class="keyword">virtual</span>                     <a class="code" href="classPxArticulationReducedCoordinate.html#a1a6a5ea9fbce5fab1389809cde5ce12a">~PxArticulationReducedCoordinate</a>() {}</div><div class="line"><a name="l00397"></a><span class="lineno">  397</span>&#160;    };</div><div class="line"><a name="l00398"></a><span class="lineno">  398</span>&#160;</div><div class="line"><a name="l00399"></a><span class="lineno">  399</span>&#160;<span class="preprocessor">#if PX_VC</span></div><div class="line"><a name="l00400"></a><span class="lineno">  400</span>&#160;<span class="preprocessor">#pragma warning(pop)</span></div><div class="line"><a name="l00401"></a><span class="lineno">  401</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00402"></a><span class="lineno">  402</span>&#160;</div><div class="line"><a name="l00403"></a><span class="lineno">  403</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00404"></a><span class="lineno">  404</span>&#160;} <span class="comment">// namespace physx</span></div><div class="line"><a name="l00405"></a><span class="lineno">  405</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00406"></a><span class="lineno">  406</span>&#160;</div><div class="line"><a name="l00407"></a><span class="lineno">  407</span>&#160;</div><div class="line"><a name="l00409"></a><span class="lineno">  409</span>&#160;<span class="preprocessor">#endif</span></div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
 <div class="ttc" id="PxArticulationBase_8h_html"><div class="ttname"><a href="PxArticulationBase_8h.html">PxArticulationBase.h</a></div></div>
 <div class="ttc" id="classPxKinematicJacobian_html_aaf6d1dd5689d3f82863a1fac901392f7"><div class="ttname"><a href="classPxKinematicJacobian.html#aaf6d1dd5689d3f82863a1fac901392f7">PxKinematicJacobian::nbColumns</a></div><div class="ttdeci">PxU32 nbColumns</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:73</div></div>
 <div class="ttc" id="classPxArticulationCache_html_af8becefcb89c7a26df4372809e31b343"><div class="ttname"><a href="classPxArticulationCache.html#af8becefcb89c7a26df4372809e31b343">PxArticulationCache::version</a></div><div class="ttdeci">PxU32 version</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:119</div></div>
@@ -94,7 +94,7 @@
 <div class="ttc" id="classPxArticulationCache_html_aab37dabd7db43ef927337da7f6ea9f5a"><div class="ttname"><a href="classPxArticulationCache.html#aab37dabd7db43ef927337da7f6ea9f5a">PxArticulationCache::jointVelocity</a></div><div class="ttdeci">PxReal * jointVelocity</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:104</div></div>
 <div class="ttc" id="namespacephysx_html_a727d2d8426e2a21ebbc522fa65c3f97a"><div class="ttname"><a href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">physx::PxReal</a></div><div class="ttdeci">float PxReal</div><div class="ttdef"><b>Definition:</b> PxSimpleTypes.h:78</div></div>
 <div class="ttc" id="classPxKinematicJacobian_html"><div class="ttname"><a href="classPxKinematicJacobian.html">PxKinematicJacobian</a></div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:67</div></div>
-<div class="ttc" id="classPxArticulationReducedCoordinate_html_a3202bc9b19b72c88c7ea32421dce3cc9"><div class="ttname"><a href="classPxArticulationReducedCoordinate.html#a3202bc9b19b72c88c7ea32421dce3cc9">PxArticulationReducedCoordinate::PxArticulationReducedCoordinate</a></div><div class="ttdeci">PX_INLINE PxArticulationReducedCoordinate(PxType concreteType, PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:381</div></div>
+<div class="ttc" id="classPxArticulationReducedCoordinate_html_a3202bc9b19b72c88c7ea32421dce3cc9"><div class="ttname"><a href="classPxArticulationReducedCoordinate.html#a3202bc9b19b72c88c7ea32421dce3cc9">PxArticulationReducedCoordinate::PxArticulationReducedCoordinate</a></div><div class="ttdeci">PX_INLINE PxArticulationReducedCoordinate(PxType concreteType, PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:394</div></div>
 <div class="ttc" id="structPxArticulationRootLinkData_html_af0dcfa25c769c89fd405eeaed52b3c1f"><div class="ttname"><a href="structPxArticulationRootLinkData.html#af0dcfa25c769c89fd405eeaed52b3c1f">PxArticulationRootLinkData::transform</a></div><div class="ttdeci">PxTransform transform</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:79</div></div>
 <div class="ttc" id="structPxArticulationFlag_html"><div class="ttname"><a href="structPxArticulationFlag.html">PxArticulationFlag</a></div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:52</div></div>
 <div class="ttc" id="group__physics_html_ga40a321fce7bcb9669d32fca769d2958a"><div class="ttname"><a href="group__physics.html#ga40a321fce7bcb9669d32fca769d2958a">PxArticulationFlags</a></div><div class="ttdeci">PxFlags&lt; PxArticulationFlag::Enum, PxU8 &gt; PxArticulationFlags</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:61</div></div>
@@ -107,7 +107,7 @@
 <div class="ttc" id="classPxJoint_html"><div class="ttname"><a href="classPxJoint.html">PxJoint</a></div><div class="ttdoc">a base interface providing common functionality for PhysX joints </div><div class="ttdef"><b>Definition:</b> PxJoint.h:101</div></div>
 <div class="ttc" id="classPxArticulationReducedCoordinate_html"><div class="ttname"><a href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></div><div class="ttdoc">a tree structure of bodies connected by joints that is treated as a unit by the dynamics solver ...</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:144</div></div>
 <div class="ttc" id="PxConstraintDesc_8h_html_acfdd81caa30ceb0af5fafb4064b1bc67"><div class="ttname"><a href="PxConstraintDesc_8h.html#acfdd81caa30ceb0af5fafb4064b1bc67">flags</a></div><div class="ttdeci">PxU16 flags</div><div class="ttdoc">a set of Px1DConstraintFlags </div><div class="ttdef"><b>Definition:</b> PxConstraintDesc.h:110</div></div>
-<div class="ttc" id="classPxArticulationReducedCoordinate_html_a1a6a5ea9fbce5fab1389809cde5ce12a"><div class="ttname"><a href="classPxArticulationReducedCoordinate.html#a1a6a5ea9fbce5fab1389809cde5ce12a">PxArticulationReducedCoordinate::~PxArticulationReducedCoordinate</a></div><div class="ttdeci">virtual ~PxArticulationReducedCoordinate()</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:383</div></div>
+<div class="ttc" id="classPxArticulationReducedCoordinate_html_a1a6a5ea9fbce5fab1389809cde5ce12a"><div class="ttname"><a href="classPxArticulationReducedCoordinate.html#a1a6a5ea9fbce5fab1389809cde5ce12a">PxArticulationReducedCoordinate::~PxArticulationReducedCoordinate</a></div><div class="ttdeci">virtual ~PxArticulationReducedCoordinate()</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:396</div></div>
 <div class="ttc" id="classPxArticulationCache_html_aded15e3e603ff12901cf4fe27a7eddea"><div class="ttname"><a href="classPxArticulationCache.html#aded15e3e603ff12901cf4fe27a7eddea">PxArticulationCache::coefficentMatrix</a></div><div class="ttdeci">PxReal * coefficentMatrix</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:110</div></div>
 <div class="ttc" id="classPxArticulationCache_html_aa90b660a293d6ce7984d49e8be270465"><div class="ttname"><a href="classPxArticulationCache.html#aa90b660a293d6ce7984d49e8be270465">PxArticulationCache::lambda</a></div><div class="ttdeci">PxReal * lambda</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:111</div></div>
 <div class="ttc" id="namespacephysx_html_a3849f86abc21d3a58949481603fe8309"><div class="ttname"><a href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">physx::PxU8</a></div><div class="ttdeci">uint8_t PxU8</div><div class="ttdef"><b>Definition:</b> PxSimpleTypes.h:75</div></div>
@@ -119,7 +119,7 @@
 <div class="ttc" id="PxTransform_8h_html"><div class="ttname"><a href="PxTransform_8h.html">PxTransform.h</a></div></div>
 <div class="ttc" id="classPxArticulationCache_html_a17beddf109c14739cacb3ddc833ea14e"><div class="ttname"><a href="classPxArticulationCache.html#a17beddf109c14739cacb3ddc833ea14e">PxArticulationCache::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:89</div></div>
 <div class="ttc" id="classPxArticulationCache_html_a237505e945f16c9afddb3c431f9d6a7b"><div class="ttname"><a href="classPxArticulationCache.html#a237505e945f16c9afddb3c431f9d6a7b">PxArticulationCache::scratchMemory</a></div><div class="ttdeci">void * scratchMemory</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:117</div></div>
-<div class="ttc" id="classPxArticulationReducedCoordinate_html_a51b6d2bcecd81b268219e7a5d1d8f10c"><div class="ttname"><a href="classPxArticulationReducedCoordinate.html#a51b6d2bcecd81b268219e7a5d1d8f10c">PxArticulationReducedCoordinate::PxArticulationReducedCoordinate</a></div><div class="ttdeci">PX_INLINE PxArticulationReducedCoordinate(PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:382</div></div>
+<div class="ttc" id="classPxArticulationReducedCoordinate_html_a51b6d2bcecd81b268219e7a5d1d8f10c"><div class="ttname"><a href="classPxArticulationReducedCoordinate.html#a51b6d2bcecd81b268219e7a5d1d8f10c">PxArticulationReducedCoordinate::PxArticulationReducedCoordinate</a></div><div class="ttdeci">PX_INLINE PxArticulationReducedCoordinate(PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:395</div></div>
 <div class="ttc" id="namespaceCm_html"><div class="ttname"><a href="namespaceCm.html">Cm</a></div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:45</div></div>
 <div class="ttc" id="classPxArticulationBase_html"><div class="ttname"><a href="classPxArticulationBase.html">PxArticulationBase</a></div><div class="ttdoc">a tree structure of bodies connected by joints that is treated as a unit by the dynamics solver ...</div><div class="ttdef"><b>Definition:</b> PxArticulationBase.h:57</div></div>
 <div class="ttc" id="classPxArticulationCache_html_aaf42183cc319e011cd7e2dce9bc6a1ec"><div class="ttname"><a href="classPxArticulationCache.html#aaf42183cc319e011cd7e2dce9bc6a1ec">PxArticulationCache::rootLinkData</a></div><div class="ttdeci">PxArticulationRootLinkData rootLinkData</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:114</div></div>
diff --git a/physx/documentation/PhysXAPI/files/PxMeshQuery_8h_source.html b/physx/documentation/PhysXAPI/files/PxMeshQuery_8h_source.html
index 412ae3535..9064122a3 100644
--- a/physx/documentation/PhysXAPI/files/PxMeshQuery_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxMeshQuery_8h_source.html
@@ -86,7 +86,7 @@
 <div class="title">PxMeshQuery.h</div>  </div>
 </div><!--header-->
 <div class="contents">
-<a href="PxMeshQuery_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  </span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;</div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#ifndef PX_PHYSICS_GEOMUTILS_PX_MESH_QUERY</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;<span class="preprocessor">#define PX_PHYSICS_GEOMUTILS_PX_MESH_QUERY</span></div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxPhysXCommonConfig_8h.html">common/PxPhysXCommonConfig.h</a>&quot;</span></div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxQueryReport_8h.html">PxQueryReport.h</a>&quot;</span></div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;</div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;{</div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00045"></a><span class="lineno">   45</span>&#160;</div><div class="line"><a name="l00046"></a><span class="lineno">   46</span>&#160;<span class="keyword">class </span><a class="code" href="classPxGeometry.html">PxGeometry</a>;</div><div class="line"><a name="l00047"></a><span class="lineno">   47</span>&#160;<span class="keyword">class </span><a class="code" href="classPxConvexMeshGeometry.html">PxConvexMeshGeometry</a>;</div><div class="line"><a name="l00048"></a><span class="lineno">   48</span>&#160;<span class="keyword">class </span><a class="code" href="classPxTriangleMeshGeometry.html">PxTriangleMeshGeometry</a>;</div><div class="line"><a name="l00049"></a><span class="lineno">   49</span>&#160;<span class="keyword">class </span><a class="code" href="classPxHeightFieldGeometry.html">PxHeightFieldGeometry</a>;</div><div class="line"><a name="l00050"></a><span class="lineno">   50</span>&#160;</div><div class="line"><a name="l00051"></a><span class="lineno">   51</span>&#160;<span class="keyword">class </span><a class="code" href="classPxTriangle.html">PxTriangle</a>;</div><div class="line"><a name="l00052"></a><span class="lineno">   52</span>&#160;</div><div class="line"><a name="l00053"></a><span class="lineno"><a class="line" href="classPxMeshQuery.html">   53</a></span>&#160;<span class="keyword">class </span><a class="code" href="classPxMeshQuery.html">PxMeshQuery</a></div><div class="line"><a name="l00054"></a><span class="lineno">   54</span>&#160;{</div><div class="line"><a name="l00055"></a><span class="lineno">   55</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00056"></a><span class="lineno">   56</span>&#160;</div><div class="line"><a name="l00073"></a><span class="lineno">   73</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <span class="keywordtype">void</span> getTriangle(<span class="keyword">const</span> <a class="code" href="classPxTriangleMeshGeometry.html">PxTriangleMeshGeometry</a>&amp; triGeom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; transform, <a class="code" href="group__common.html#ga19403877bf7ce42d7240e4e4c758c016">PxTriangleID</a> triangleIndex, <a class="code" href="classPxTriangle.html">PxTriangle</a>&amp; triangle, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* vertexIndices=NULL, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* adjacencyIndices=NULL);</div><div class="line"><a name="l00074"></a><span class="lineno">   74</span>&#160;</div><div class="line"><a name="l00075"></a><span class="lineno">   75</span>&#160;</div><div class="line"><a name="l00092"></a><span class="lineno">   92</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <span class="keywordtype">void</span> getTriangle(<span class="keyword">const</span> <a class="code" href="classPxHeightFieldGeometry.html">PxHeightFieldGeometry</a>&amp; hfGeom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; transform, <a class="code" href="group__common.html#ga19403877bf7ce42d7240e4e4c758c016">PxTriangleID</a> triangleIndex, <a class="code" href="classPxTriangle.html">PxTriangle</a>&amp; triangle, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* vertexIndices=NULL, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* adjacencyIndices=NULL);</div><div class="line"><a name="l00093"></a><span class="lineno">   93</span>&#160;</div><div class="line"><a name="l00094"></a><span class="lineno">   94</span>&#160;</div><div class="line"><a name="l00112"></a><span class="lineno">  112</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> findOverlapTriangleMesh(   <span class="keyword">const</span> <a class="code" href="classPxGeometry.html">PxGeometry</a>&amp; geom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; geomPose,</div><div class="line"><a name="l00113"></a><span class="lineno">  113</span>&#160;                                                                <span class="keyword">const</span> <a class="code" href="classPxTriangleMeshGeometry.html">PxTriangleMeshGeometry</a>&amp; meshGeom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; meshPose,</div><div class="line"><a name="l00114"></a><span class="lineno">  114</span>&#160;                                                                <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* results, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> maxResults, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> startIndex, <span class="keywordtype">bool</span>&amp; overflow);</div><div class="line"><a name="l00115"></a><span class="lineno">  115</span>&#160;</div><div class="line"><a name="l00133"></a><span class="lineno">  133</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> findOverlapHeightField(<span class="keyword">const</span> <a class="code" href="classPxGeometry.html">PxGeometry</a>&amp; geom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; geomPose,</div><div class="line"><a name="l00134"></a><span class="lineno">  134</span>&#160;                                                            <span class="keyword">const</span> <a class="code" href="classPxHeightFieldGeometry.html">PxHeightFieldGeometry</a>&amp; hfGeom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; hfPose,</div><div class="line"><a name="l00135"></a><span class="lineno">  135</span>&#160;                                                            <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* results, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> maxResults, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> startIndex, <span class="keywordtype">bool</span>&amp; overflow);</div><div class="line"><a name="l00136"></a><span class="lineno">  136</span>&#160;</div><div class="line"><a name="l00137"></a><span class="lineno">  137</span>&#160;</div><div class="line"><a name="l00170"></a><span class="lineno">  170</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <span class="keywordtype">bool</span> sweep(<span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; unitDir,</div><div class="line"><a name="l00171"></a><span class="lineno">  171</span>&#160;                            <span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> distance,</div><div class="line"><a name="l00172"></a><span class="lineno">  172</span>&#160;                            <span class="keyword">const</span> <a class="code" href="classPxGeometry.html">PxGeometry</a>&amp; geom,</div><div class="line"><a name="l00173"></a><span class="lineno">  173</span>&#160;                            <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; pose,</div><div class="line"><a name="l00174"></a><span class="lineno">  174</span>&#160;                            <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> triangleCount,</div><div class="line"><a name="l00175"></a><span class="lineno">  175</span>&#160;                            <span class="keyword">const</span> <a class="code" href="classPxTriangle.html">PxTriangle</a>* triangles,</div><div class="line"><a name="l00176"></a><span class="lineno">  176</span>&#160;                            <a class="code" href="structPxSweepHit.html">PxSweepHit</a>&amp; sweepHit,</div><div class="line"><a name="l00177"></a><span class="lineno">  177</span>&#160;                            PxHitFlags hitFlags = <a class="code" href="structPxHitFlag.html#a44c173f6ddf0522ffbd8fa3c585e6b64afbdec54374d753fed4682d855aac7270">PxHitFlag::eDEFAULT</a>,</div><div class="line"><a name="l00178"></a><span class="lineno">  178</span>&#160;                            <span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* cachedIndex = NULL,</div><div class="line"><a name="l00179"></a><span class="lineno">  179</span>&#160;                            <span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> inflation = 0.0f,</div><div class="line"><a name="l00180"></a><span class="lineno">  180</span>&#160;                            <span class="keywordtype">bool</span> doubleSided = <span class="keyword">false</span>);</div><div class="line"><a name="l00181"></a><span class="lineno">  181</span>&#160;};</div><div class="line"><a name="l00182"></a><span class="lineno">  182</span>&#160;</div><div class="line"><a name="l00183"></a><span class="lineno">  183</span>&#160;</div><div class="line"><a name="l00184"></a><span class="lineno">  184</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00185"></a><span class="lineno">  185</span>&#160;}</div><div class="line"><a name="l00186"></a><span class="lineno">  186</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00187"></a><span class="lineno">  187</span>&#160;</div><div class="line"><a name="l00189"></a><span class="lineno">  189</span>&#160;<span class="preprocessor">#endif</span></div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
+<a href="PxMeshQuery_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  </span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;</div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#ifndef PX_PHYSICS_GEOMUTILS_PX_MESH_QUERY</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;<span class="preprocessor">#define PX_PHYSICS_GEOMUTILS_PX_MESH_QUERY</span></div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxPhysXCommonConfig_8h.html">common/PxPhysXCommonConfig.h</a>&quot;</span></div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxQueryReport_8h.html">PxQueryReport.h</a>&quot;</span></div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;</div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;{</div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00045"></a><span class="lineno">   45</span>&#160;</div><div class="line"><a name="l00046"></a><span class="lineno">   46</span>&#160;<span class="keyword">class </span><a class="code" href="classPxGeometry.html">PxGeometry</a>;</div><div class="line"><a name="l00047"></a><span class="lineno">   47</span>&#160;<span class="keyword">class </span><a class="code" href="classPxConvexMeshGeometry.html">PxConvexMeshGeometry</a>;</div><div class="line"><a name="l00048"></a><span class="lineno">   48</span>&#160;<span class="keyword">class </span><a class="code" href="classPxTriangleMeshGeometry.html">PxTriangleMeshGeometry</a>;</div><div class="line"><a name="l00049"></a><span class="lineno">   49</span>&#160;<span class="keyword">class </span><a class="code" href="classPxHeightFieldGeometry.html">PxHeightFieldGeometry</a>;</div><div class="line"><a name="l00050"></a><span class="lineno">   50</span>&#160;</div><div class="line"><a name="l00051"></a><span class="lineno">   51</span>&#160;<span class="keyword">class </span><a class="code" href="classPxTriangle.html">PxTriangle</a>;</div><div class="line"><a name="l00052"></a><span class="lineno">   52</span>&#160;</div><div class="line"><a name="l00053"></a><span class="lineno"><a class="line" href="classPxMeshQuery.html">   53</a></span>&#160;<span class="keyword">class </span><a class="code" href="classPxMeshQuery.html">PxMeshQuery</a></div><div class="line"><a name="l00054"></a><span class="lineno">   54</span>&#160;{</div><div class="line"><a name="l00055"></a><span class="lineno">   55</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00056"></a><span class="lineno">   56</span>&#160;</div><div class="line"><a name="l00073"></a><span class="lineno">   73</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <span class="keywordtype">void</span> getTriangle(<span class="keyword">const</span> <a class="code" href="classPxTriangleMeshGeometry.html">PxTriangleMeshGeometry</a>&amp; triGeom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; transform, <a class="code" href="group__common.html#ga19403877bf7ce42d7240e4e4c758c016">PxTriangleID</a> triangleIndex, <a class="code" href="classPxTriangle.html">PxTriangle</a>&amp; triangle, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* vertexIndices=NULL, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* adjacencyIndices=NULL);</div><div class="line"><a name="l00074"></a><span class="lineno">   74</span>&#160;</div><div class="line"><a name="l00075"></a><span class="lineno">   75</span>&#160;</div><div class="line"><a name="l00104"></a><span class="lineno">  104</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <span class="keywordtype">void</span> getTriangle(<span class="keyword">const</span> <a class="code" href="classPxHeightFieldGeometry.html">PxHeightFieldGeometry</a>&amp; hfGeom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; transform, <a class="code" href="group__common.html#ga19403877bf7ce42d7240e4e4c758c016">PxTriangleID</a> triangleIndex, <a class="code" href="classPxTriangle.html">PxTriangle</a>&amp; triangle, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* vertexIndices=NULL, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* adjacencyIndices=NULL);</div><div class="line"><a name="l00105"></a><span class="lineno">  105</span>&#160;</div><div class="line"><a name="l00106"></a><span class="lineno">  106</span>&#160;</div><div class="line"><a name="l00124"></a><span class="lineno">  124</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> findOverlapTriangleMesh(   <span class="keyword">const</span> <a class="code" href="classPxGeometry.html">PxGeometry</a>&amp; geom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; geomPose,</div><div class="line"><a name="l00125"></a><span class="lineno">  125</span>&#160;                                                                <span class="keyword">const</span> <a class="code" href="classPxTriangleMeshGeometry.html">PxTriangleMeshGeometry</a>&amp; meshGeom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; meshPose,</div><div class="line"><a name="l00126"></a><span class="lineno">  126</span>&#160;                                                                <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* results, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> maxResults, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> startIndex, <span class="keywordtype">bool</span>&amp; overflow);</div><div class="line"><a name="l00127"></a><span class="lineno">  127</span>&#160;</div><div class="line"><a name="l00145"></a><span class="lineno">  145</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> findOverlapHeightField(<span class="keyword">const</span> <a class="code" href="classPxGeometry.html">PxGeometry</a>&amp; geom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; geomPose,</div><div class="line"><a name="l00146"></a><span class="lineno">  146</span>&#160;                                                            <span class="keyword">const</span> <a class="code" href="classPxHeightFieldGeometry.html">PxHeightFieldGeometry</a>&amp; hfGeom, <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; hfPose,</div><div class="line"><a name="l00147"></a><span class="lineno">  147</span>&#160;                                                            <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* results, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> maxResults, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> startIndex, <span class="keywordtype">bool</span>&amp; overflow);</div><div class="line"><a name="l00148"></a><span class="lineno">  148</span>&#160;</div><div class="line"><a name="l00149"></a><span class="lineno">  149</span>&#160;</div><div class="line"><a name="l00182"></a><span class="lineno">  182</span>&#160;    <a class="code" href="group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4">PX_PHYSX_COMMON_API</a> <span class="keyword">static</span> <span class="keywordtype">bool</span> sweep(<span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; unitDir,</div><div class="line"><a name="l00183"></a><span class="lineno">  183</span>&#160;                            <span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> distance,</div><div class="line"><a name="l00184"></a><span class="lineno">  184</span>&#160;                            <span class="keyword">const</span> <a class="code" href="classPxGeometry.html">PxGeometry</a>&amp; geom,</div><div class="line"><a name="l00185"></a><span class="lineno">  185</span>&#160;                            <span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; pose,</div><div class="line"><a name="l00186"></a><span class="lineno">  186</span>&#160;                            <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> triangleCount,</div><div class="line"><a name="l00187"></a><span class="lineno">  187</span>&#160;                            <span class="keyword">const</span> <a class="code" href="classPxTriangle.html">PxTriangle</a>* triangles,</div><div class="line"><a name="l00188"></a><span class="lineno">  188</span>&#160;                            <a class="code" href="structPxSweepHit.html">PxSweepHit</a>&amp; sweepHit,</div><div class="line"><a name="l00189"></a><span class="lineno">  189</span>&#160;                            PxHitFlags hitFlags = <a class="code" href="structPxHitFlag.html#a44c173f6ddf0522ffbd8fa3c585e6b64afbdec54374d753fed4682d855aac7270">PxHitFlag::eDEFAULT</a>,</div><div class="line"><a name="l00190"></a><span class="lineno">  190</span>&#160;                            <span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>* cachedIndex = NULL,</div><div class="line"><a name="l00191"></a><span class="lineno">  191</span>&#160;                            <span class="keyword">const</span> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> inflation = 0.0f,</div><div class="line"><a name="l00192"></a><span class="lineno">  192</span>&#160;                            <span class="keywordtype">bool</span> doubleSided = <span class="keyword">false</span>);</div><div class="line"><a name="l00193"></a><span class="lineno">  193</span>&#160;};</div><div class="line"><a name="l00194"></a><span class="lineno">  194</span>&#160;</div><div class="line"><a name="l00195"></a><span class="lineno">  195</span>&#160;</div><div class="line"><a name="l00196"></a><span class="lineno">  196</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00197"></a><span class="lineno">  197</span>&#160;}</div><div class="line"><a name="l00198"></a><span class="lineno">  198</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00199"></a><span class="lineno">  199</span>&#160;</div><div class="line"><a name="l00201"></a><span class="lineno">  201</span>&#160;<span class="preprocessor">#endif</span></div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
 <div class="ttc" id="PxPhysXCommonConfig_8h_html"><div class="ttname"><a href="PxPhysXCommonConfig_8h.html">PxPhysXCommonConfig.h</a></div></div>
 <div class="ttc" id="namespacephysx_html_a727d2d8426e2a21ebbc522fa65c3f97a"><div class="ttname"><a href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">physx::PxReal</a></div><div class="ttdeci">float PxReal</div><div class="ttdef"><b>Definition:</b> PxSimpleTypes.h:78</div></div>
 <div class="ttc" id="PxQueryReport_8h_html"><div class="ttname"><a href="PxQueryReport_8h.html">PxQueryReport.h</a></div></div>
diff --git a/physx/documentation/PhysXAPI/files/PxPhysXConfig_8h_source.html b/physx/documentation/PhysXAPI/files/PxPhysXConfig_8h_source.html
index a62c8f202..b8aed823d 100644
--- a/physx/documentation/PhysXAPI/files/PxPhysXConfig_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxPhysXConfig_8h_source.html
@@ -91,13 +91,13 @@
 <div class="ttc" id="classPxScene_html"><div class="ttname"><a href="classPxScene.html">PxScene</a></div><div class="ttdoc">A scene is a collection of bodies and constraints which can interact. </div><div class="ttdef"><b>Definition:</b> PxScene.h:169</div></div>
 <div class="ttc" id="PxPhysXCommonConfig_8h_html"><div class="ttname"><a href="PxPhysXCommonConfig_8h.html">PxPhysXCommonConfig.h</a></div></div>
 <div class="ttc" id="classPxRigidDynamic_html"><div class="ttname"><a href="classPxRigidDynamic.html">PxRigidDynamic</a></div><div class="ttdoc">PxRigidDynamic represents a dynamic rigid simulation object in the physics SDK. </div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:83</div></div>
-<div class="ttc" id="classPxArticulationJoint_html"><div class="ttname"><a href="classPxArticulationJoint.html">PxArticulationJoint</a></div><div class="ttdoc">a joint between two links in an articulation. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:195</div></div>
+<div class="ttc" id="classPxArticulationJoint_html"><div class="ttname"><a href="classPxArticulationJoint.html">PxArticulationJoint</a></div><div class="ttdoc">a joint between two links in an articulation. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:193</div></div>
 <div class="ttc" id="classPxActor_html"><div class="ttname"><a href="classPxActor.html">PxActor</a></div><div class="ttdoc">PxActor is the base class for the main simulation objects in the physics SDK. </div><div class="ttdef"><b>Definition:</b> PxActor.h:154</div></div>
 <div class="ttc" id="classPxShape_html"><div class="ttname"><a href="classPxShape.html">PxShape</a></div><div class="ttdoc">Abstract class for collision shapes. </div><div class="ttdef"><b>Definition:</b> PxShape.h:142</div></div>
 <div class="ttc" id="classPxRigidActor_html"><div class="ttname"><a href="classPxRigidActor.html">PxRigidActor</a></div><div class="ttdoc">PxRigidActor represents a base class shared between dynamic and static rigid bodies in the physics SD...</div><div class="ttdef"><b>Definition:</b> PxRigidActor.h:58</div></div>
 <div class="ttc" id="classPxArticulationLink_html"><div class="ttname"><a href="classPxArticulationLink.html">PxArticulationLink</a></div><div class="ttdoc">a component of an articulation that represents a rigid body </div><div class="ttdef"><b>Definition:</b> PxArticulationLink.h:57</div></div>
 <div class="ttc" id="classPxArticulationReducedCoordinate_html"><div class="ttname"><a href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></div><div class="ttdoc">a tree structure of bodies connected by joints that is treated as a unit by the dynamics solver ...</div><div class="ttdef"><b>Definition:</b> PxArticulationReducedCoordinate.h:144</div></div>
-<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:112</div></div>
+<div class="ttc" id="classPxArticulationJointBase_html"><div class="ttname"><a href="classPxArticulationJointBase.html">PxArticulationJointBase</a></div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:110</div></div>
 <div class="ttc" id="classPxConstraint_html"><div class="ttname"><a href="classPxConstraint.html">PxConstraint</a></div><div class="ttdoc">A plugin class for implementing constraints. </div><div class="ttdef"><b>Definition:</b> PxConstraint.h:108</div></div>
 <div class="ttc" id="classPxTolerancesScale_html"><div class="ttname"><a href="classPxTolerancesScale.html">PxTolerancesScale</a></div><div class="ttdoc">Class to define the scale at which simulation runs. Most simulation tolerances are calculated in term...</div><div class="ttdef"><b>Definition:</b> PxTolerancesScale.h:56</div></div>
 <div class="ttc" id="classPxMaterial_html"><div class="ttname"><a href="classPxMaterial.html">PxMaterial</a></div><div class="ttdoc">Material class to represent a set of surface properties. </div><div class="ttdef"><b>Definition:</b> PxMaterial.h:130</div></div>
diff --git a/physx/documentation/PhysXAPI/files/PxRigidDynamic_8h_source.html b/physx/documentation/PhysXAPI/files/PxRigidDynamic_8h_source.html
index 9973f9c94..179fa85d3 100644
--- a/physx/documentation/PhysXAPI/files/PxRigidDynamic_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxRigidDynamic_8h_source.html
@@ -86,28 +86,27 @@
 <div class="title">PxRigidDynamic.h</div>  </div>
 </div><!--header-->
 <div class="contents">
-<a href="PxRigidDynamic_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  </span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;</div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#ifndef PX_PHYSICS_NX_RIGIDDYNAMIC</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;<span class="preprocessor">#define PX_PHYSICS_NX_RIGIDDYNAMIC</span></div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00037"></a><span class="lineno">   37</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxRigidBody_8h.html">PxRigidBody.h</a>&quot;</span></div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;</div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;{</div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;</div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;</div><div class="line"><a name="l00050"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html">   50</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxRigidDynamicLockFlag.html">PxRigidDynamicLockFlag</a></div><div class="line"><a name="l00051"></a><span class="lineno">   51</span>&#160;{</div><div class="line"><a name="l00052"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1">   52</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1">Enum</a></div><div class="line"><a name="l00053"></a><span class="lineno">   53</span>&#160;    {</div><div class="line"><a name="l00054"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1a3c845939174811074c1829316c942efa">   54</a></span>&#160;        eLOCK_LINEAR_X = (1 &lt;&lt; 0),</div><div class="line"><a name="l00055"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1a1ea9c4f30ecf9f98b6a9057b805fee50">   55</a></span>&#160;        eLOCK_LINEAR_Y = (1 &lt;&lt; 1),</div><div class="line"><a name="l00056"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1a92189af0472e35c788923f75be44396f">   56</a></span>&#160;        eLOCK_LINEAR_Z = (1 &lt;&lt; 2),</div><div class="line"><a name="l00057"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1acd29b90fdd453256db7d95f8d168ffd4">   57</a></span>&#160;        eLOCK_ANGULAR_X = (1 &lt;&lt; 3),</div><div class="line"><a name="l00058"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1aa8df9ce7f49c12ce3f686bd862f86fe6">   58</a></span>&#160;        eLOCK_ANGULAR_Y = (1 &lt;&lt; 4),</div><div class="line"><a name="l00059"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1ae18e7d6c8b2eb5281e3f8ef8c8502133">   59</a></span>&#160;        eLOCK_ANGULAR_Z = (1 &lt;&lt; 5)</div><div class="line"><a name="l00060"></a><span class="lineno">   60</span>&#160;    };</div><div class="line"><a name="l00061"></a><span class="lineno">   61</span>&#160;};</div><div class="line"><a name="l00062"></a><span class="lineno">   62</span>&#160;</div><div class="line"><a name="l00063"></a><span class="lineno"><a class="line" href="group__physics.html#gab35e15599f740af43f399ed4ad9f04e8">   63</a></span>&#160;<span class="keyword">typedef</span> <a class="code" href="classPxFlags.html">PxFlags&lt;PxRigidDynamicLockFlag::Enum, PxU16&gt;</a> <a class="code" href="group__physics.html#gab35e15599f740af43f399ed4ad9f04e8">PxRigidDynamicLockFlags</a>;</div><div class="line"><a name="l00064"></a><span class="lineno">   64</span>&#160;PX_FLAGS_OPERATORS(<a class="code" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1">PxRigidDynamicLockFlag::Enum</a>, <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>)</div><div class="line"><a name="l00065"></a><span class="lineno">   65</span>&#160;</div><div class="line"><a name="l00066"></a><span class="lineno">   66</span>&#160;</div><div class="line"><a name="l00083"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html">   83</a></span>&#160;class <a class="code" href="classPxRigidDynamic.html">PxRigidDynamic</a> : public <a class="code" href="classPxRigidBody.html">PxRigidBody</a></div><div class="line"><a name="l00084"></a><span class="lineno">   84</span>&#160;{</div><div class="line"><a name="l00085"></a><span class="lineno">   85</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00086"></a><span class="lineno">   86</span>&#160;    <span class="comment">// Runtime modifications</span></div><div class="line"><a name="l00087"></a><span class="lineno">   87</span>&#160;</div><div class="line"><a name="l00088"></a><span class="lineno">   88</span>&#160;</div><div class="line"><a name="l00089"></a><span class="lineno">   89</span>&#160;<span class="comment">/************************************************************************************************/</span></div><div class="line"><a name="l00117"></a><span class="lineno">  117</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setKinematicTarget(<span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; destination) = 0;</div><div class="line"><a name="l00118"></a><span class="lineno">  118</span>&#160;</div><div class="line"><a name="l00127"></a><span class="lineno">  127</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>                getKinematicTarget(<a class="code" href="classPxTransform.html">PxTransform</a>&amp; target) <span class="keyword">const</span>   = 0;</div><div class="line"><a name="l00128"></a><span class="lineno">  128</span>&#160;</div><div class="line"><a name="l00139"></a><span class="lineno">  139</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setKinematicSurfaceVelocity(<span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; linearVelocity, <span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; angularVelocity) = 0;</div><div class="line"><a name="l00140"></a><span class="lineno">  140</span>&#160;</div><div class="line"><a name="l00146"></a><span class="lineno">  146</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                clearKinematicSurfaceVelocity() = 0;</div><div class="line"><a name="l00147"></a><span class="lineno">  147</span>&#160;</div><div class="line"><a name="l00148"></a><span class="lineno">  148</span>&#160;</div><div class="line"><a name="l00149"></a><span class="lineno">  149</span>&#160;<span class="comment">/************************************************************************************************/</span></div><div class="line"><a name="l00187"></a><span class="lineno">  187</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>                isSleeping() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00188"></a><span class="lineno">  188</span>&#160;</div><div class="line"><a name="l00189"></a><span class="lineno">  189</span>&#160;</div><div class="line"><a name="l00201"></a><span class="lineno">  201</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setSleepThreshold(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> threshold) = 0;</div><div class="line"><a name="l00202"></a><span class="lineno">  202</span>&#160;</div><div class="line"><a name="l00210"></a><span class="lineno">  210</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>              getSleepThreshold() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00211"></a><span class="lineno">  211</span>&#160;</div><div class="line"><a name="l00225"></a><span class="lineno">  225</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setStabilizationThreshold(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> threshold) = 0;</div><div class="line"><a name="l00226"></a><span class="lineno">  226</span>&#160;</div><div class="line"><a name="l00236"></a><span class="lineno">  236</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>              getStabilizationThreshold() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00237"></a><span class="lineno">  237</span>&#160;</div><div class="line"><a name="l00238"></a><span class="lineno">  238</span>&#160;</div><div class="line"><a name="l00248"></a><span class="lineno">  248</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxFlags.html">PxRigidDynamicLockFlags</a> getRigidDynamicLockFlags() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00249"></a><span class="lineno">  249</span>&#160;</div><div class="line"><a name="l00263"></a><span class="lineno">  263</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setRigidDynamicLockFlag(<a class="code" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1">PxRigidDynamicLockFlag::Enum</a> flag, <span class="keywordtype">bool</span> value) = 0;</div><div class="line"><a name="l00264"></a><span class="lineno">  264</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setRigidDynamicLockFlags(<a class="code" href="classPxFlags.html">PxRigidDynamicLockFlags</a> <a class="code" href="PxConstraintDesc_8h.html#acfdd81caa30ceb0af5fafb4064b1bc67">flags</a>) = 0;</div><div class="line"><a name="l00265"></a><span class="lineno">  265</span>&#160;    </div><div class="line"><a name="l00266"></a><span class="lineno">  266</span>&#160;</div><div class="line"><a name="l00267"></a><span class="lineno">  267</span>&#160;</div><div class="line"><a name="l00288"></a><span class="lineno">  288</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setWakeCounter(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> wakeCounterValue) = 0;</div><div class="line"><a name="l00289"></a><span class="lineno">  289</span>&#160;</div><div class="line"><a name="l00297"></a><span class="lineno">  297</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>              getWakeCounter() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00298"></a><span class="lineno">  298</span>&#160;</div><div class="line"><a name="l00313"></a><span class="lineno">  313</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                wakeUp() = 0;</div><div class="line"><a name="l00314"></a><span class="lineno">  314</span>&#160;</div><div class="line"><a name="l00329"></a><span class="lineno">  329</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                putToSleep() = 0;</div><div class="line"><a name="l00330"></a><span class="lineno">  330</span>&#160;</div><div class="line"><a name="l00331"></a><span class="lineno">  331</span>&#160;<span class="comment">/************************************************************************************************/</span></div><div class="line"><a name="l00332"></a><span class="lineno">  332</span>&#160;</div><div class="line"><a name="l00351"></a><span class="lineno">  351</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setSolverIterationCounts(<a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> minPositionIters, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> minVelocityIters = 1) = 0;</div><div class="line"><a name="l00352"></a><span class="lineno">  352</span>&#160;</div><div class="line"><a name="l00358"></a><span class="lineno">  358</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                getSolverIterationCounts(<a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>&amp; minPositionIters, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>&amp; minVelocityIters) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00359"></a><span class="lineno">  359</span>&#160;</div><div class="line"><a name="l00379"></a><span class="lineno">  379</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>              getContactReportThreshold() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00380"></a><span class="lineno">  380</span>&#160;</div><div class="line"><a name="l00390"></a><span class="lineno">  390</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setContactReportThreshold(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> threshold) = 0;</div><div class="line"><a name="l00391"></a><span class="lineno">  391</span>&#160;</div><div class="line"><a name="l00392"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527">  392</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keyword">const</span> <span class="keywordtype">char</span>*         <a class="code" href="classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527">getConcreteTypeName</a>()<span class="keyword"> const </span>{ <span class="keywordflow">return</span> <span class="stringliteral">&quot;PxRigidDynamic&quot;</span>; }</div><div class="line"><a name="l00393"></a><span class="lineno">  393</span>&#160;</div><div class="line"><a name="l00394"></a><span class="lineno">  394</span>&#160;<span class="keyword">protected</span>:</div><div class="line"><a name="l00395"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064">  395</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                       <a class="code" href="classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064">PxRigidDynamic</a>(<a class="code" href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a> concreteType, <a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxRigidBody.html">PxRigidBody</a>(concreteType, baseFlags) {}</div><div class="line"><a name="l00396"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef">  396</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                       <a class="code" href="classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef">PxRigidDynamic</a>(<a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxRigidBody.html">PxRigidBody</a>(baseFlags) {}</div><div class="line"><a name="l00397"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#a9380b2f317161026b39c6a296ba9e273">  397</a></span>&#160;    <span class="keyword">virtual</span>                         <a class="code" href="classPxRigidDynamic.html#a9380b2f317161026b39c6a296ba9e273">~PxRigidDynamic</a>() {}</div><div class="line"><a name="l00398"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1">  398</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>                <a class="code" href="classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1">isKindOf</a>(<span class="keyword">const</span> <span class="keywordtype">char</span>* name)<span class="keyword"> const </span>{ <span class="keywordflow">return</span> !::strcmp(<span class="stringliteral">&quot;PxRigidDynamic&quot;</span>, name) || <a class="code" href="classPxRigidBody.html#a126446a6de640cdc08d339911f826270">PxRigidBody::isKindOf</a>(name); }</div><div class="line"><a name="l00399"></a><span class="lineno">  399</span>&#160;</div><div class="line"><a name="l00400"></a><span class="lineno">  400</span>&#160;};</div><div class="line"><a name="l00401"></a><span class="lineno">  401</span>&#160;</div><div class="line"><a name="l00402"></a><span class="lineno">  402</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00403"></a><span class="lineno">  403</span>&#160;} <span class="comment">// namespace physx</span></div><div class="line"><a name="l00404"></a><span class="lineno">  404</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00405"></a><span class="lineno">  405</span>&#160;</div><div class="line"><a name="l00407"></a><span class="lineno">  407</span>&#160;<span class="preprocessor">#endif</span></div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
+<a href="PxRigidDynamic_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  </span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;</div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#ifndef PX_PHYSICS_NX_RIGIDDYNAMIC</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;<span class="preprocessor">#define PX_PHYSICS_NX_RIGIDDYNAMIC</span></div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00037"></a><span class="lineno">   37</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxRigidBody_8h.html">PxRigidBody.h</a>&quot;</span></div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;</div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;{</div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;</div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;</div><div class="line"><a name="l00050"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html">   50</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxRigidDynamicLockFlag.html">PxRigidDynamicLockFlag</a></div><div class="line"><a name="l00051"></a><span class="lineno">   51</span>&#160;{</div><div class="line"><a name="l00052"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1">   52</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1">Enum</a></div><div class="line"><a name="l00053"></a><span class="lineno">   53</span>&#160;    {</div><div class="line"><a name="l00054"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1a3c845939174811074c1829316c942efa">   54</a></span>&#160;        eLOCK_LINEAR_X = (1 &lt;&lt; 0),</div><div class="line"><a name="l00055"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1a1ea9c4f30ecf9f98b6a9057b805fee50">   55</a></span>&#160;        eLOCK_LINEAR_Y = (1 &lt;&lt; 1),</div><div class="line"><a name="l00056"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1a92189af0472e35c788923f75be44396f">   56</a></span>&#160;        eLOCK_LINEAR_Z = (1 &lt;&lt; 2),</div><div class="line"><a name="l00057"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1acd29b90fdd453256db7d95f8d168ffd4">   57</a></span>&#160;        eLOCK_ANGULAR_X = (1 &lt;&lt; 3),</div><div class="line"><a name="l00058"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1aa8df9ce7f49c12ce3f686bd862f86fe6">   58</a></span>&#160;        eLOCK_ANGULAR_Y = (1 &lt;&lt; 4),</div><div class="line"><a name="l00059"></a><span class="lineno"><a class="line" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1ae18e7d6c8b2eb5281e3f8ef8c8502133">   59</a></span>&#160;        eLOCK_ANGULAR_Z = (1 &lt;&lt; 5)</div><div class="line"><a name="l00060"></a><span class="lineno">   60</span>&#160;    };</div><div class="line"><a name="l00061"></a><span class="lineno">   61</span>&#160;};</div><div class="line"><a name="l00062"></a><span class="lineno">   62</span>&#160;</div><div class="line"><a name="l00063"></a><span class="lineno"><a class="line" href="group__physics.html#gab35e15599f740af43f399ed4ad9f04e8">   63</a></span>&#160;<span class="keyword">typedef</span> <a class="code" href="classPxFlags.html">PxFlags&lt;PxRigidDynamicLockFlag::Enum, PxU16&gt;</a> <a class="code" href="group__physics.html#gab35e15599f740af43f399ed4ad9f04e8">PxRigidDynamicLockFlags</a>;</div><div class="line"><a name="l00064"></a><span class="lineno">   64</span>&#160;PX_FLAGS_OPERATORS(<a class="code" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1">PxRigidDynamicLockFlag::Enum</a>, <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>)</div><div class="line"><a name="l00065"></a><span class="lineno">   65</span>&#160;</div><div class="line"><a name="l00066"></a><span class="lineno">   66</span>&#160;</div><div class="line"><a name="l00083"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html">   83</a></span>&#160;class <a class="code" href="classPxRigidDynamic.html">PxRigidDynamic</a> : public <a class="code" href="classPxRigidBody.html">PxRigidBody</a></div><div class="line"><a name="l00084"></a><span class="lineno">   84</span>&#160;{</div><div class="line"><a name="l00085"></a><span class="lineno">   85</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00086"></a><span class="lineno">   86</span>&#160;    <span class="comment">// Runtime modifications</span></div><div class="line"><a name="l00087"></a><span class="lineno">   87</span>&#160;</div><div class="line"><a name="l00088"></a><span class="lineno">   88</span>&#160;</div><div class="line"><a name="l00089"></a><span class="lineno">   89</span>&#160;<span class="comment">/************************************************************************************************/</span></div><div class="line"><a name="l00117"></a><span class="lineno">  117</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setKinematicTarget(<span class="keyword">const</span> <a class="code" href="classPxTransform.html">PxTransform</a>&amp; destination) = 0;</div><div class="line"><a name="l00118"></a><span class="lineno">  118</span>&#160;</div><div class="line"><a name="l00127"></a><span class="lineno">  127</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>                getKinematicTarget(<a class="code" href="classPxTransform.html">PxTransform</a>&amp; target) <span class="keyword">const</span>   = 0;</div><div class="line"><a name="l00128"></a><span class="lineno">  128</span>&#160;</div><div class="line"><a name="l00129"></a><span class="lineno">  129</span>&#160;</div><div class="line"><a name="l00130"></a><span class="lineno">  130</span>&#160;<span class="comment">/************************************************************************************************/</span></div><div class="line"><a name="l00168"></a><span class="lineno">  168</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>                isSleeping() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00169"></a><span class="lineno">  169</span>&#160;</div><div class="line"><a name="l00170"></a><span class="lineno">  170</span>&#160;</div><div class="line"><a name="l00182"></a><span class="lineno">  182</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setSleepThreshold(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> threshold) = 0;</div><div class="line"><a name="l00183"></a><span class="lineno">  183</span>&#160;</div><div class="line"><a name="l00191"></a><span class="lineno">  191</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>              getSleepThreshold() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00192"></a><span class="lineno">  192</span>&#160;</div><div class="line"><a name="l00206"></a><span class="lineno">  206</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setStabilizationThreshold(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> threshold) = 0;</div><div class="line"><a name="l00207"></a><span class="lineno">  207</span>&#160;</div><div class="line"><a name="l00217"></a><span class="lineno">  217</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>              getStabilizationThreshold() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00218"></a><span class="lineno">  218</span>&#160;</div><div class="line"><a name="l00219"></a><span class="lineno">  219</span>&#160;</div><div class="line"><a name="l00229"></a><span class="lineno">  229</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="classPxFlags.html">PxRigidDynamicLockFlags</a> getRigidDynamicLockFlags() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00230"></a><span class="lineno">  230</span>&#160;</div><div class="line"><a name="l00244"></a><span class="lineno">  244</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setRigidDynamicLockFlag(<a class="code" href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1">PxRigidDynamicLockFlag::Enum</a> flag, <span class="keywordtype">bool</span> value) = 0;</div><div class="line"><a name="l00245"></a><span class="lineno">  245</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setRigidDynamicLockFlags(<a class="code" href="classPxFlags.html">PxRigidDynamicLockFlags</a> <a class="code" href="PxConstraintDesc_8h.html#acfdd81caa30ceb0af5fafb4064b1bc67">flags</a>) = 0;</div><div class="line"><a name="l00246"></a><span class="lineno">  246</span>&#160;    </div><div class="line"><a name="l00247"></a><span class="lineno">  247</span>&#160;</div><div class="line"><a name="l00248"></a><span class="lineno">  248</span>&#160;</div><div class="line"><a name="l00269"></a><span class="lineno">  269</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setWakeCounter(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> wakeCounterValue) = 0;</div><div class="line"><a name="l00270"></a><span class="lineno">  270</span>&#160;</div><div class="line"><a name="l00278"></a><span class="lineno">  278</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>              getWakeCounter() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00279"></a><span class="lineno">  279</span>&#160;</div><div class="line"><a name="l00294"></a><span class="lineno">  294</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                wakeUp() = 0;</div><div class="line"><a name="l00295"></a><span class="lineno">  295</span>&#160;</div><div class="line"><a name="l00310"></a><span class="lineno">  310</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                putToSleep() = 0;</div><div class="line"><a name="l00311"></a><span class="lineno">  311</span>&#160;</div><div class="line"><a name="l00312"></a><span class="lineno">  312</span>&#160;<span class="comment">/************************************************************************************************/</span></div><div class="line"><a name="l00313"></a><span class="lineno">  313</span>&#160;</div><div class="line"><a name="l00332"></a><span class="lineno">  332</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setSolverIterationCounts(<a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> minPositionIters, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> minVelocityIters = 1) = 0;</div><div class="line"><a name="l00333"></a><span class="lineno">  333</span>&#160;</div><div class="line"><a name="l00339"></a><span class="lineno">  339</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                getSolverIterationCounts(<a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>&amp; minPositionIters, <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>&amp; minVelocityIters) <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00340"></a><span class="lineno">  340</span>&#160;</div><div class="line"><a name="l00360"></a><span class="lineno">  360</span>&#160;    <span class="keyword">virtual</span>     <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>              getContactReportThreshold() <span class="keyword">const</span> = 0;</div><div class="line"><a name="l00361"></a><span class="lineno">  361</span>&#160;</div><div class="line"><a name="l00371"></a><span class="lineno">  371</span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">void</span>                setContactReportThreshold(<a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> threshold) = 0;</div><div class="line"><a name="l00372"></a><span class="lineno">  372</span>&#160;</div><div class="line"><a name="l00373"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527">  373</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keyword">const</span> <span class="keywordtype">char</span>*         <a class="code" href="classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527">getConcreteTypeName</a>()<span class="keyword"> const </span>{ <span class="keywordflow">return</span> <span class="stringliteral">&quot;PxRigidDynamic&quot;</span>; }</div><div class="line"><a name="l00374"></a><span class="lineno">  374</span>&#160;</div><div class="line"><a name="l00375"></a><span class="lineno">  375</span>&#160;<span class="keyword">protected</span>:</div><div class="line"><a name="l00376"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064">  376</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                       <a class="code" href="classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064">PxRigidDynamic</a>(<a class="code" href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a> concreteType, <a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxRigidBody.html">PxRigidBody</a>(concreteType, baseFlags) {}</div><div class="line"><a name="l00377"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef">  377</a></span>&#160;    <a class="code" href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a>                       <a class="code" href="classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef">PxRigidDynamic</a>(<a class="code" href="classPxFlags.html">PxBaseFlags</a> baseFlags) : <a class="code" href="classPxRigidBody.html">PxRigidBody</a>(baseFlags) {}</div><div class="line"><a name="l00378"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#a9380b2f317161026b39c6a296ba9e273">  378</a></span>&#160;    <span class="keyword">virtual</span>                         <a class="code" href="classPxRigidDynamic.html#a9380b2f317161026b39c6a296ba9e273">~PxRigidDynamic</a>() {}</div><div class="line"><a name="l00379"></a><span class="lineno"><a class="line" href="classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1">  379</a></span>&#160;    <span class="keyword">virtual</span>     <span class="keywordtype">bool</span>                <a class="code" href="classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1">isKindOf</a>(<span class="keyword">const</span> <span class="keywordtype">char</span>* name)<span class="keyword"> const </span>{ <span class="keywordflow">return</span> !::strcmp(<span class="stringliteral">&quot;PxRigidDynamic&quot;</span>, name) || <a class="code" href="classPxRigidBody.html#a126446a6de640cdc08d339911f826270">PxRigidBody::isKindOf</a>(name); }</div><div class="line"><a name="l00380"></a><span class="lineno">  380</span>&#160;</div><div class="line"><a name="l00381"></a><span class="lineno">  381</span>&#160;};</div><div class="line"><a name="l00382"></a><span class="lineno">  382</span>&#160;</div><div class="line"><a name="l00383"></a><span class="lineno">  383</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00384"></a><span class="lineno">  384</span>&#160;} <span class="comment">// namespace physx</span></div><div class="line"><a name="l00385"></a><span class="lineno">  385</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00386"></a><span class="lineno">  386</span>&#160;</div><div class="line"><a name="l00388"></a><span class="lineno">  388</span>&#160;<span class="preprocessor">#endif</span></div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
 <div class="ttc" id="classPxRigidDynamic_html"><div class="ttname"><a href="classPxRigidDynamic.html">PxRigidDynamic</a></div><div class="ttdoc">PxRigidDynamic represents a dynamic rigid simulation object in the physics SDK. </div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:83</div></div>
 <div class="ttc" id="structPxRigidDynamicLockFlag_html"><div class="ttname"><a href="structPxRigidDynamicLockFlag.html">PxRigidDynamicLockFlag</a></div><div class="ttdoc">Collection of flags providing a mechanism to lock motion along/around a specific axis. </div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:50</div></div>
 <div class="ttc" id="namespacephysx_html_a727d2d8426e2a21ebbc522fa65c3f97a"><div class="ttname"><a href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">physx::PxReal</a></div><div class="ttdeci">float PxReal</div><div class="ttdef"><b>Definition:</b> PxSimpleTypes.h:78</div></div>
-<div class="ttc" id="classPxRigidDynamic_html_a9e63c1ec6872048e16f7d6b744ec3eef"><div class="ttname"><a href="classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef">PxRigidDynamic::PxRigidDynamic</a></div><div class="ttdeci">PX_INLINE PxRigidDynamic(PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:396</div></div>
+<div class="ttc" id="classPxRigidDynamic_html_a9e63c1ec6872048e16f7d6b744ec3eef"><div class="ttname"><a href="classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef">PxRigidDynamic::PxRigidDynamic</a></div><div class="ttdeci">PX_INLINE PxRigidDynamic(PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:377</div></div>
 <div class="ttc" id="structPxRigidDynamicLockFlag_html_ae2e527a7cf32504d4b5c8c6d147280e1"><div class="ttname"><a href="structPxRigidDynamicLockFlag.html#ae2e527a7cf32504d4b5c8c6d147280e1">PxRigidDynamicLockFlag::Enum</a></div><div class="ttdeci">Enum</div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:52</div></div>
-<div class="ttc" id="classPxRigidDynamic_html_a2e1d23ebdce36455824949be867128d1"><div class="ttname"><a href="classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1">PxRigidDynamic::isKindOf</a></div><div class="ttdeci">virtual bool isKindOf(const char *name) const</div><div class="ttdoc">Returns whether a given type name matches with the type of this instance. </div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:398</div></div>
+<div class="ttc" id="classPxRigidDynamic_html_a2e1d23ebdce36455824949be867128d1"><div class="ttname"><a href="classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1">PxRigidDynamic::isKindOf</a></div><div class="ttdeci">virtual bool isKindOf(const char *name) const</div><div class="ttdoc">Returns whether a given type name matches with the type of this instance. </div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:379</div></div>
 <div class="ttc" id="PxRigidBody_8h_html"><div class="ttname"><a href="PxRigidBody_8h.html">PxRigidBody.h</a></div></div>
 <div class="ttc" id="group__common_html_gac1fb4b256a5d900d394e89db170a2b79"><div class="ttname"><a href="group__common.html#gac1fb4b256a5d900d394e89db170a2b79">PxType</a></div><div class="ttdeci">PxU16 PxType</div><div class="ttdef"><b>Definition:</b> PxBase.h:49</div></div>
 <div class="ttc" id="PxConstraintDesc_8h_html_acfdd81caa30ceb0af5fafb4064b1bc67"><div class="ttname"><a href="PxConstraintDesc_8h.html#acfdd81caa30ceb0af5fafb4064b1bc67">flags</a></div><div class="ttdeci">PxU16 flags</div><div class="ttdoc">a set of Px1DConstraintFlags </div><div class="ttdef"><b>Definition:</b> PxConstraintDesc.h:110</div></div>
 <div class="ttc" id="classPxTransform_html"><div class="ttname"><a href="classPxTransform.html">PxTransform</a></div><div class="ttdoc">class representing a rigid euclidean transform as a quaternion and a vector </div><div class="ttdef"><b>Definition:</b> PxTransform.h:48</div></div>
-<div class="ttc" id="classPxRigidDynamic_html_a43150714d9e16cd056149bad325f0527"><div class="ttname"><a href="classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527">PxRigidDynamic::getConcreteTypeName</a></div><div class="ttdeci">virtual const char * getConcreteTypeName() const</div><div class="ttdoc">Returns string name of dynamic type. </div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:392</div></div>
+<div class="ttc" id="classPxRigidDynamic_html_a43150714d9e16cd056149bad325f0527"><div class="ttname"><a href="classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527">PxRigidDynamic::getConcreteTypeName</a></div><div class="ttdeci">virtual const char * getConcreteTypeName() const</div><div class="ttdoc">Returns string name of dynamic type. </div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:373</div></div>
 <div class="ttc" id="group__physics_html_gab35e15599f740af43f399ed4ad9f04e8"><div class="ttname"><a href="group__physics.html#gab35e15599f740af43f399ed4ad9f04e8">PxRigidDynamicLockFlags</a></div><div class="ttdeci">PxFlags&lt; PxRigidDynamicLockFlag::Enum, PxU16 &gt; PxRigidDynamicLockFlags</div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:63</div></div>
 <div class="ttc" id="classPxFlags_html"><div class="ttname"><a href="classPxFlags.html">PxFlags</a></div><div class="ttdoc">Container for bitfield flag variables associated with a specific enum type. </div><div class="ttdef"><b>Definition:</b> PxFlags.h:73</div></div>
 <div class="ttc" id="classPxRigidBody_html"><div class="ttname"><a href="classPxRigidBody.html">PxRigidBody</a></div><div class="ttdoc">PxRigidBody is a base class shared between dynamic rigid body objects. </div><div class="ttdef"><b>Definition:</b> PxRigidBody.h:157</div></div>
 <div class="ttc" id="classPxRigidBody_html_a126446a6de640cdc08d339911f826270"><div class="ttname"><a href="classPxRigidBody.html#a126446a6de640cdc08d339911f826270">PxRigidBody::isKindOf</a></div><div class="ttdeci">virtual bool isKindOf(const char *name) const</div><div class="ttdoc">Returns whether a given type name matches with the type of this instance. </div><div class="ttdef"><b>Definition:</b> PxRigidBody.h:692</div></div>
 <div class="ttc" id="namespacephysx_html_a9caf1cbcda071b6d2a9c069faa99da23"><div class="ttname"><a href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">physx::PxU16</a></div><div class="ttdeci">uint16_t PxU16</div><div class="ttdef"><b>Definition:</b> PxSimpleTypes.h:73</div></div>
 <div class="ttc" id="group__foundation_html_gacce5749db3dcfb916e98c253374264ed"><div class="ttname"><a href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a></div><div class="ttdeci">uint32_t PxU32</div><div class="ttdef"><b>Definition:</b> Px.h:48</div></div>
-<div class="ttc" id="classPxRigidDynamic_html_ab21ed0e4e1be20fc975fcd3ceeb4e064"><div class="ttname"><a href="classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064">PxRigidDynamic::PxRigidDynamic</a></div><div class="ttdeci">PX_INLINE PxRigidDynamic(PxType concreteType, PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:395</div></div>
+<div class="ttc" id="classPxRigidDynamic_html_ab21ed0e4e1be20fc975fcd3ceeb4e064"><div class="ttname"><a href="classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064">PxRigidDynamic::PxRigidDynamic</a></div><div class="ttdeci">PX_INLINE PxRigidDynamic(PxType concreteType, PxBaseFlags baseFlags)</div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:376</div></div>
 <div class="ttc" id="group__foundation_html_gacb03347b642a2a5bdea1f9b305a6fbec"><div class="ttname"><a href="group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec">PX_INLINE</a></div><div class="ttdeci">#define PX_INLINE</div><div class="ttdef"><b>Definition:</b> PxPreprocessor.h:349</div></div>
-<div class="ttc" id="classPxRigidDynamic_html_a9380b2f317161026b39c6a296ba9e273"><div class="ttname"><a href="classPxRigidDynamic.html#a9380b2f317161026b39c6a296ba9e273">PxRigidDynamic::~PxRigidDynamic</a></div><div class="ttdeci">virtual ~PxRigidDynamic()</div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:397</div></div>
-<div class="ttc" id="classPxVec3_html"><div class="ttname"><a href="classPxVec3.html">PxVec3</a></div><div class="ttdoc">3 Element vector class. </div><div class="ttdef"><b>Definition:</b> PxVec3.h:49</div></div>
+<div class="ttc" id="classPxRigidDynamic_html_a9380b2f317161026b39c6a296ba9e273"><div class="ttname"><a href="classPxRigidDynamic.html#a9380b2f317161026b39c6a296ba9e273">PxRigidDynamic::~PxRigidDynamic</a></div><div class="ttdeci">virtual ~PxRigidDynamic()</div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:378</div></div>
 </div><!-- fragment --></div><!-- contents -->
 </div><!-- doc-content -->
 <!-- HTML footer for doxygen 1.8.14-->
diff --git a/physx/documentation/PhysXAPI/files/PxSolverDefs_8h_source.html b/physx/documentation/PhysXAPI/files/PxSolverDefs_8h_source.html
index 2abedaff9..71efdd506 100644
--- a/physx/documentation/PhysXAPI/files/PxSolverDefs_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxSolverDefs_8h_source.html
@@ -86,8 +86,9 @@
 <div class="title">PxSolverDefs.h</div>  </div>
 </div><!--header-->
 <div class="contents">
-<a href="PxSolverDefs_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.</span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;<span class="preprocessor">#ifndef PX_SOLVER_DEFS_H</span></div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#define PX_SOLVER_DEFS_H</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;</div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00034"></a><span class="lineno">   34</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxPhysXConfig_8h.html">PxPhysXConfig.h</a>&quot;</span></div><div class="line"><a name="l00035"></a><span class="lineno">   35</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxVec3_8h.html">foundation/PxVec3.h</a>&quot;</span></div><div class="line"><a name="l00036"></a><span class="lineno">   36</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxMat33_8h.html">foundation/PxMat33.h</a>&quot;</span></div><div class="line"><a name="l00037"></a><span class="lineno">   37</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxTransform_8h.html">foundation/PxTransform.h</a>&quot;</span></div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxConstraintDesc_8h.html">PxConstraintDesc.h</a>&quot;</span></div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="GuContactPoint_8h.html">geomutils/GuContactPoint.h</a>&quot;</span></div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;</div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;<span class="preprocessor">#if PX_VC</span></div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;<span class="preprocessor">#pragma warning(push)</span></div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;<span class="preprocessor">#pragma warning(disable : 4324) // structure was padded due to alignment</span></div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00045"></a><span class="lineno">   45</span>&#160;</div><div class="line"><a name="l00046"></a><span class="lineno">   46</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00047"></a><span class="lineno">   47</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00048"></a><span class="lineno">   48</span>&#160;{</div><div class="line"><a name="l00049"></a><span class="lineno">   49</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00050"></a><span class="lineno">   50</span>&#160;</div><div class="line"><a name="l00051"></a><span class="lineno"><a class="line" href="namespaceDy.html">   51</a></span>&#160;<span class="keyword">namespace </span><a class="code" href="namespaceDy.html">Dy</a></div><div class="line"><a name="l00052"></a><span class="lineno">   52</span>&#160;{</div><div class="line"><a name="l00053"></a><span class="lineno">   53</span>&#160;    <span class="comment">//struct FsData;</span></div><div class="line"><a name="l00054"></a><span class="lineno">   54</span>&#160;    <span class="keyword">class </span>ArticulationV;</div><div class="line"><a name="l00055"></a><span class="lineno">   55</span>&#160;}</div><div class="line"><a name="l00056"></a><span class="lineno">   56</span>&#160;</div><div class="line"><a name="l00057"></a><span class="lineno"><a class="line" href="namespaceSc.html">   57</a></span>&#160;<span class="keyword">namespace </span><a class="code" href="namespaceSc.html">Sc</a></div><div class="line"><a name="l00058"></a><span class="lineno">   58</span>&#160;{</div><div class="line"><a name="l00059"></a><span class="lineno">   59</span>&#160;    <span class="keyword">class </span>ShapeInteraction;</div><div class="line"><a name="l00060"></a><span class="lineno">   60</span>&#160;}</div><div class="line"><a name="l00061"></a><span class="lineno">   61</span>&#160;</div><div class="line"><a name="l00062"></a><span class="lineno"><a class="line" href="structPxSolverBody.html">   62</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverBody.html">PxSolverBody</a></div><div class="line"><a name="l00063"></a><span class="lineno">   63</span>&#160;{</div><div class="line"><a name="l00064"></a><span class="lineno">   64</span>&#160;    <a class="code" href="group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a">PX_ALIGN</a>(16, <a class="code" href="classPxVec3.html">PxVec3</a>) linearVelocity;                    </div><div class="line"><a name="l00065"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#a94d862d029f4dba6de658d2b84b6affc">   65</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverBody.html#a94d862d029f4dba6de658d2b84b6affc">maxSolverNormalProgress</a>;        </div><div class="line"><a name="l00066"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#a2e3b0fc4b9097cb2ded8aec5120299ec">   66</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverBody.html#a2e3b0fc4b9097cb2ded8aec5120299ec">maxSolverFrictionProgress</a>;      </div><div class="line"><a name="l00067"></a><span class="lineno">   67</span>&#160;</div><div class="line"><a name="l00068"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#ab9c0a75f4b85920be4852c53c4ed2799">   68</a></span>&#160;    <a class="code" href="classPxVec3.html">PxVec3</a>              <a class="code" href="structPxSolverBody.html#ab9c0a75f4b85920be4852c53c4ed2799">angularState</a>;                   </div><div class="line"><a name="l00069"></a><span class="lineno">   69</span>&#160;</div><div class="line"><a name="l00070"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#abe151efeabf33d524e5589fc9897fbd7">   70</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>               <a class="code" href="structPxSolverBody.html#abe151efeabf33d524e5589fc9897fbd7">solverProgress</a>;                 </div><div class="line"><a name="l00071"></a><span class="lineno">   71</span>&#160;</div><div class="line"><a name="l00072"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#afa5845fa664df156d2a25300ea00593e">   72</a></span>&#160;    <a class="code" href="structPxSolverBody.html#afa5845fa664df156d2a25300ea00593e">PxSolverBody</a>() : linearVelocity(0.f), maxSolverNormalProgress(0), maxSolverFrictionProgress(0), angularState(0), solverProgress(0)</div><div class="line"><a name="l00073"></a><span class="lineno">   73</span>&#160;    {</div><div class="line"><a name="l00074"></a><span class="lineno">   74</span>&#160;    }</div><div class="line"><a name="l00075"></a><span class="lineno">   75</span>&#160;};</div><div class="line"><a name="l00076"></a><span class="lineno">   76</span>&#160;</div><div class="line"><a name="l00077"></a><span class="lineno">   77</span>&#160;<a class="code" href="PxSolverDefs_8h.html#a0b777f57ee7373fc869734591a7d1ba5">PX_COMPILE_TIME_ASSERT</a>(<span class="keyword">sizeof</span>(<a class="code" href="structPxSolverBody.html">PxSolverBody</a>) == 32);</div><div class="line"><a name="l00078"></a><span class="lineno">   78</span>&#160;</div><div class="line"><a name="l00079"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html">   79</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverBodyData.html">PxSolverBodyData</a></div><div class="line"><a name="l00080"></a><span class="lineno">   80</span>&#160;{</div><div class="line"><a name="l00081"></a><span class="lineno">   81</span>&#160;    <a class="code" href="group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a">PX_ALIGN</a>(16, <a class="code" href="classPxVec3.html">PxVec3</a> linearVelocity);    </div><div class="line"><a name="l00082"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a668762cdcd8004c19e033952be2206b6">   82</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          <a class="code" href="structPxSolverBodyData.html#a668762cdcd8004c19e033952be2206b6">invMass</a>;                </div><div class="line"><a name="l00083"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a2ab7bc060f7946ae4f3bb1da17018326">   83</a></span>&#160;    <a class="code" href="classPxVec3.html">PxVec3</a>          <a class="code" href="structPxSolverBodyData.html#a2ab7bc060f7946ae4f3bb1da17018326">angularVelocity</a>;        </div><div class="line"><a name="l00084"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a74e634af20acbf0a37246a5ec44f2248">   84</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          <a class="code" href="structPxSolverBodyData.html#a74e634af20acbf0a37246a5ec44f2248">reportThreshold</a>;        </div><div class="line"><a name="l00085"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#aeaab06a3dfac94c9ccde5e40f966f8af">   85</a></span>&#160;    <a class="code" href="classPxMat33.html">PxMat33</a>         <a class="code" href="structPxSolverBodyData.html#aeaab06a3dfac94c9ccde5e40f966f8af">sqrtInvInertia</a>;         </div><div class="line"><a name="l00086"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#ade2d671c9125af9907938a425f6b9af2">   86</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          <a class="code" href="structPxSolverBodyData.html#ade2d671c9125af9907938a425f6b9af2">penBiasClamp</a>;           </div><div class="line"><a name="l00087"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#ac01024b5307ea57e79c7891d89906296">   87</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>           <a class="code" href="structPxSolverBodyData.html#ac01024b5307ea57e79c7891d89906296">nodeIndex</a>;              </div><div class="line"><a name="l00088"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#ac974d6181623739bdc087b02c1dd311b">   88</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          <a class="code" href="structPxSolverBodyData.html#ac974d6181623739bdc087b02c1dd311b">maxContactImpulse</a>;      </div><div class="line"><a name="l00089"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a1d32e78a191473936a6693b40185ad91">   89</a></span>&#160;    <a class="code" href="classPxTransform.html">PxTransform</a>     <a class="code" href="structPxSolverBodyData.html#a1d32e78a191473936a6693b40185ad91">body2World</a>;             </div><div class="line"><a name="l00090"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415">   90</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>           <a class="code" href="structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415">lockFlags</a>;              </div><div class="line"><a name="l00091"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f">   91</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>           <a class="code" href="structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f">hasSurfaceVelocity</a>;     </div><div class="line"><a name="l00092"></a><span class="lineno">   92</span>&#160;</div><div class="line"><a name="l00093"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#aa443999488f683690658da520f1726c9">   93</a></span>&#160;    <a class="code" href="group__foundation.html#ga6a774eed3cad34b0f636332a3d28c6bb">PX_FORCE_INLINE</a> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverBodyData.html#aa443999488f683690658da520f1726c9">projectVelocity</a>(<span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; lin, <span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; ang)<span class="keyword">    const</span></div><div class="line"><a name="l00094"></a><span class="lineno">   94</span>&#160;<span class="keyword">    </span>{</div><div class="line"><a name="l00095"></a><span class="lineno">   95</span>&#160;        <span class="keywordflow">return</span> linearVelocity.<a class="code" href="classPxVec3.html#a4c136327953ea1b09be10d979279b7dc">dot</a>(lin) + angularVelocity.<a class="code" href="classPxVec3.html#a4c136327953ea1b09be10d979279b7dc">dot</a>(ang);</div><div class="line"><a name="l00096"></a><span class="lineno">   96</span>&#160;    }</div><div class="line"><a name="l00097"></a><span class="lineno">   97</span>&#160;};</div><div class="line"><a name="l00098"></a><span class="lineno">   98</span>&#160;</div><div class="line"><a name="l00099"></a><span class="lineno">   99</span>&#160;<span class="comment">//----------------------------------</span></div><div class="line"><a name="l00100"></a><span class="lineno">  100</span>&#160;<span class="comment">/*</span></div><div class="line"><a name="l00101"></a><span class="lineno">  101</span>&#160;<span class="comment">* A header that defines the size of a specific batch of constraints (of same type and without dependencies)</span></div><div class="line"><a name="l00102"></a><span class="lineno">  102</span>&#160;<span class="comment">*/</span></div><div class="line"><a name="l00103"></a><span class="lineno"><a class="line" href="structPxConstraintBatchHeader.html">  103</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxConstraintBatchHeader.html">PxConstraintBatchHeader</a></div><div class="line"><a name="l00104"></a><span class="lineno">  104</span>&#160;{</div><div class="line"><a name="l00105"></a><span class="lineno"><a class="line" href="structPxConstraintBatchHeader.html#a4b7fe2ed800f2340292ac8a6229c011d">  105</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxConstraintBatchHeader.html#a4b7fe2ed800f2340292ac8a6229c011d">mStartIndex</a>;          </div><div class="line"><a name="l00106"></a><span class="lineno"><a class="line" href="structPxConstraintBatchHeader.html#ad8e475353cb5ee9d05cc5fcb12b2d25e">  106</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> <a class="code" href="structPxConstraintBatchHeader.html#ad8e475353cb5ee9d05cc5fcb12b2d25e">mStride</a>;              </div><div class="line"><a name="l00107"></a><span class="lineno"><a class="line" href="structPxConstraintBatchHeader.html#af4cb46c92e9051700b4accde28b5c01a">  107</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> <a class="code" href="structPxConstraintBatchHeader.html#af4cb46c92e9051700b4accde28b5c01a">mConstraintType</a>;      </div><div class="line"><a name="l00108"></a><span class="lineno">  108</span>&#160;};</div><div class="line"><a name="l00109"></a><span class="lineno">  109</span>&#160;</div><div class="line"><a name="l00110"></a><span class="lineno">  110</span>&#160;</div><div class="line"><a name="l00111"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html">  111</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverConstraintDesc.html">PxSolverConstraintDesc</a></div><div class="line"><a name="l00112"></a><span class="lineno">  112</span>&#160;{</div><div class="line"><a name="l00113"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a87d502cd319544604669cb843a239fb0">  113</a></span>&#160;    <span class="keyword">static</span> <span class="keyword">const</span> <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> NO_LINK = 0xffff;</div><div class="line"><a name="l00114"></a><span class="lineno">  114</span>&#160;</div><div class="line"><a name="l00115"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150">  115</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150">ConstraintType</a></div><div class="line"><a name="l00116"></a><span class="lineno">  116</span>&#160;    {</div><div class="line"><a name="l00117"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150af07aa2676bc3a0f445124a369d76178d">  117</a></span>&#160;        <a class="code" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150af07aa2676bc3a0f445124a369d76178d">eCONTACT_CONSTRAINT</a>,                </div><div class="line"><a name="l00118"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150a964367059d7c0c8b31ee93cd614a1aac">  118</a></span>&#160;        eJOINT_CONSTRAINT                   </div><div class="line"><a name="l00119"></a><span class="lineno">  119</span>&#160;    };</div><div class="line"><a name="l00120"></a><span class="lineno">  120</span>&#160;</div><div class="line"><a name="l00121"></a><span class="lineno">  121</span>&#160;    <span class="keyword">union</span></div><div class="line"><a name="l00122"></a><span class="lineno">  122</span>&#160;    {</div><div class="line"><a name="l00123"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a917f6a5b6829c4c6f8f44182628e4e2e">  123</a></span>&#160;        <a class="code" href="structPxSolverBody.html">PxSolverBody</a>*               <a class="code" href="structPxSolverConstraintDesc.html#a917f6a5b6829c4c6f8f44182628e4e2e">bodyA</a>;          </div><div class="line"><a name="l00124"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a591b8d338b0e9932d67f405542776065">  124</a></span>&#160;        Dy::ArticulationV*          <a class="code" href="structPxSolverConstraintDesc.html#a591b8d338b0e9932d67f405542776065">articulationA</a>;  </div><div class="line"><a name="l00125"></a><span class="lineno">  125</span>&#160;    };</div><div class="line"><a name="l00126"></a><span class="lineno">  126</span>&#160;</div><div class="line"><a name="l00127"></a><span class="lineno">  127</span>&#160;    <span class="keyword">union</span></div><div class="line"><a name="l00128"></a><span class="lineno">  128</span>&#160;    {</div><div class="line"><a name="l00129"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a6144adf5b9396847a54da7d7c17a770f">  129</a></span>&#160;        <a class="code" href="structPxSolverBody.html">PxSolverBody</a>*               <a class="code" href="structPxSolverConstraintDesc.html#a6144adf5b9396847a54da7d7c17a770f">bodyB</a>;          </div><div class="line"><a name="l00130"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a1452b23eaf124e9740f26e4bb98477ca">  130</a></span>&#160;        Dy::ArticulationV*          <a class="code" href="structPxSolverConstraintDesc.html#a1452b23eaf124e9740f26e4bb98477ca">articulationB</a>;  </div><div class="line"><a name="l00131"></a><span class="lineno">  131</span>&#160;    };</div><div class="line"><a name="l00132"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#ab7d352db61fd0a031d9574a4322e629d">  132</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverConstraintDesc.html#ab7d352db61fd0a031d9574a4322e629d">linkIndexA</a>;         </div><div class="line"><a name="l00133"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a833220f6cf3fef57bee54a4cfaa81049">  133</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverConstraintDesc.html#a833220f6cf3fef57bee54a4cfaa81049">linkIndexB</a>;         </div><div class="line"><a name="l00134"></a><span class="lineno">  134</span>&#160;    <span class="keyword">union</span></div><div class="line"><a name="l00135"></a><span class="lineno">  135</span>&#160;    {</div><div class="line"><a name="l00136"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#abada1c0d78bba24998736ead4bd808fb">  136</a></span>&#160;        <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverConstraintDesc.html#abada1c0d78bba24998736ead4bd808fb">articulationALength</a>;    </div><div class="line"><a name="l00137"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a75d8cc26a512bd1e1a1e1a965decfa83">  137</a></span>&#160;        <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>               <a class="code" href="structPxSolverConstraintDesc.html#a75d8cc26a512bd1e1a1e1a965decfa83">bodyADataIndex</a>;         </div><div class="line"><a name="l00138"></a><span class="lineno">  138</span>&#160;    };</div><div class="line"><a name="l00139"></a><span class="lineno">  139</span>&#160;</div><div class="line"><a name="l00140"></a><span class="lineno">  140</span>&#160;    <span class="keyword">union</span></div><div class="line"><a name="l00141"></a><span class="lineno">  141</span>&#160;    {</div><div class="line"><a name="l00142"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a911f59c5a7b2dcf4ed5991ec1b6d7153">  142</a></span>&#160;        <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverConstraintDesc.html#a911f59c5a7b2dcf4ed5991ec1b6d7153">articulationBLength</a>;    </div><div class="line"><a name="l00143"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a0e7c10b35dcc40b914857957fa64beb2">  143</a></span>&#160;        <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>               <a class="code" href="structPxSolverConstraintDesc.html#a0e7c10b35dcc40b914857957fa64beb2">bodyBDataIndex</a>;         </div><div class="line"><a name="l00144"></a><span class="lineno">  144</span>&#160;    };</div><div class="line"><a name="l00145"></a><span class="lineno">  145</span>&#160;</div><div class="line"><a name="l00146"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#aaa77283c5efaa8f2b11611881729e8b8">  146</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>                   <a class="code" href="structPxSolverConstraintDesc.html#aaa77283c5efaa8f2b11611881729e8b8">writeBackLengthOver4</a>;   </div><div class="line"><a name="l00147"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a417fed9862da2ced60f98353fea74ace">  147</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>                   <a class="code" href="structPxSolverConstraintDesc.html#a417fed9862da2ced60f98353fea74ace">constraintLengthOver16</a>; </div><div class="line"><a name="l00148"></a><span class="lineno">  148</span>&#160;</div><div class="line"><a name="l00149"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a7a577215addd09b9e092aae536157161">  149</a></span>&#160;    <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>*                   <a class="code" href="structPxSolverConstraintDesc.html#a7a577215addd09b9e092aae536157161">constraint</a>;             </div><div class="line"><a name="l00150"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#ae9b299864d3b98e18fcfc6a089bb7f79">  150</a></span>&#160;    <span class="keywordtype">void</span>*                   <a class="code" href="structPxSolverConstraintDesc.html#ae9b299864d3b98e18fcfc6a089bb7f79">writeBack</a>;              </div><div class="line"><a name="l00151"></a><span class="lineno">  151</span>&#160;};</div><div class="line"><a name="l00152"></a><span class="lineno">  152</span>&#160;</div><div class="line"><a name="l00153"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html">  153</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverConstraintPrepDescBase.html">PxSolverConstraintPrepDescBase</a></div><div class="line"><a name="l00154"></a><span class="lineno">  154</span>&#160;{</div><div class="line"><a name="l00155"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4f">  155</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4f">BodyState</a></div><div class="line"><a name="l00156"></a><span class="lineno">  156</span>&#160;    {</div><div class="line"><a name="l00157"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4fa6f030c0742671e6fe38c87ff48eba4ca">  157</a></span>&#160;        eDYNAMIC_BODY = 1 &lt;&lt; 0,</div><div class="line"><a name="l00158"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4fa558ab36462aa417f686f03eabf3db6b3">  158</a></span>&#160;        eSTATIC_BODY = 1 &lt;&lt; 1,</div><div class="line"><a name="l00159"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4fa60a5b498ccb90a2f703fb277b7784807">  159</a></span>&#160;        eKINEMATIC_BODY = 1 &lt;&lt; 2,</div><div class="line"><a name="l00160"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4fa55d8f5d447451c19b05290583597ce1c">  160</a></span>&#160;        eARTICULATION = 1 &lt;&lt; 3</div><div class="line"><a name="l00161"></a><span class="lineno">  161</span>&#160;    };</div><div class="line"><a name="l00162"></a><span class="lineno">  162</span>&#160;</div><div class="line"><a name="l00163"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a3ad8129bcfce0a8891ed853a3896095d">  163</a></span>&#160;    <a class="code" href="structPxConstraintInvMassScale.html">PxConstraintInvMassScale</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a3ad8129bcfce0a8891ed853a3896095d">mInvMassScales</a>;    </div><div class="line"><a name="l00164"></a><span class="lineno">  164</span>&#160;</div><div class="line"><a name="l00165"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#ae6a7c6dbbf6ade521ca9f485fa70ecab">  165</a></span>&#160;    <a class="code" href="structPxSolverConstraintDesc.html">PxSolverConstraintDesc</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#ae6a7c6dbbf6ade521ca9f485fa70ecab">desc</a>;               </div><div class="line"><a name="l00166"></a><span class="lineno">  166</span>&#160;</div><div class="line"><a name="l00167"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a58839e53ce16cf2d9ec3a54ead955870">  167</a></span>&#160;    <span class="keyword">const</span> <a class="code" href="structPxSolverBody.html">PxSolverBody</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#a58839e53ce16cf2d9ec3a54ead955870">body0</a>;                  </div><div class="line"><a name="l00168"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a6cc18f29a8da2a1e8d6af3e9fdce942c">  168</a></span>&#160;    <span class="keyword">const</span> <a class="code" href="structPxSolverBody.html">PxSolverBody</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#a6cc18f29a8da2a1e8d6af3e9fdce942c">body1</a>;                  </div><div class="line"><a name="l00169"></a><span class="lineno">  169</span>&#160;</div><div class="line"><a name="l00170"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#ad58b181d4ac418941bddfd3ce8065a10">  170</a></span>&#160;    <span class="keyword">const</span> <a class="code" href="structPxSolverBodyData.html">PxSolverBodyData</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#ad58b181d4ac418941bddfd3ce8065a10">data0</a>;              </div><div class="line"><a name="l00171"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#aeeffe63a1a4b9f7b3a822b0f2d99a62d">  171</a></span>&#160;    <span class="keyword">const</span> <a class="code" href="structPxSolverBodyData.html">PxSolverBodyData</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#aeeffe63a1a4b9f7b3a822b0f2d99a62d">data1</a>;              </div><div class="line"><a name="l00172"></a><span class="lineno">  172</span>&#160;</div><div class="line"><a name="l00173"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a251d9872155e952dcdd7dec21b9471a3">  173</a></span>&#160;    <a class="code" href="classPxTransform.html">PxTransform</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a251d9872155e952dcdd7dec21b9471a3">bodyFrame0</a>;                     </div><div class="line"><a name="l00174"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a043bdbd0966b02478725edcb027a9819">  174</a></span>&#160;    <a class="code" href="classPxTransform.html">PxTransform</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a043bdbd0966b02478725edcb027a9819">bodyFrame1</a>;                     </div><div class="line"><a name="l00175"></a><span class="lineno">  175</span>&#160;</div><div class="line"><a name="l00176"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a4173f8c3d6662232720fdd99c9f65b9e">  176</a></span>&#160;    <a class="code" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4f">BodyState</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a4173f8c3d6662232720fdd99c9f65b9e">bodyState0</a>;                       </div><div class="line"><a name="l00177"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a6f2321b7e28076c3b14e30c0c9f9bdf3">  177</a></span>&#160;    <a class="code" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4f">BodyState</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a6f2321b7e28076c3b14e30c0c9f9bdf3">bodyState1</a>;                       </div><div class="line"><a name="l00178"></a><span class="lineno">  178</span>&#160;</div><div class="line"><a name="l00179"></a><span class="lineno">  179</span>&#160;};</div><div class="line"><a name="l00180"></a><span class="lineno">  180</span>&#160;</div><div class="line"><a name="l00181"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html">  181</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverConstraintPrepDesc.html">PxSolverConstraintPrepDesc</a> : <span class="keyword">public</span> <a class="code" href="structPxSolverConstraintPrepDescBase.html">PxSolverConstraintPrepDescBase</a></div><div class="line"><a name="l00182"></a><span class="lineno">  182</span>&#160;{</div><div class="line"><a name="l00183"></a><span class="lineno">  183</span>&#160;    <a class="code" href="group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a">PX_ALIGN</a>(16, <a class="code" href="structPx1DConstraint.html">Px1DConstraint</a>* rows);         </div><div class="line"><a name="l00184"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a70cd18a2d279f25baab00ef7838dec16">  184</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverConstraintPrepDesc.html#a70cd18a2d279f25baab00ef7838dec16">numRows</a>;                              </div><div class="line"><a name="l00185"></a><span class="lineno">  185</span>&#160;</div><div class="line"><a name="l00186"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#afb5f4b0d0b54dffbe066d8f9471474a3">  186</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverConstraintPrepDesc.html#afb5f4b0d0b54dffbe066d8f9471474a3">linBreakForce</a>, angBreakForce;        </div><div class="line"><a name="l00187"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a3988b4c1ee2db31ae8fb427bef2021c1">  187</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverConstraintPrepDesc.html#a3988b4c1ee2db31ae8fb427bef2021c1">minResponseThreshold</a>;                </div><div class="line"><a name="l00188"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#afcacb0bfb6ee3e939bade4d49e5ef4b0">  188</a></span>&#160;    <span class="keywordtype">void</span>* <a class="code" href="structPxSolverConstraintPrepDesc.html#afcacb0bfb6ee3e939bade4d49e5ef4b0">writeback</a>;                            </div><div class="line"><a name="l00189"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a7577dd31e280ee90740c321d452f88a9">  189</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverConstraintPrepDesc.html#a7577dd31e280ee90740c321d452f88a9">disablePreprocessing</a>;                  </div><div class="line"><a name="l00190"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a3b7d111854e74252e892765ac4a3b3ad">  190</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverConstraintPrepDesc.html#a3b7d111854e74252e892765ac4a3b3ad">improvedSlerp</a>;                         </div><div class="line"><a name="l00191"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#addb6a6a2962b8ce47ff7bf53db6f7759">  191</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverConstraintPrepDesc.html#addb6a6a2962b8ce47ff7bf53db6f7759">driveLimitsAreForces</a>;                  </div><div class="line"><a name="l00192"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a7dd4e0d49745651ea820b9ab93ff12c5">  192</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverConstraintPrepDesc.html#a7dd4e0d49745651ea820b9ab93ff12c5">extendedLimits</a>;                        </div><div class="line"><a name="l00193"></a><span class="lineno">  193</span>&#160;</div><div class="line"><a name="l00194"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a3a5cece8fd2221a2d154c2ca2398cf48">  194</a></span>&#160;    <a class="code" href="classPxVec3.html">PxVec3</a> <a class="code" href="structPxSolverConstraintPrepDesc.html#a3a5cece8fd2221a2d154c2ca2398cf48">body0WorldOffset</a>;                    </div><div class="line"><a name="l00195"></a><span class="lineno">  195</span>&#160;};</div><div class="line"><a name="l00196"></a><span class="lineno">  196</span>&#160;</div><div class="line"><a name="l00197"></a><span class="lineno">  197</span>&#160;</div><div class="line"><a name="l00198"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html">  198</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverContactDesc.html">PxSolverContactDesc</a> : <span class="keyword">public</span> <a class="code" href="structPxSolverConstraintPrepDescBase.html">PxSolverConstraintPrepDescBase</a></div><div class="line"><a name="l00199"></a><span class="lineno">  199</span>&#160;{</div><div class="line"><a name="l00200"></a><span class="lineno">  200</span>&#160;</div><div class="line"><a name="l00201"></a><span class="lineno">  201</span>&#160;    <a class="code" href="group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a">PX_ALIGN</a>(16, Sc::ShapeInteraction* shapeInteraction); </div><div class="line"><a name="l00202"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ac98be485f0d333ed55a1dae735286964">  202</a></span>&#160;    Gu::ContactPoint* <a class="code" href="structPxSolverContactDesc.html#ac98be485f0d333ed55a1dae735286964">contacts</a>;             </div><div class="line"><a name="l00203"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ae2a5692ebc3c99c68ac34828d15ef56d">  203</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverContactDesc.html#ae2a5692ebc3c99c68ac34828d15ef56d">numContacts</a>;                      </div><div class="line"><a name="l00204"></a><span class="lineno">  204</span>&#160;</div><div class="line"><a name="l00205"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ac19bc948958e2f2de3ca214afad28b11">  205</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverContactDesc.html#ac19bc948958e2f2de3ca214afad28b11">hasMaxImpulse</a>;                     </div><div class="line"><a name="l00206"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a7d0a19a4752992c482bc7c5d9e315927">  206</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverContactDesc.html#a7d0a19a4752992c482bc7c5d9e315927">disableStrongFriction</a>;             </div><div class="line"><a name="l00207"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ae2e4ab03c648da3de0754b5bb20470af">  207</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverContactDesc.html#ae2e4ab03c648da3de0754b5bb20470af">hasForceThresholds</a>;                </div><div class="line"><a name="l00208"></a><span class="lineno">  208</span>&#160;</div><div class="line"><a name="l00209"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a1b01af74e18c8ce096fdd19b29fe9937">  209</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverContactDesc.html#a1b01af74e18c8ce096fdd19b29fe9937">restDistance</a>;                    </div><div class="line"><a name="l00210"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a6dc38af5607703cc2fde5736f7e0ec66">  210</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverContactDesc.html#a6dc38af5607703cc2fde5736f7e0ec66">maxCCDSeparation</a>;                </div><div class="line"><a name="l00211"></a><span class="lineno">  211</span>&#160;</div><div class="line"><a name="l00212"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#aecb81b31ea78f95b634eecdc4ff787e9">  212</a></span>&#160;    <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>* <a class="code" href="structPxSolverContactDesc.html#aecb81b31ea78f95b634eecdc4ff787e9">frictionPtr</a>;                      </div><div class="line"><a name="l00213"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a26750cbe3e1f0b674678aa8354f46660">  213</a></span>&#160;    <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a> <a class="code" href="structPxSolverContactDesc.html#a26750cbe3e1f0b674678aa8354f46660">frictionCount</a>;                     </div><div class="line"><a name="l00214"></a><span class="lineno">  214</span>&#160;</div><div class="line"><a name="l00215"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a771ea901376db925ed46a6b59d3f9660">  215</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* <a class="code" href="structPxSolverContactDesc.html#a771ea901376db925ed46a6b59d3f9660">contactForces</a>;                  </div><div class="line"><a name="l00216"></a><span class="lineno">  216</span>&#160;</div><div class="line"><a name="l00217"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#afaeb10971f33613461d2c6cf1c2804fa">  217</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverContactDesc.html#afaeb10971f33613461d2c6cf1c2804fa">startFrictionPatchIndex</a>;          </div><div class="line"><a name="l00218"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ada1b86bf21e1dd614e7e4a07a9c990dd">  218</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverContactDesc.html#ada1b86bf21e1dd614e7e4a07a9c990dd">numFrictionPatches</a>;               </div><div class="line"><a name="l00219"></a><span class="lineno">  219</span>&#160;</div><div class="line"><a name="l00220"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a79971a484d887112079ba3f980e9b0e0">  220</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverContactDesc.html#a79971a484d887112079ba3f980e9b0e0">startContactPatchIndex</a>;           </div><div class="line"><a name="l00221"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a2261ea42204894094ada177641222c9a">  221</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> <a class="code" href="structPxSolverContactDesc.html#a2261ea42204894094ada177641222c9a">numContactPatches</a>;                </div><div class="line"><a name="l00222"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a5b54a6acfb85192b99f71bc9a72d2c1a">  222</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> <a class="code" href="structPxSolverContactDesc.html#a5b54a6acfb85192b99f71bc9a72d2c1a">axisConstraintCount</a>;              </div><div class="line"><a name="l00223"></a><span class="lineno">  223</span>&#160;</div><div class="line"><a name="l00224"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a1dd628330696744829667e0c883ed7be">  224</a></span>&#160;    <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a> pad[16 - <span class="keyword">sizeof</span>(<span class="keywordtype">void</span>*)];</div><div class="line"><a name="l00225"></a><span class="lineno">  225</span>&#160;};</div><div class="line"><a name="l00226"></a><span class="lineno">  226</span>&#160;</div><div class="line"><a name="l00227"></a><span class="lineno"><a class="line" href="classPxConstraintAllocator.html">  227</a></span>&#160;<span class="keyword">class </span><a class="code" href="classPxConstraintAllocator.html">PxConstraintAllocator</a></div><div class="line"><a name="l00228"></a><span class="lineno">  228</span>&#160;{</div><div class="line"><a name="l00229"></a><span class="lineno">  229</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00235"></a><span class="lineno">  235</span>&#160;    <span class="keyword">virtual</span> <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>* reserveConstraintData(<span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> byteSize) = 0;</div><div class="line"><a name="l00242"></a><span class="lineno">  242</span>&#160;    <span class="keyword">virtual</span> <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>* reserveFrictionData(<span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> byteSize) = 0;</div><div class="line"><a name="l00243"></a><span class="lineno">  243</span>&#160;</div><div class="line"><a name="l00244"></a><span class="lineno"><a class="line" href="classPxConstraintAllocator.html#a28446a3d69923bc834d8c80938ff18be">  244</a></span>&#160;    <span class="keyword">virtual</span> <a class="code" href="classPxConstraintAllocator.html#a28446a3d69923bc834d8c80938ff18be">~PxConstraintAllocator</a>() {}</div><div class="line"><a name="l00245"></a><span class="lineno">  245</span>&#160;};</div><div class="line"><a name="l00246"></a><span class="lineno">  246</span>&#160;</div><div class="line"><a name="l00247"></a><span class="lineno">  247</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00248"></a><span class="lineno">  248</span>&#160;}</div><div class="line"><a name="l00249"></a><span class="lineno">  249</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00250"></a><span class="lineno">  250</span>&#160;</div><div class="line"><a name="l00251"></a><span class="lineno">  251</span>&#160;<span class="preprocessor">#if PX_VC</span></div><div class="line"><a name="l00252"></a><span class="lineno">  252</span>&#160;<span class="preprocessor">#pragma warning(pop)</span></div><div class="line"><a name="l00253"></a><span class="lineno">  253</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00254"></a><span class="lineno">  254</span>&#160;</div><div class="line"><a name="l00255"></a><span class="lineno">  255</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00256"></a><span class="lineno">  256</span>&#160;</div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
+<a href="PxSolverDefs_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a name="l00001"></a><span class="lineno">    1</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00002"></a><span class="lineno">    2</span>&#160;<span class="comment">// Redistribution and use in source and binary forms, with or without</span></div><div class="line"><a name="l00003"></a><span class="lineno">    3</span>&#160;<span class="comment">// modification, are permitted provided that the following conditions</span></div><div class="line"><a name="l00004"></a><span class="lineno">    4</span>&#160;<span class="comment">// are met:</span></div><div class="line"><a name="l00005"></a><span class="lineno">    5</span>&#160;<span class="comment">//  * Redistributions of source code must retain the above copyright</span></div><div class="line"><a name="l00006"></a><span class="lineno">    6</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer.</span></div><div class="line"><a name="l00007"></a><span class="lineno">    7</span>&#160;<span class="comment">//  * Redistributions in binary form must reproduce the above copyright</span></div><div class="line"><a name="l00008"></a><span class="lineno">    8</span>&#160;<span class="comment">//    notice, this list of conditions and the following disclaimer in the</span></div><div class="line"><a name="l00009"></a><span class="lineno">    9</span>&#160;<span class="comment">//    documentation and/or other materials provided with the distribution.</span></div><div class="line"><a name="l00010"></a><span class="lineno">   10</span>&#160;<span class="comment">//  * Neither the name of NVIDIA CORPORATION nor the names of its</span></div><div class="line"><a name="l00011"></a><span class="lineno">   11</span>&#160;<span class="comment">//    contributors may be used to endorse or promote products derived</span></div><div class="line"><a name="l00012"></a><span class="lineno">   12</span>&#160;<span class="comment">//    from this software without specific prior written permission.</span></div><div class="line"><a name="l00013"></a><span class="lineno">   13</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00014"></a><span class="lineno">   14</span>&#160;<span class="comment">// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS&#39;&#39; AND ANY</span></div><div class="line"><a name="l00015"></a><span class="lineno">   15</span>&#160;<span class="comment">// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE</span></div><div class="line"><a name="l00016"></a><span class="lineno">   16</span>&#160;<span class="comment">// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span></div><div class="line"><a name="l00017"></a><span class="lineno">   17</span>&#160;<span class="comment">// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR</span></div><div class="line"><a name="l00018"></a><span class="lineno">   18</span>&#160;<span class="comment">// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,</span></div><div class="line"><a name="l00019"></a><span class="lineno">   19</span>&#160;<span class="comment">// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,</span></div><div class="line"><a name="l00020"></a><span class="lineno">   20</span>&#160;<span class="comment">// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR</span></div><div class="line"><a name="l00021"></a><span class="lineno">   21</span>&#160;<span class="comment">// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY</span></div><div class="line"><a name="l00022"></a><span class="lineno">   22</span>&#160;<span class="comment">// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT</span></div><div class="line"><a name="l00023"></a><span class="lineno">   23</span>&#160;<span class="comment">// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE</span></div><div class="line"><a name="l00024"></a><span class="lineno">   24</span>&#160;<span class="comment">// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span></div><div class="line"><a name="l00025"></a><span class="lineno">   25</span>&#160;<span class="comment">//</span></div><div class="line"><a name="l00026"></a><span class="lineno">   26</span>&#160;<span class="comment">// Copyright (c) 2008-2018 NVIDIA Corporation. All rights reserved.</span></div><div class="line"><a name="l00027"></a><span class="lineno">   27</span>&#160;<span class="comment">// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.</span></div><div class="line"><a name="l00028"></a><span class="lineno">   28</span>&#160;<span class="comment">// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.</span></div><div class="line"><a name="l00029"></a><span class="lineno">   29</span>&#160;</div><div class="line"><a name="l00030"></a><span class="lineno">   30</span>&#160;<span class="preprocessor">#ifndef PX_SOLVER_DEFS_H</span></div><div class="line"><a name="l00031"></a><span class="lineno">   31</span>&#160;<span class="preprocessor">#define PX_SOLVER_DEFS_H</span></div><div class="line"><a name="l00032"></a><span class="lineno">   32</span>&#160;</div><div class="line"><a name="l00033"></a><span class="lineno">   33</span>&#160;</div><div class="line"><a name="l00034"></a><span class="lineno">   34</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxPhysXConfig_8h.html">PxPhysXConfig.h</a>&quot;</span></div><div class="line"><a name="l00035"></a><span class="lineno">   35</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxVec3_8h.html">foundation/PxVec3.h</a>&quot;</span></div><div class="line"><a name="l00036"></a><span class="lineno">   36</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxMat33_8h.html">foundation/PxMat33.h</a>&quot;</span></div><div class="line"><a name="l00037"></a><span class="lineno">   37</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxTransform_8h.html">foundation/PxTransform.h</a>&quot;</span></div><div class="line"><a name="l00038"></a><span class="lineno">   38</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="PxConstraintDesc_8h.html">PxConstraintDesc.h</a>&quot;</span></div><div class="line"><a name="l00039"></a><span class="lineno">   39</span>&#160;<span class="preprocessor">#include &quot;<a class="code" href="GuContactPoint_8h.html">geomutils/GuContactPoint.h</a>&quot;</span></div><div class="line"><a name="l00040"></a><span class="lineno">   40</span>&#160;</div><div class="line"><a name="l00041"></a><span class="lineno">   41</span>&#160;<span class="preprocessor">#if PX_VC</span></div><div class="line"><a name="l00042"></a><span class="lineno">   42</span>&#160;<span class="preprocessor">#pragma warning(push)</span></div><div class="line"><a name="l00043"></a><span class="lineno">   43</span>&#160;<span class="preprocessor">#pragma warning(disable : 4324) // structure was padded due to alignment</span></div><div class="line"><a name="l00044"></a><span class="lineno">   44</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00045"></a><span class="lineno">   45</span>&#160;</div><div class="line"><a name="l00046"></a><span class="lineno">   46</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00047"></a><span class="lineno">   47</span>&#160;<span class="keyword">namespace </span><a class="code" href="namespacephysx.html">physx</a></div><div class="line"><a name="l00048"></a><span class="lineno">   48</span>&#160;{</div><div class="line"><a name="l00049"></a><span class="lineno">   49</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00050"></a><span class="lineno">   50</span>&#160;</div><div class="line"><a name="l00051"></a><span class="lineno"><a class="line" href="namespaceDy.html">   51</a></span>&#160;<span class="keyword">namespace </span><a class="code" href="namespaceDy.html">Dy</a></div><div class="line"><a name="l00052"></a><span class="lineno">   52</span>&#160;{</div><div class="line"><a name="l00053"></a><span class="lineno">   53</span>&#160;    <span class="comment">//struct FsData;</span></div><div class="line"><a name="l00054"></a><span class="lineno">   54</span>&#160;    <span class="keyword">class </span>ArticulationV;</div><div class="line"><a name="l00055"></a><span class="lineno">   55</span>&#160;}</div><div class="line"><a name="l00056"></a><span class="lineno">   56</span>&#160;</div><div class="line"><a name="l00057"></a><span class="lineno"><a class="line" href="namespaceSc.html">   57</a></span>&#160;<span class="keyword">namespace </span><a class="code" href="namespaceSc.html">Sc</a></div><div class="line"><a name="l00058"></a><span class="lineno">   58</span>&#160;{</div><div class="line"><a name="l00059"></a><span class="lineno">   59</span>&#160;    <span class="keyword">class </span>ShapeInteraction;</div><div class="line"><a name="l00060"></a><span class="lineno">   60</span>&#160;}</div><div class="line"><a name="l00061"></a><span class="lineno">   61</span>&#160;</div><div class="line"><a name="l00062"></a><span class="lineno"><a class="line" href="structPxSolverBody.html">   62</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverBody.html">PxSolverBody</a></div><div class="line"><a name="l00063"></a><span class="lineno">   63</span>&#160;{</div><div class="line"><a name="l00064"></a><span class="lineno">   64</span>&#160;    <a class="code" href="group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a">PX_ALIGN</a>(16, <a class="code" href="classPxVec3.html">PxVec3</a>) linearVelocity;                    </div><div class="line"><a name="l00065"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#a94d862d029f4dba6de658d2b84b6affc">   65</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverBody.html#a94d862d029f4dba6de658d2b84b6affc">maxSolverNormalProgress</a>;        </div><div class="line"><a name="l00066"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#a2e3b0fc4b9097cb2ded8aec5120299ec">   66</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverBody.html#a2e3b0fc4b9097cb2ded8aec5120299ec">maxSolverFrictionProgress</a>;      </div><div class="line"><a name="l00067"></a><span class="lineno">   67</span>&#160;</div><div class="line"><a name="l00068"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#ab9c0a75f4b85920be4852c53c4ed2799">   68</a></span>&#160;    <a class="code" href="classPxVec3.html">PxVec3</a>              <a class="code" href="structPxSolverBody.html#ab9c0a75f4b85920be4852c53c4ed2799">angularState</a>;                   </div><div class="line"><a name="l00069"></a><span class="lineno">   69</span>&#160;</div><div class="line"><a name="l00070"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#abe151efeabf33d524e5589fc9897fbd7">   70</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>               <a class="code" href="structPxSolverBody.html#abe151efeabf33d524e5589fc9897fbd7">solverProgress</a>;                 </div><div class="line"><a name="l00071"></a><span class="lineno">   71</span>&#160;</div><div class="line"><a name="l00072"></a><span class="lineno"><a class="line" href="structPxSolverBody.html#afa5845fa664df156d2a25300ea00593e">   72</a></span>&#160;    <a class="code" href="structPxSolverBody.html#afa5845fa664df156d2a25300ea00593e">PxSolverBody</a>() : linearVelocity(0.f), maxSolverNormalProgress(0), maxSolverFrictionProgress(0), angularState(0), solverProgress(0)</div><div class="line"><a name="l00073"></a><span class="lineno">   73</span>&#160;    {</div><div class="line"><a name="l00074"></a><span class="lineno">   74</span>&#160;    }</div><div class="line"><a name="l00075"></a><span class="lineno">   75</span>&#160;};</div><div class="line"><a name="l00076"></a><span class="lineno">   76</span>&#160;</div><div class="line"><a name="l00077"></a><span class="lineno">   77</span>&#160;<a class="code" href="PxSolverDefs_8h.html#a0b777f57ee7373fc869734591a7d1ba5">PX_COMPILE_TIME_ASSERT</a>(<span class="keyword">sizeof</span>(<a class="code" href="structPxSolverBody.html">PxSolverBody</a>) == 32);</div><div class="line"><a name="l00078"></a><span class="lineno">   78</span>&#160;</div><div class="line"><a name="l00079"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html">   79</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverBodyData.html">PxSolverBodyData</a></div><div class="line"><a name="l00080"></a><span class="lineno">   80</span>&#160;{</div><div class="line"><a name="l00081"></a><span class="lineno">   81</span>&#160;    <a class="code" href="group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a">PX_ALIGN</a>(16, <a class="code" href="classPxVec3.html">PxVec3</a> linearVelocity);    </div><div class="line"><a name="l00082"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a668762cdcd8004c19e033952be2206b6">   82</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          <a class="code" href="structPxSolverBodyData.html#a668762cdcd8004c19e033952be2206b6">invMass</a>;                </div><div class="line"><a name="l00083"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a2ab7bc060f7946ae4f3bb1da17018326">   83</a></span>&#160;    <a class="code" href="classPxVec3.html">PxVec3</a>          <a class="code" href="structPxSolverBodyData.html#a2ab7bc060f7946ae4f3bb1da17018326">angularVelocity</a>;        </div><div class="line"><a name="l00084"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a74e634af20acbf0a37246a5ec44f2248">   84</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          <a class="code" href="structPxSolverBodyData.html#a74e634af20acbf0a37246a5ec44f2248">reportThreshold</a>;        </div><div class="line"><a name="l00085"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#aeaab06a3dfac94c9ccde5e40f966f8af">   85</a></span>&#160;    <a class="code" href="classPxMat33.html">PxMat33</a>         <a class="code" href="structPxSolverBodyData.html#aeaab06a3dfac94c9ccde5e40f966f8af">sqrtInvInertia</a>;         </div><div class="line"><a name="l00086"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#ade2d671c9125af9907938a425f6b9af2">   86</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          <a class="code" href="structPxSolverBodyData.html#ade2d671c9125af9907938a425f6b9af2">penBiasClamp</a>;           </div><div class="line"><a name="l00087"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#ac01024b5307ea57e79c7891d89906296">   87</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>           <a class="code" href="structPxSolverBodyData.html#ac01024b5307ea57e79c7891d89906296">nodeIndex</a>;              </div><div class="line"><a name="l00088"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#ac974d6181623739bdc087b02c1dd311b">   88</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>          <a class="code" href="structPxSolverBodyData.html#ac974d6181623739bdc087b02c1dd311b">maxContactImpulse</a>;      </div><div class="line"><a name="l00089"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a1d32e78a191473936a6693b40185ad91">   89</a></span>&#160;    <a class="code" href="classPxTransform.html">PxTransform</a>     <a class="code" href="structPxSolverBodyData.html#a1d32e78a191473936a6693b40185ad91">body2World</a>;             </div><div class="line"><a name="l00090"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415">   90</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>           <a class="code" href="structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415">lockFlags</a>;              </div><div class="line"><a name="l00091"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7">   91</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>           <a class="code" href="structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7">pad</a>;                    </div><div class="line"><a name="l00092"></a><span class="lineno">   92</span>&#160;</div><div class="line"><a name="l00093"></a><span class="lineno"><a class="line" href="structPxSolverBodyData.html#aa443999488f683690658da520f1726c9">   93</a></span>&#160;    <a class="code" href="group__foundation.html#ga6a774eed3cad34b0f636332a3d28c6bb">PX_FORCE_INLINE</a> <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverBodyData.html#aa443999488f683690658da520f1726c9">projectVelocity</a>(<span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; lin, <span class="keyword">const</span> <a class="code" href="classPxVec3.html">PxVec3</a>&amp; ang)<span class="keyword">    const</span></div><div class="line"><a name="l00094"></a><span class="lineno">   94</span>&#160;<span class="keyword">    </span>{</div><div class="line"><a name="l00095"></a><span class="lineno">   95</span>&#160;        <span class="keywordflow">return</span> linearVelocity.<a class="code" href="classPxVec3.html#a4c136327953ea1b09be10d979279b7dc">dot</a>(lin) + angularVelocity.<a class="code" href="classPxVec3.html#a4c136327953ea1b09be10d979279b7dc">dot</a>(ang);</div><div class="line"><a name="l00096"></a><span class="lineno">   96</span>&#160;    }</div><div class="line"><a name="l00097"></a><span class="lineno">   97</span>&#160;};</div><div class="line"><a name="l00098"></a><span class="lineno">   98</span>&#160;</div><div class="line"><a name="l00099"></a><span class="lineno">   99</span>&#160;<span class="comment">//----------------------------------</span></div><div class="line"><a name="l00100"></a><span class="lineno">  100</span>&#160;<span class="comment">/*</span></div><div class="line"><a name="l00101"></a><span class="lineno">  101</span>&#160;<span class="comment">* A header that defines the size of a specific batch of constraints (of same type and without dependencies)</span></div><div class="line"><a name="l00102"></a><span class="lineno">  102</span>&#160;<span class="comment">*/</span></div><div class="line"><a name="l00103"></a><span class="lineno"><a class="line" href="structPxConstraintBatchHeader.html">  103</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxConstraintBatchHeader.html">PxConstraintBatchHeader</a></div><div class="line"><a name="l00104"></a><span class="lineno">  104</span>&#160;{</div><div class="line"><a name="l00105"></a><span class="lineno"><a class="line" href="structPxConstraintBatchHeader.html#a4b7fe2ed800f2340292ac8a6229c011d">  105</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxConstraintBatchHeader.html#a4b7fe2ed800f2340292ac8a6229c011d">mStartIndex</a>;          </div><div class="line"><a name="l00106"></a><span class="lineno"><a class="line" href="structPxConstraintBatchHeader.html#ad8e475353cb5ee9d05cc5fcb12b2d25e">  106</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> <a class="code" href="structPxConstraintBatchHeader.html#ad8e475353cb5ee9d05cc5fcb12b2d25e">mStride</a>;              </div><div class="line"><a name="l00107"></a><span class="lineno"><a class="line" href="structPxConstraintBatchHeader.html#af4cb46c92e9051700b4accde28b5c01a">  107</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> <a class="code" href="structPxConstraintBatchHeader.html#af4cb46c92e9051700b4accde28b5c01a">mConstraintType</a>;      </div><div class="line"><a name="l00108"></a><span class="lineno">  108</span>&#160;};</div><div class="line"><a name="l00109"></a><span class="lineno">  109</span>&#160;</div><div class="line"><a name="l00110"></a><span class="lineno">  110</span>&#160;</div><div class="line"><a name="l00111"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html">  111</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverConstraintDesc.html">PxSolverConstraintDesc</a></div><div class="line"><a name="l00112"></a><span class="lineno">  112</span>&#160;{</div><div class="line"><a name="l00113"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a87d502cd319544604669cb843a239fb0">  113</a></span>&#160;    <span class="keyword">static</span> <span class="keyword">const</span> <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> NO_LINK = 0xffff;</div><div class="line"><a name="l00114"></a><span class="lineno">  114</span>&#160;</div><div class="line"><a name="l00115"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150">  115</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150">ConstraintType</a></div><div class="line"><a name="l00116"></a><span class="lineno">  116</span>&#160;    {</div><div class="line"><a name="l00117"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150af07aa2676bc3a0f445124a369d76178d">  117</a></span>&#160;        <a class="code" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150af07aa2676bc3a0f445124a369d76178d">eCONTACT_CONSTRAINT</a>,                </div><div class="line"><a name="l00118"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#aa0a51c38561792f72763a9e214eaa150a964367059d7c0c8b31ee93cd614a1aac">  118</a></span>&#160;        eJOINT_CONSTRAINT                   </div><div class="line"><a name="l00119"></a><span class="lineno">  119</span>&#160;    };</div><div class="line"><a name="l00120"></a><span class="lineno">  120</span>&#160;</div><div class="line"><a name="l00121"></a><span class="lineno">  121</span>&#160;    <span class="keyword">union</span></div><div class="line"><a name="l00122"></a><span class="lineno">  122</span>&#160;    {</div><div class="line"><a name="l00123"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a917f6a5b6829c4c6f8f44182628e4e2e">  123</a></span>&#160;        <a class="code" href="structPxSolverBody.html">PxSolverBody</a>*               <a class="code" href="structPxSolverConstraintDesc.html#a917f6a5b6829c4c6f8f44182628e4e2e">bodyA</a>;          </div><div class="line"><a name="l00124"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a591b8d338b0e9932d67f405542776065">  124</a></span>&#160;        Dy::ArticulationV*          <a class="code" href="structPxSolverConstraintDesc.html#a591b8d338b0e9932d67f405542776065">articulationA</a>;  </div><div class="line"><a name="l00125"></a><span class="lineno">  125</span>&#160;    };</div><div class="line"><a name="l00126"></a><span class="lineno">  126</span>&#160;</div><div class="line"><a name="l00127"></a><span class="lineno">  127</span>&#160;    <span class="keyword">union</span></div><div class="line"><a name="l00128"></a><span class="lineno">  128</span>&#160;    {</div><div class="line"><a name="l00129"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a6144adf5b9396847a54da7d7c17a770f">  129</a></span>&#160;        <a class="code" href="structPxSolverBody.html">PxSolverBody</a>*               <a class="code" href="structPxSolverConstraintDesc.html#a6144adf5b9396847a54da7d7c17a770f">bodyB</a>;          </div><div class="line"><a name="l00130"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a1452b23eaf124e9740f26e4bb98477ca">  130</a></span>&#160;        Dy::ArticulationV*          <a class="code" href="structPxSolverConstraintDesc.html#a1452b23eaf124e9740f26e4bb98477ca">articulationB</a>;  </div><div class="line"><a name="l00131"></a><span class="lineno">  131</span>&#160;    };</div><div class="line"><a name="l00132"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#ab7d352db61fd0a031d9574a4322e629d">  132</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverConstraintDesc.html#ab7d352db61fd0a031d9574a4322e629d">linkIndexA</a>;         </div><div class="line"><a name="l00133"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a833220f6cf3fef57bee54a4cfaa81049">  133</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverConstraintDesc.html#a833220f6cf3fef57bee54a4cfaa81049">linkIndexB</a>;         </div><div class="line"><a name="l00134"></a><span class="lineno">  134</span>&#160;    <span class="keyword">union</span></div><div class="line"><a name="l00135"></a><span class="lineno">  135</span>&#160;    {</div><div class="line"><a name="l00136"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#abada1c0d78bba24998736ead4bd808fb">  136</a></span>&#160;        <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverConstraintDesc.html#abada1c0d78bba24998736ead4bd808fb">articulationALength</a>;    </div><div class="line"><a name="l00137"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a75d8cc26a512bd1e1a1e1a965decfa83">  137</a></span>&#160;        <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>               <a class="code" href="structPxSolverConstraintDesc.html#a75d8cc26a512bd1e1a1e1a965decfa83">bodyADataIndex</a>;         </div><div class="line"><a name="l00138"></a><span class="lineno">  138</span>&#160;    };</div><div class="line"><a name="l00139"></a><span class="lineno">  139</span>&#160;</div><div class="line"><a name="l00140"></a><span class="lineno">  140</span>&#160;    <span class="keyword">union</span></div><div class="line"><a name="l00141"></a><span class="lineno">  141</span>&#160;    {</div><div class="line"><a name="l00142"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a911f59c5a7b2dcf4ed5991ec1b6d7153">  142</a></span>&#160;        <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>               <a class="code" href="structPxSolverConstraintDesc.html#a911f59c5a7b2dcf4ed5991ec1b6d7153">articulationBLength</a>;    </div><div class="line"><a name="l00143"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a0e7c10b35dcc40b914857957fa64beb2">  143</a></span>&#160;        <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>               <a class="code" href="structPxSolverConstraintDesc.html#a0e7c10b35dcc40b914857957fa64beb2">bodyBDataIndex</a>;         </div><div class="line"><a name="l00144"></a><span class="lineno">  144</span>&#160;    };</div><div class="line"><a name="l00145"></a><span class="lineno">  145</span>&#160;</div><div class="line"><a name="l00146"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#aaa77283c5efaa8f2b11611881729e8b8">  146</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>                   <a class="code" href="structPxSolverConstraintDesc.html#aaa77283c5efaa8f2b11611881729e8b8">writeBackLengthOver4</a>;   </div><div class="line"><a name="l00147"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a417fed9862da2ced60f98353fea74ace">  147</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a>                   <a class="code" href="structPxSolverConstraintDesc.html#a417fed9862da2ced60f98353fea74ace">constraintLengthOver16</a>; </div><div class="line"><a name="l00148"></a><span class="lineno">  148</span>&#160;</div><div class="line"><a name="l00149"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#a7a577215addd09b9e092aae536157161">  149</a></span>&#160;    <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>*                   <a class="code" href="structPxSolverConstraintDesc.html#a7a577215addd09b9e092aae536157161">constraint</a>;             </div><div class="line"><a name="l00150"></a><span class="lineno"><a class="line" href="structPxSolverConstraintDesc.html#ae9b299864d3b98e18fcfc6a089bb7f79">  150</a></span>&#160;    <span class="keywordtype">void</span>*                   <a class="code" href="structPxSolverConstraintDesc.html#ae9b299864d3b98e18fcfc6a089bb7f79">writeBack</a>;              </div><div class="line"><a name="l00151"></a><span class="lineno">  151</span>&#160;};</div><div class="line"><a name="l00152"></a><span class="lineno">  152</span>&#160;</div><div class="line"><a name="l00153"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html">  153</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverConstraintPrepDescBase.html">PxSolverConstraintPrepDescBase</a></div><div class="line"><a name="l00154"></a><span class="lineno">  154</span>&#160;{</div><div class="line"><a name="l00155"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4f">  155</a></span>&#160;    <span class="keyword">enum</span> <a class="code" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4f">BodyState</a></div><div class="line"><a name="l00156"></a><span class="lineno">  156</span>&#160;    {</div><div class="line"><a name="l00157"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4fa6f030c0742671e6fe38c87ff48eba4ca">  157</a></span>&#160;        eDYNAMIC_BODY = 1 &lt;&lt; 0,</div><div class="line"><a name="l00158"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4fa558ab36462aa417f686f03eabf3db6b3">  158</a></span>&#160;        eSTATIC_BODY = 1 &lt;&lt; 1,</div><div class="line"><a name="l00159"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4fa60a5b498ccb90a2f703fb277b7784807">  159</a></span>&#160;        eKINEMATIC_BODY = 1 &lt;&lt; 2,</div><div class="line"><a name="l00160"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4fa55d8f5d447451c19b05290583597ce1c">  160</a></span>&#160;        eARTICULATION = 1 &lt;&lt; 3</div><div class="line"><a name="l00161"></a><span class="lineno">  161</span>&#160;    };</div><div class="line"><a name="l00162"></a><span class="lineno">  162</span>&#160;</div><div class="line"><a name="l00163"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a3ad8129bcfce0a8891ed853a3896095d">  163</a></span>&#160;    <a class="code" href="structPxConstraintInvMassScale.html">PxConstraintInvMassScale</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a3ad8129bcfce0a8891ed853a3896095d">mInvMassScales</a>;    </div><div class="line"><a name="l00164"></a><span class="lineno">  164</span>&#160;</div><div class="line"><a name="l00165"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#ae6a7c6dbbf6ade521ca9f485fa70ecab">  165</a></span>&#160;    <a class="code" href="structPxSolverConstraintDesc.html">PxSolverConstraintDesc</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#ae6a7c6dbbf6ade521ca9f485fa70ecab">desc</a>;               </div><div class="line"><a name="l00166"></a><span class="lineno">  166</span>&#160;</div><div class="line"><a name="l00167"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a58839e53ce16cf2d9ec3a54ead955870">  167</a></span>&#160;    <span class="keyword">const</span> <a class="code" href="structPxSolverBody.html">PxSolverBody</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#a58839e53ce16cf2d9ec3a54ead955870">body0</a>;                  </div><div class="line"><a name="l00168"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a6cc18f29a8da2a1e8d6af3e9fdce942c">  168</a></span>&#160;    <span class="keyword">const</span> <a class="code" href="structPxSolverBody.html">PxSolverBody</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#a6cc18f29a8da2a1e8d6af3e9fdce942c">body1</a>;                  </div><div class="line"><a name="l00169"></a><span class="lineno">  169</span>&#160;</div><div class="line"><a name="l00170"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#ad58b181d4ac418941bddfd3ce8065a10">  170</a></span>&#160;    <span class="keyword">const</span> <a class="code" href="structPxSolverBodyData.html">PxSolverBodyData</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#ad58b181d4ac418941bddfd3ce8065a10">data0</a>;              </div><div class="line"><a name="l00171"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#aeeffe63a1a4b9f7b3a822b0f2d99a62d">  171</a></span>&#160;    <span class="keyword">const</span> <a class="code" href="structPxSolverBodyData.html">PxSolverBodyData</a>* <a class="code" href="structPxSolverConstraintPrepDescBase.html#aeeffe63a1a4b9f7b3a822b0f2d99a62d">data1</a>;              </div><div class="line"><a name="l00172"></a><span class="lineno">  172</span>&#160;</div><div class="line"><a name="l00173"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a251d9872155e952dcdd7dec21b9471a3">  173</a></span>&#160;    <a class="code" href="classPxTransform.html">PxTransform</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a251d9872155e952dcdd7dec21b9471a3">bodyFrame0</a>;                     </div><div class="line"><a name="l00174"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a043bdbd0966b02478725edcb027a9819">  174</a></span>&#160;    <a class="code" href="classPxTransform.html">PxTransform</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a043bdbd0966b02478725edcb027a9819">bodyFrame1</a>;                     </div><div class="line"><a name="l00175"></a><span class="lineno">  175</span>&#160;</div><div class="line"><a name="l00176"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a4173f8c3d6662232720fdd99c9f65b9e">  176</a></span>&#160;    <a class="code" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4f">BodyState</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a4173f8c3d6662232720fdd99c9f65b9e">bodyState0</a>;                       </div><div class="line"><a name="l00177"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDescBase.html#a6f2321b7e28076c3b14e30c0c9f9bdf3">  177</a></span>&#160;    <a class="code" href="structPxSolverConstraintPrepDescBase.html#afadce7bc6a74c62a91c1876aa34e9a4f">BodyState</a> <a class="code" href="structPxSolverConstraintPrepDescBase.html#a6f2321b7e28076c3b14e30c0c9f9bdf3">bodyState1</a>;                       </div><div class="line"><a name="l00178"></a><span class="lineno">  178</span>&#160;</div><div class="line"><a name="l00179"></a><span class="lineno">  179</span>&#160;};</div><div class="line"><a name="l00180"></a><span class="lineno">  180</span>&#160;</div><div class="line"><a name="l00181"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html">  181</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverConstraintPrepDesc.html">PxSolverConstraintPrepDesc</a> : <span class="keyword">public</span> <a class="code" href="structPxSolverConstraintPrepDescBase.html">PxSolverConstraintPrepDescBase</a></div><div class="line"><a name="l00182"></a><span class="lineno">  182</span>&#160;{</div><div class="line"><a name="l00183"></a><span class="lineno">  183</span>&#160;    <a class="code" href="group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a">PX_ALIGN</a>(16, <a class="code" href="structPx1DConstraint.html">Px1DConstraint</a>* rows);         </div><div class="line"><a name="l00184"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a70cd18a2d279f25baab00ef7838dec16">  184</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverConstraintPrepDesc.html#a70cd18a2d279f25baab00ef7838dec16">numRows</a>;                              </div><div class="line"><a name="l00185"></a><span class="lineno">  185</span>&#160;</div><div class="line"><a name="l00186"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#afb5f4b0d0b54dffbe066d8f9471474a3">  186</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverConstraintPrepDesc.html#afb5f4b0d0b54dffbe066d8f9471474a3">linBreakForce</a>, angBreakForce;        </div><div class="line"><a name="l00187"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a3988b4c1ee2db31ae8fb427bef2021c1">  187</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverConstraintPrepDesc.html#a3988b4c1ee2db31ae8fb427bef2021c1">minResponseThreshold</a>;                </div><div class="line"><a name="l00188"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#afcacb0bfb6ee3e939bade4d49e5ef4b0">  188</a></span>&#160;    <span class="keywordtype">void</span>* <a class="code" href="structPxSolverConstraintPrepDesc.html#afcacb0bfb6ee3e939bade4d49e5ef4b0">writeback</a>;                            </div><div class="line"><a name="l00189"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a7577dd31e280ee90740c321d452f88a9">  189</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverConstraintPrepDesc.html#a7577dd31e280ee90740c321d452f88a9">disablePreprocessing</a>;                  </div><div class="line"><a name="l00190"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a3b7d111854e74252e892765ac4a3b3ad">  190</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverConstraintPrepDesc.html#a3b7d111854e74252e892765ac4a3b3ad">improvedSlerp</a>;                         </div><div class="line"><a name="l00191"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#addb6a6a2962b8ce47ff7bf53db6f7759">  191</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverConstraintPrepDesc.html#addb6a6a2962b8ce47ff7bf53db6f7759">driveLimitsAreForces</a>;                  </div><div class="line"><a name="l00192"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a7dd4e0d49745651ea820b9ab93ff12c5">  192</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverConstraintPrepDesc.html#a7dd4e0d49745651ea820b9ab93ff12c5">extendedLimits</a>;                        </div><div class="line"><a name="l00193"></a><span class="lineno">  193</span>&#160;</div><div class="line"><a name="l00194"></a><span class="lineno"><a class="line" href="structPxSolverConstraintPrepDesc.html#a3a5cece8fd2221a2d154c2ca2398cf48">  194</a></span>&#160;    <a class="code" href="classPxVec3.html">PxVec3</a> <a class="code" href="structPxSolverConstraintPrepDesc.html#a3a5cece8fd2221a2d154c2ca2398cf48">body0WorldOffset</a>;                    </div><div class="line"><a name="l00195"></a><span class="lineno">  195</span>&#160;};</div><div class="line"><a name="l00196"></a><span class="lineno">  196</span>&#160;</div><div class="line"><a name="l00197"></a><span class="lineno">  197</span>&#160;</div><div class="line"><a name="l00198"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html">  198</a></span>&#160;<span class="keyword">struct </span><a class="code" href="structPxSolverContactDesc.html">PxSolverContactDesc</a> : <span class="keyword">public</span> <a class="code" href="structPxSolverConstraintPrepDescBase.html">PxSolverConstraintPrepDescBase</a></div><div class="line"><a name="l00199"></a><span class="lineno">  199</span>&#160;{</div><div class="line"><a name="l00200"></a><span class="lineno">  200</span>&#160;</div><div class="line"><a name="l00201"></a><span class="lineno">  201</span>&#160;    <a class="code" href="group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a">PX_ALIGN</a>(16, Sc::ShapeInteraction* shapeInteraction); </div><div class="line"><a name="l00202"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ac98be485f0d333ed55a1dae735286964">  202</a></span>&#160;    Gu::ContactPoint* <a class="code" href="structPxSolverContactDesc.html#ac98be485f0d333ed55a1dae735286964">contacts</a>;             </div><div class="line"><a name="l00203"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ae2a5692ebc3c99c68ac34828d15ef56d">  203</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverContactDesc.html#ae2a5692ebc3c99c68ac34828d15ef56d">numContacts</a>;                      </div><div class="line"><a name="l00204"></a><span class="lineno">  204</span>&#160;</div><div class="line"><a name="l00205"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ac19bc948958e2f2de3ca214afad28b11">  205</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverContactDesc.html#ac19bc948958e2f2de3ca214afad28b11">hasMaxImpulse</a>;                     </div><div class="line"><a name="l00206"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a7d0a19a4752992c482bc7c5d9e315927">  206</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverContactDesc.html#a7d0a19a4752992c482bc7c5d9e315927">disableStrongFriction</a>;             </div><div class="line"><a name="l00207"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ae2e4ab03c648da3de0754b5bb20470af">  207</a></span>&#160;    <span class="keywordtype">bool</span> <a class="code" href="structPxSolverContactDesc.html#ae2e4ab03c648da3de0754b5bb20470af">hasForceThresholds</a>;                </div><div class="line"><a name="l00208"></a><span class="lineno">  208</span>&#160;</div><div class="line"><a name="l00209"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a1b01af74e18c8ce096fdd19b29fe9937">  209</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverContactDesc.html#a1b01af74e18c8ce096fdd19b29fe9937">restDistance</a>;                    </div><div class="line"><a name="l00210"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a6dc38af5607703cc2fde5736f7e0ec66">  210</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a> <a class="code" href="structPxSolverContactDesc.html#a6dc38af5607703cc2fde5736f7e0ec66">maxCCDSeparation</a>;                </div><div class="line"><a name="l00211"></a><span class="lineno">  211</span>&#160;</div><div class="line"><a name="l00212"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#aecb81b31ea78f95b634eecdc4ff787e9">  212</a></span>&#160;    <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>* <a class="code" href="structPxSolverContactDesc.html#aecb81b31ea78f95b634eecdc4ff787e9">frictionPtr</a>;                      </div><div class="line"><a name="l00213"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a26750cbe3e1f0b674678aa8354f46660">  213</a></span>&#160;    <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a> <a class="code" href="structPxSolverContactDesc.html#a26750cbe3e1f0b674678aa8354f46660">frictionCount</a>;                     </div><div class="line"><a name="l00214"></a><span class="lineno">  214</span>&#160;</div><div class="line"><a name="l00215"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a771ea901376db925ed46a6b59d3f9660">  215</a></span>&#160;    <a class="code" href="namespacephysx.html#a727d2d8426e2a21ebbc522fa65c3f97a">PxReal</a>* <a class="code" href="structPxSolverContactDesc.html#a771ea901376db925ed46a6b59d3f9660">contactForces</a>;                  </div><div class="line"><a name="l00216"></a><span class="lineno">  216</span>&#160;</div><div class="line"><a name="l00217"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#afaeb10971f33613461d2c6cf1c2804fa">  217</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverContactDesc.html#afaeb10971f33613461d2c6cf1c2804fa">startFrictionPatchIndex</a>;          </div><div class="line"><a name="l00218"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#ada1b86bf21e1dd614e7e4a07a9c990dd">  218</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverContactDesc.html#ada1b86bf21e1dd614e7e4a07a9c990dd">numFrictionPatches</a>;               </div><div class="line"><a name="l00219"></a><span class="lineno">  219</span>&#160;</div><div class="line"><a name="l00220"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a79971a484d887112079ba3f980e9b0e0">  220</a></span>&#160;    <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> <a class="code" href="structPxSolverContactDesc.html#a79971a484d887112079ba3f980e9b0e0">startContactPatchIndex</a>;           </div><div class="line"><a name="l00221"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a2261ea42204894094ada177641222c9a">  221</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> <a class="code" href="structPxSolverContactDesc.html#a2261ea42204894094ada177641222c9a">numContactPatches</a>;                </div><div class="line"><a name="l00222"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a5b54a6acfb85192b99f71bc9a72d2c1a">  222</a></span>&#160;    <a class="code" href="namespacephysx.html#a9caf1cbcda071b6d2a9c069faa99da23">PxU16</a> <a class="code" href="structPxSolverContactDesc.html#a5b54a6acfb85192b99f71bc9a72d2c1a">axisConstraintCount</a>;              </div><div class="line"><a name="l00223"></a><span class="lineno">  223</span>&#160;</div><div class="line"><a name="l00224"></a><span class="lineno"><a class="line" href="structPxSolverContactDesc.html#a1dd628330696744829667e0c883ed7be">  224</a></span>&#160;    <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a> pad[16 - <span class="keyword">sizeof</span>(<span class="keywordtype">void</span>*)];</div><div class="line"><a name="l00225"></a><span class="lineno">  225</span>&#160;};</div><div class="line"><a name="l00226"></a><span class="lineno">  226</span>&#160;</div><div class="line"><a name="l00227"></a><span class="lineno"><a class="line" href="classPxConstraintAllocator.html">  227</a></span>&#160;<span class="keyword">class </span><a class="code" href="classPxConstraintAllocator.html">PxConstraintAllocator</a></div><div class="line"><a name="l00228"></a><span class="lineno">  228</span>&#160;{</div><div class="line"><a name="l00229"></a><span class="lineno">  229</span>&#160;<span class="keyword">public</span>:</div><div class="line"><a name="l00235"></a><span class="lineno">  235</span>&#160;    <span class="keyword">virtual</span> <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>* reserveConstraintData(<span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> byteSize) = 0;</div><div class="line"><a name="l00242"></a><span class="lineno">  242</span>&#160;    <span class="keyword">virtual</span> <a class="code" href="namespacephysx.html#a3849f86abc21d3a58949481603fe8309">PxU8</a>* reserveFrictionData(<span class="keyword">const</span> <a class="code" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a> byteSize) = 0;</div><div class="line"><a name="l00243"></a><span class="lineno">  243</span>&#160;</div><div class="line"><a name="l00244"></a><span class="lineno"><a class="line" href="classPxConstraintAllocator.html#a28446a3d69923bc834d8c80938ff18be">  244</a></span>&#160;    <span class="keyword">virtual</span> <a class="code" href="classPxConstraintAllocator.html#a28446a3d69923bc834d8c80938ff18be">~PxConstraintAllocator</a>() {}</div><div class="line"><a name="l00245"></a><span class="lineno">  245</span>&#160;};</div><div class="line"><a name="l00246"></a><span class="lineno">  246</span>&#160;</div><div class="line"><a name="l00247"></a><span class="lineno">  247</span>&#160;<span class="preprocessor">#if !PX_DOXYGEN</span></div><div class="line"><a name="l00248"></a><span class="lineno">  248</span>&#160;}</div><div class="line"><a name="l00249"></a><span class="lineno">  249</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00250"></a><span class="lineno">  250</span>&#160;</div><div class="line"><a name="l00251"></a><span class="lineno">  251</span>&#160;<span class="preprocessor">#if PX_VC</span></div><div class="line"><a name="l00252"></a><span class="lineno">  252</span>&#160;<span class="preprocessor">#pragma warning(pop)</span></div><div class="line"><a name="l00253"></a><span class="lineno">  253</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00254"></a><span class="lineno">  254</span>&#160;</div><div class="line"><a name="l00255"></a><span class="lineno">  255</span>&#160;<span class="preprocessor">#endif</span></div><div class="line"><a name="l00256"></a><span class="lineno">  256</span>&#160;</div><div class="ttc" id="namespacephysx_html"><div class="ttname"><a href="namespacephysx.html">physx</a></div><div class="ttdef"><b>Definition:</b> GuContactBuffer.h:37</div></div>
 <div class="ttc" id="structPxSolverBody_html_ab9c0a75f4b85920be4852c53c4ed2799"><div class="ttname"><a href="structPxSolverBody.html#ab9c0a75f4b85920be4852c53c4ed2799">PxSolverBody::angularState</a></div><div class="ttdeci">PxVec3 angularState</div><div class="ttdoc">Delta angular velocity state computed by the solver. </div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:68</div></div>
+<div class="ttc" id="structPxSolverBodyData_html_aa406db65fc9a3a26764ffb91dbbad1b7"><div class="ttname"><a href="structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7">PxSolverBodyData::pad</a></div><div class="ttdeci">PxU16 pad</div><div class="ttdoc">112 pad </div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:91</div></div>
 <div class="ttc" id="structPxSolverConstraintPrepDescBase_html_a4173f8c3d6662232720fdd99c9f65b9e"><div class="ttname"><a href="structPxSolverConstraintPrepDescBase.html#a4173f8c3d6662232720fdd99c9f65b9e">PxSolverConstraintPrepDescBase::bodyState0</a></div><div class="ttdeci">BodyState bodyState0</div><div class="ttdoc">In: Defines what kind of actor the first body is. </div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:176</div></div>
 <div class="ttc" id="structPxConstraintBatchHeader_html_ad8e475353cb5ee9d05cc5fcb12b2d25e"><div class="ttname"><a href="structPxConstraintBatchHeader.html#ad8e475353cb5ee9d05cc5fcb12b2d25e">PxConstraintBatchHeader::mStride</a></div><div class="ttdeci">PxU16 mStride</div><div class="ttdoc">Number of constraints in this batch (range: 1-4) </div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:106</div></div>
 <div class="ttc" id="structPxSolverConstraintPrepDesc_html_afcacb0bfb6ee3e939bade4d49e5ef4b0"><div class="ttname"><a href="structPxSolverConstraintPrepDesc.html#afcacb0bfb6ee3e939bade4d49e5ef4b0">PxSolverConstraintPrepDesc::writeback</a></div><div class="ttdeci">void * writeback</div><div class="ttdoc">Pointer to constraint writeback structure. Reports back joint breaking. If not required, set to NULL. </div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:188</div></div>
@@ -181,7 +182,6 @@
 <div class="ttc" id="structPxSolverContactDesc_html_a771ea901376db925ed46a6b59d3f9660"><div class="ttname"><a href="structPxSolverContactDesc.html#a771ea901376db925ed46a6b59d3f9660">PxSolverContactDesc::contactForces</a></div><div class="ttdeci">PxReal * contactForces</div><div class="ttdoc">Out: A buffer for the solver to write applied contact forces to. </div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:215</div></div>
 <div class="ttc" id="structPxSolverConstraintPrepDescBase_html_ad58b181d4ac418941bddfd3ce8065a10"><div class="ttname"><a href="structPxSolverConstraintPrepDescBase.html#ad58b181d4ac418941bddfd3ce8065a10">PxSolverConstraintPrepDescBase::data0</a></div><div class="ttdeci">const PxSolverBodyData * data0</div><div class="ttdoc">In: The first PxSolverBodyData. Stores mass and miscellaneous information for the first body...</div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:170</div></div>
 <div class="ttc" id="structPxSolverConstraintDesc_html_a75d8cc26a512bd1e1a1e1a965decfa83"><div class="ttname"><a href="structPxSolverConstraintDesc.html#a75d8cc26a512bd1e1a1e1a965decfa83">PxSolverConstraintDesc::bodyADataIndex</a></div><div class="ttdeci">PxU32 bodyADataIndex</div><div class="ttdoc">Body A&amp;#39;s index into the SolverBodyData array. </div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:137</div></div>
-<div class="ttc" id="structPxSolverBodyData_html_a92113241e1bc8ad1ebffafcd0873410f"><div class="ttname"><a href="structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f">PxSolverBodyData::hasSurfaceVelocity</a></div><div class="ttdeci">PxU16 hasSurfaceVelocity</div><div class="ttdoc">112 surfaceVelocity </div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:91</div></div>
 <div class="ttc" id="classPxVec3_html"><div class="ttname"><a href="classPxVec3.html">PxVec3</a></div><div class="ttdoc">3 Element vector class. </div><div class="ttdef"><b>Definition:</b> PxVec3.h:49</div></div>
 <div class="ttc" id="structPxSolverConstraintDesc_html_a1452b23eaf124e9740f26e4bb98477ca"><div class="ttname"><a href="structPxSolverConstraintDesc.html#a1452b23eaf124e9740f26e4bb98477ca">PxSolverConstraintDesc::articulationB</a></div><div class="ttdeci">Dy::ArticulationV * articulationB</div><div class="ttdoc">Articulation pointer for body B. </div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:130</div></div>
 <div class="ttc" id="structPxSolverConstraintDesc_html_ae9b299864d3b98e18fcfc6a089bb7f79"><div class="ttname"><a href="structPxSolverConstraintDesc.html#ae9b299864d3b98e18fcfc6a089bb7f79">PxSolverConstraintDesc::writeBack</a></div><div class="ttdeci">void * writeBack</div><div class="ttdoc">Pointer to the writeback structure results for this given constraint are to be written to...</div><div class="ttdef"><b>Definition:</b> PxSolverDefs.h:150</div></div>
diff --git a/physx/documentation/PhysXAPI/files/PxTypeInfo_8h_source.html b/physx/documentation/PhysXAPI/files/PxTypeInfo_8h_source.html
index af538112e..67f38dbad 100644
--- a/physx/documentation/PhysXAPI/files/PxTypeInfo_8h_source.html
+++ b/physx/documentation/PhysXAPI/files/PxTypeInfo_8h_source.html
@@ -93,7 +93,7 @@
 <div class="ttc" id="structPxConcreteType_html_ad762aca1d4c8fffae41267fae4f1b8e8ab6585d569ee421acfeadc5d89adf3bc3"><div class="ttname"><a href="structPxConcreteType.html#ad762aca1d4c8fffae41267fae4f1b8e8ab6585d569ee421acfeadc5d89adf3bc3">PxConcreteType::eSHAPE</a></div><div class="ttdef"><b>Definition:</b> PxTypeInfo.h:66</div></div>
 <div class="ttc" id="PxPhysXCommonConfig_8h_html"><div class="ttname"><a href="PxPhysXCommonConfig_8h.html">PxPhysXCommonConfig.h</a></div></div>
 <div class="ttc" id="classPxRigidDynamic_html"><div class="ttname"><a href="classPxRigidDynamic.html">PxRigidDynamic</a></div><div class="ttdoc">PxRigidDynamic represents a dynamic rigid simulation object in the physics SDK. </div><div class="ttdef"><b>Definition:</b> PxRigidDynamic.h:83</div></div>
-<div class="ttc" id="classPxArticulationJoint_html"><div class="ttname"><a href="classPxArticulationJoint.html">PxArticulationJoint</a></div><div class="ttdoc">a joint between two links in an articulation. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:195</div></div>
+<div class="ttc" id="classPxArticulationJoint_html"><div class="ttname"><a href="classPxArticulationJoint.html">PxArticulationJoint</a></div><div class="ttdoc">a joint between two links in an articulation. </div><div class="ttdef"><b>Definition:</b> PxArticulationJoint.h:193</div></div>
 <div class="ttc" id="structPxConcreteType_html_ad762aca1d4c8fffae41267fae4f1b8e8a0ff66b28d98136906e07b15b396d7a1f"><div class="ttname"><a href="structPxConcreteType.html#ad762aca1d4c8fffae41267fae4f1b8e8a0ff66b28d98136906e07b15b396d7a1f">PxConcreteType::ePHYSX_CORE_COUNT</a></div><div class="ttdef"><b>Definition:</b> PxTypeInfo.h:76</div></div>
 <div class="ttc" id="classPxActor_html"><div class="ttname"><a href="classPxActor.html">PxActor</a></div><div class="ttdoc">PxActor is the base class for the main simulation objects in the physics SDK. </div><div class="ttdef"><b>Definition:</b> PxActor.h:154</div></div>
 <div class="ttc" id="structPxConcreteType_html_ad762aca1d4c8fffae41267fae4f1b8e8a59470211a58ba63a20540005c76fd767"><div class="ttname"><a href="structPxConcreteType.html#ad762aca1d4c8fffae41267fae4f1b8e8a59470211a58ba63a20540005c76fd767">PxConcreteType::eTRIANGLE_MESH_BVH34</a></div><div class="ttdef"><b>Definition:</b> PxTypeInfo.h:62</div></div>
diff --git a/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate-members.html b/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate-members.html
index 1f104a46b..44cfc1c56 100644
--- a/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate-members.html
+++ b/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate-members.html
@@ -149,23 +149,24 @@
   <tr><td class="entry"><a class="el" href="classPxArticulationBase.html#aaaa9cfef0b929c0bbe4b6fa8811249fd">releaseArticulationJoint</a>(PxArticulationJointBase *joint)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#ae3b3c188ffd64c4ba7cfd7508efd63bf">releaseCache</a>(PxArticulationCache &amp;cache) const =0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
   <tr><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#a024c2e5000ad1033a868a73b6a025c03">removeLoopJoint</a>(PxJoint *joint)=0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564">setArticulationFlags</a>(PxArticulationFlags flags)=0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxBase.html#a1dd20bb574075df904e1878f2e771ef9">setBaseFlag</a>(PxBaseFlag::Enum flag, bool value)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#afb391b692c561df0a54c413785e4ebd0">setBaseFlags</a>(PxBaseFlags inFlags)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxArticulationBase.html#a1c5af081a9ec2c7f1e2a828d3a969900">setName</a>(const char *name)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationBase.html#aae9b8704f26ec7e60582774d9a38ef6b">setSleepThreshold</a>(PxReal threshold)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxArticulationBase.html#a8059735ba075c0839c4ebfeab34906dc">setSolverIterationCounts</a>(PxU32 minPositionIters, PxU32 minVelocityIters=1)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationBase.html#a1cf09f7f818b312205a29fb8e7f39f7f">setStabilizationThreshold</a>(PxReal threshold)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxArticulationBase.html#a8a556ddf37be3668fa9d1202db6ae48e">setWakeCounter</a>(PxReal wakeCounterValue)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#a2d067a41dc5bcbbdbf9a298d9064b310">teleportRootLink</a>(const PxTransform &amp;pose, bool autowake)=0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxBase.html#a3782ca64c8f12c41443f604e300fc207">typeMatch</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#a10d36a5ac6bb414b2240774cabdc4695">unpackJointData</a>(const PxReal *reduced, PxReal *maximum) const =0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxArticulationBase.html#ae8ee210b8b3867a8a0803875c6e53014">userData</a></td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationBase.html#a236592b36cced8478b3e4385c54007af">wakeUp</a>()=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#a919526ace70d5aaa4fb34ad0fcc98fd0">zeroCache</a>(PxArticulationCache &amp;cache)=0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationBase.html#a6fe7170f2077b283b7da5dfb23b25ba0">~PxArticulationBase</a>()</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#a1a6a5ea9fbce5fab1389809cde5ce12a">~PxArticulationReducedCoordinate</a>()</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#adfbf62dc32775b068db1c74c7fd3d982">~PxBase</a>()</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#aea7152a77f9446960d65359e482595c4">setArticulationFlag</a>(PxArticulationFlag::Enum flag, bool value)=0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564">setArticulationFlags</a>(PxArticulationFlags flags)=0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#a1dd20bb574075df904e1878f2e771ef9">setBaseFlag</a>(PxBaseFlag::Enum flag, bool value)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxBase.html#afb391b692c561df0a54c413785e4ebd0">setBaseFlags</a>(PxBaseFlags inFlags)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationBase.html#a1c5af081a9ec2c7f1e2a828d3a969900">setName</a>(const char *name)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxArticulationBase.html#aae9b8704f26ec7e60582774d9a38ef6b">setSleepThreshold</a>(PxReal threshold)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationBase.html#a8059735ba075c0839c4ebfeab34906dc">setSolverIterationCounts</a>(PxU32 minPositionIters, PxU32 minVelocityIters=1)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxArticulationBase.html#a1cf09f7f818b312205a29fb8e7f39f7f">setStabilizationThreshold</a>(PxReal threshold)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationBase.html#a8a556ddf37be3668fa9d1202db6ae48e">setWakeCounter</a>(PxReal wakeCounterValue)=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#a2d067a41dc5bcbbdbf9a298d9064b310">teleportRootLink</a>(const PxTransform &amp;pose, bool autowake)=0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#a3782ca64c8f12c41443f604e300fc207">typeMatch</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#a10d36a5ac6bb414b2240774cabdc4695">unpackJointData</a>(const PxReal *reduced, PxReal *maximum) const =0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationBase.html#ae8ee210b8b3867a8a0803875c6e53014">userData</a></td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxArticulationBase.html#a236592b36cced8478b3e4385c54007af">wakeUp</a>()=0</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#a919526ace70d5aaa4fb34ad0fcc98fd0">zeroCache</a>(PxArticulationCache &amp;cache)=0</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxArticulationBase.html#a6fe7170f2077b283b7da5dfb23b25ba0">~PxArticulationBase</a>()</td><td class="entry"><a class="el" href="classPxArticulationBase.html">PxArticulationBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html#a1a6a5ea9fbce5fab1389809cde5ce12a">~PxArticulationReducedCoordinate</a>()</td><td class="entry"><a class="el" href="classPxArticulationReducedCoordinate.html">PxArticulationReducedCoordinate</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxBase.html#adfbf62dc32775b068db1c74c7fd3d982">~PxBase</a>()</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">virtual</span></td></tr>
 </table></div><!-- contents -->
 </div><!-- doc-content -->
 <!-- HTML footer for doxygen 1.8.14-->
diff --git a/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate.html b/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate.html
index 27c42bad2..a4eebb9b1 100644
--- a/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate.html
+++ b/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate.html
@@ -116,8 +116,11 @@
 <tr class="memdesc:a15d00fc95ae61c2906977453e7597d9b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Releases the <a class="el" href="classPxBase.html" title="Base class for objects that can be members of a PxCollection. ">PxBase</a> instance, please check documentation of release in derived class.  <a href="#a15d00fc95ae61c2906977453e7597d9b">More...</a><br /></td></tr>
 <tr class="separator:a15d00fc95ae61c2906977453e7597d9b"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ab155a21325dd588b7e354d3a5b1ef564"><td class="memItemLeft" align="right" valign="top">virtual void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564">setArticulationFlags</a> (<a class="el" href="group__physics.html#ga40a321fce7bcb9669d32fca769d2958a">PxArticulationFlags</a> <a class="el" href="PxConstraintDesc_8h.html#acfdd81caa30ceb0af5fafb4064b1bc67">flags</a>)=0</td></tr>
-<tr class="memdesc:ab155a21325dd588b7e354d3a5b1ef564"><td class="mdescLeft">&#160;</td><td class="mdescRight">copy the internal data of the articulation to the cache  <a href="#ab155a21325dd588b7e354d3a5b1ef564">More...</a><br /></td></tr>
+<tr class="memdesc:ab155a21325dd588b7e354d3a5b1ef564"><td class="mdescLeft">&#160;</td><td class="mdescRight">Sets flags on the articulation.  <a href="#ab155a21325dd588b7e354d3a5b1ef564">More...</a><br /></td></tr>
 <tr class="separator:ab155a21325dd588b7e354d3a5b1ef564"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aea7152a77f9446960d65359e482595c4"><td class="memItemLeft" align="right" valign="top">virtual void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classPxArticulationReducedCoordinate.html#aea7152a77f9446960d65359e482595c4">setArticulationFlag</a> (<a class="el" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609">PxArticulationFlag::Enum</a> flag, bool value)=0</td></tr>
+<tr class="memdesc:aea7152a77f9446960d65359e482595c4"><td class="mdescLeft">&#160;</td><td class="mdescRight">Raises or clears a flag on the articulation.  <a href="#aea7152a77f9446960d65359e482595c4">More...</a><br /></td></tr>
+<tr class="separator:aea7152a77f9446960d65359e482595c4"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a3e8a26a6e87d34c31441d766b0d6a153"><td class="memItemLeft" align="right" valign="top">virtual <a class="el" href="group__physics.html#ga40a321fce7bcb9669d32fca769d2958a">PxArticulationFlags</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classPxArticulationReducedCoordinate.html#a3e8a26a6e87d34c31441d766b0d6a153">getArticulationFlags</a> () const =0</td></tr>
 <tr class="memdesc:a3e8a26a6e87d34c31441d766b0d6a153"><td class="mdescLeft">&#160;</td><td class="mdescRight">return PxArticulationFlags  <a href="#a3e8a26a6e87d34c31441d766b0d6a153">More...</a><br /></td></tr>
 <tr class="separator:a3e8a26a6e87d34c31441d766b0d6a153"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -128,6 +131,7 @@
 <tr class="memdesc:a99f7f83876a3414eb386c83e173f9364"><td class="mdescLeft">&#160;</td><td class="mdescRight">create an articulation cache  <a href="#a99f7f83876a3414eb386c83e173f9364">More...</a><br /></td></tr>
 <tr class="separator:a99f7f83876a3414eb386c83e173f9364"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a86107c6d84d3132b6a46bb9b2b48666e"><td class="memItemLeft" align="right" valign="top">virtual <a class="el" href="group__foundation.html#gacce5749db3dcfb916e98c253374264ed">PxU32</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classPxArticulationReducedCoordinate.html#a86107c6d84d3132b6a46bb9b2b48666e">getCacheDataSize</a> () const =0</td></tr>
+<tr class="memdesc:a86107c6d84d3132b6a46bb9b2b48666e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Get the size of the articulation cache.  <a href="#a86107c6d84d3132b6a46bb9b2b48666e">More...</a><br /></td></tr>
 <tr class="separator:a86107c6d84d3132b6a46bb9b2b48666e"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a919526ace70d5aaa4fb34ad0fcc98fd0"><td class="memItemLeft" align="right" valign="top">virtual void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classPxArticulationReducedCoordinate.html#a919526ace70d5aaa4fb34ad0fcc98fd0">zeroCache</a> (<a class="el" href="classPxArticulationCache.html">PxArticulationCache</a> &amp;cache)=0</td></tr>
 <tr class="memdesc:a919526ace70d5aaa4fb34ad0fcc98fd0"><td class="mdescLeft">&#160;</td><td class="mdescRight">zero all data in the articulation cache beside the cache version  <a href="#a919526ace70d5aaa4fb34ad0fcc98fd0">More...</a><br /></td></tr>
@@ -1021,6 +1025,9 @@ <h2 class="memtitle"><span class="permalink"><a href="#a86107c6d84d3132b6a46bb9b
 </table>
 </div><div class="memdoc">
 
+<p>Get the size of the articulation cache. </p>
+<dl class="section note"><dt>Note</dt><dd>this call may only be made on articulations that are in a scene, and may not be made during simulation </dd></dl>
+
 </div>
 </div>
 <a id="a96a853b57a61275d4c67828f34e3321d"></a>
@@ -1302,6 +1309,51 @@ <h2 class="memtitle"><span class="permalink"><a href="#a024c2e5000ad1033a868a73b
 </dl>
 <dl class="section see"><dt>See also</dt><dd><a class="el" href="classPxArticulationReducedCoordinate.html#abe5b98d8b80aa4781ad1d5e9baffad25" title="initialize all the common data for inverse dynamic ">commonInit</a> </dd></dl>
 
+</div>
+</div>
+<a id="aea7152a77f9446960d65359e482595c4"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aea7152a77f9446960d65359e482595c4">&#9670;&nbsp;</a></span>setArticulationFlag()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">virtual void PxArticulationReducedCoordinate::setArticulationFlag </td>
+          <td>(</td>
+          <td class="paramtype"><a class="el" href="structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609">PxArticulationFlag::Enum</a>&#160;</td>
+          <td class="paramname"><em>flag</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">bool&#160;</td>
+          <td class="paramname"><em>value</em>&#160;</td>
+        </tr>
+        <tr>
+          <td></td>
+          <td>)</td>
+          <td></td><td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">pure virtual</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Raises or clears a flag on the articulation. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+  <table class="params">
+    <tr><td class="paramdir">[in]</td><td class="paramname">flag</td><td>The articulation flag </td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">value</td><td>true/false indicating whether to raise or clear the flag </td></tr>
+  </table>
+  </dd>
+</dl>
+
 </div>
 </div>
 <a id="ab155a21325dd588b7e354d3a5b1ef564"></a>
@@ -1328,14 +1380,13 @@ <h2 class="memtitle"><span class="permalink"><a href="#ab155a21325dd588b7e354d3a
 </table>
 </div><div class="memdoc">
 
-<p>copy the internal data of the articulation to the cache </p>
+<p>Sets flags on the articulation. </p>
 <dl class="params"><dt>Parameters</dt><dd>
   <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">flags</td><td>this indicate whether the root link is fixed</td></tr>
+    <tr><td class="paramdir">[in]</td><td class="paramname">flags</td><td>Articulation flags </td></tr>
   </table>
   </dd>
 </dl>
-<dl class="section see"><dt>See also</dt><dd><a class="el" href="classPxArticulationReducedCoordinate.html#a99f7f83876a3414eb386c83e173f9364" title="create an articulation cache ">createCache</a> <a class="el" href="classPxArticulationReducedCoordinate.html#ab14e3d8440c6dc754fa3318a7c91991d" title="apply the user defined data in the cache to the articulation system ">applyCache</a> </dd></dl>
 
 </div>
 </div>
diff --git a/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate.js b/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate.js
index ac79c2b41..1fd5d9507 100644
--- a/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate.js
+++ b/physx/documentation/PhysXAPI/files/classPxArticulationReducedCoordinate.js
@@ -27,6 +27,7 @@ var classPxArticulationReducedCoordinate =
     [ "release", "classPxArticulationReducedCoordinate.html#a15d00fc95ae61c2906977453e7597d9b", null ],
     [ "releaseCache", "classPxArticulationReducedCoordinate.html#ae3b3c188ffd64c4ba7cfd7508efd63bf", null ],
     [ "removeLoopJoint", "classPxArticulationReducedCoordinate.html#a024c2e5000ad1033a868a73b6a025c03", null ],
+    [ "setArticulationFlag", "classPxArticulationReducedCoordinate.html#aea7152a77f9446960d65359e482595c4", null ],
     [ "setArticulationFlags", "classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564", null ],
     [ "teleportRootLink", "classPxArticulationReducedCoordinate.html#a2d067a41dc5bcbbdbf9a298d9064b310", null ],
     [ "unpackJointData", "classPxArticulationReducedCoordinate.html#a10d36a5ac6bb414b2240774cabdc4695", null ],
diff --git a/physx/documentation/PhysXAPI/files/classPxMeshQuery.html b/physx/documentation/PhysXAPI/files/classPxMeshQuery.html
index e8b0c239f..7a0cd709d 100644
--- a/physx/documentation/PhysXAPI/files/classPxMeshQuery.html
+++ b/physx/documentation/PhysXAPI/files/classPxMeshQuery.html
@@ -438,7 +438,9 @@ <h2 class="memtitle"><span class="permalink"><a href="#a8acb2a510cda8105fe1d9f84
   </table>
   </dd>
 </dl>
-<dl class="section note"><dt>Note</dt><dd>This function will flip the triangle normal whenever triGeom.scale.hasNegativeDeterminant() is true.</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>This function will flip the triangle normal whenever triGeom.scale.hasNegativeDeterminant() is true. </dd>
+<dd>
+TriangleIndex is an index used in internal format, which does have an index out of the bounds in last row. To traverse all tri indices in the HF, the following code can be applied: for (PxU32 row = 0; row &lt; (nbRows - 1); row++) { for (PxU32 col = 0; col &lt; (nbCols - 1); col++) { for (PxU32 k = 0; k &lt; 2; k++) { const PxU32 triIndex = 2 * (row*nbCols + col) + k; .... } } } </dd></dl>
 <dl class="section see"><dt>See also</dt><dd><a class="el" href="classPxTriangle.html" title="Triangle class. ">PxTriangle</a> PxTriangleFlags <a class="el" href="group__common.html#ga19403877bf7ce42d7240e4e4c758c016">PxTriangleID</a> <a class="el" href="classPxMeshQuery.html#a461e1f01530df9d99e0762ca20493c2d" title="Find the height field triangles which touch the specified geometry object. ">findOverlapHeightField()</a> </dd></dl>
 
 </div>
diff --git a/physx/documentation/PhysXAPI/files/classPxRigidDynamic-members.html b/physx/documentation/PhysXAPI/files/classPxRigidDynamic-members.html
index 5525c9466..59d6aeaa9 100644
--- a/physx/documentation/PhysXAPI/files/classPxRigidDynamic-members.html
+++ b/physx/documentation/PhysXAPI/files/classPxRigidDynamic-members.html
@@ -93,79 +93,77 @@
   <tr><td class="entry"><a class="el" href="classPxRigidBody.html#acb04ffc816d45afff2d04e93d7446e79">addTorque</a>(const PxVec3 &amp;torque, PxForceMode::Enum mode=PxForceMode::eFORCE, bool autowake=true)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#a022e098ea67bc8ec87f93c2f18a4db6f">attachShape</a>(PxShape &amp;shape)=0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
   <tr><td class="entry"><a class="el" href="classPxRigidBody.html#aefe5174aa160a07d488de2a18ba61f94">clearForce</a>(PxForceMode::Enum mode=PxForceMode::eFORCE)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#ae6df52fe5966a6b0f421735147b7bf6c">clearKinematicSurfaceVelocity</a>()=0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#ae251132385cc0b082b612d709aa4375d">clearTorque</a>(PxForceMode::Enum mode=PxForceMode::eFORCE)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#a467f9dc1517978b8d517936a475d3c4c">detachShape</a>(PxShape &amp;shape, bool wakeOnLostTouch=true)=0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxActor.html#afb92fb228de76bbcd42210d060b558d1">getActorFlags</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#aaedfdd3e9acc77afc2fc0ba51d968876">getAggregate</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#aaffbfa748d3a75e79a13dbca92aaa7ff">getAngularDamping</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#af91b92d6fcf47103b148337749aa93e0">getAngularVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxBase.html#ad9b10efae7733ff9c8b8d3b7d90a9a5f">getBaseFlags</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#aafdbdab1865112b15201aeabb23877b4">getCMassLocalPose</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxBase.html#ab4e0691a08e49631dc60014d49c7585f">getConcreteType</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527">getConcreteTypeName</a>() const</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#ac88777d5457f87247f5625094eb2be57">getConstraints</a>(PxConstraint **userBuffer, PxU32 bufferSize, PxU32 startIndex=0) const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#acc91baffa652090e523df5e6627686ed">getContactReportThreshold</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxActor.html#a24602fee489b3e797ff0f82613eef755">getDominanceGroup</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#a6eb5e3d590e6087c930f8141d40de722">getGlobalPose</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a238775889dd7a555d700fd8cd65623e9">getInternalIslandNodeIndex</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a4c82bd72a216c0460887cf94184560bd">getInvMass</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#a3b6ed86370cfd04958ca7e1c637e310b">getKinematicTarget</a>(PxTransform &amp;target) const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#ad7fb785dde2e8e0954db807c7ec69a26">getLinearDamping</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a04bab22deecb716e2cdd7a64b5cfaee7">getLinearVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a2b1c475ca9cc6aebc168ac58256b7284">getMass</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a063cce94190de44d86a15c1b49dd7f71">getMassSpaceInertiaTensor</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a63830b4cd24cd6e5e9f5bb2d6a1d3dc3">getMassSpaceInvInertiaTensor</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a8052320915132f7d7f72ede4543b64ad">getMaxAngularVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a2f9e9e7946ab265013229612aad4c3c7">getMaxContactImpulse</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a3304d26a39bbdd99dc0ad51d1d99aec4">getMaxDepenetrationVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a00f7fee01b5d357fdeb840ee4322c7a4">getMaxLinearVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#abae787a717c8468dbf67345dde14ba85">getMinCCDAdvanceCoefficient</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#a1ae86d992066567c5d6bca5fe2dc656e">getName</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#ad941cfa63351e7ff6095a4afbe2d9172">getNbConstraints</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#ae5c3534de94642c5980d29f4c49bc112">getNbShapes</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxActor.html#a660828fb83c91f0ada2ca43e06aa2b35">getOwnerClient</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a2ef3c66318db72f2dd2c1b0d20513f18">getRigidBodyFlags</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#aa88e055440d6c1cc846e32dbed017345">getRigidDynamicLockFlags</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#a2bf2e48b1960ef8e4fc1a437473bd7a3">getScene</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#a22ef86619d7a6de688b9ef2b0ad7bcec">getShapes</a>(PxShape **userBuffer, PxU32 bufferSize, PxU32 startIndex=0) const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#a1a0232d5a392e204e6b133fd2ab0879e">getSleepThreshold</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#a2fcdafcbc0dd5a691d502b2f7b03a7d7">getSolverIterationCounts</a>(PxU32 &amp;minPositionIters, PxU32 &amp;minVelocityIters) const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#a5f9d98c9bd3b9ab0be7ce1b3ec4a4574">getStabilizationThreshold</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxActor.html#a76a3489218f85ce9fe3ffb883ba8ce63">getType</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#a88dc31fa470c5f37e3e9683534ad83aa">getWakeCounter</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxActor.html#acab89ce3ee6d27b580063d131a417721">getWorldBounds</a>(float inflation=1.01f) const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#a4bc720adef86ea68bfb5845f0e104d23">is</a>()</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxBase.html#a85aa50ac0e5670a2be64411fea62b5e3">is</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1">isKindOf</a>(const char *name) const</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxBase.html#a38952fcb80e99f1f5dbf774a75ac2c3d">isReleasable</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#aaf537a84cd9b119ed5f77888c600f2ae">isSleeping</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxBase.html#aff7105f65905ecb9f6f7993ba28b712d">mBaseFlags</a></td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#a1abe9d19726e1f8f8e56abeb2269b5d4">mConcreteType</a></td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#aae00aa2067a2fe268b999aad04f27c28">putToSleep</a>()=0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#a05d86747e2fc14c89497fa06c8f0f252">PxActor</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxActor.html#a1780362a59125a7d22619937b2d7a511">PxActor</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#a263f1ef821c74058ce9335e06cb842a4">PxBase</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxBase.html#a58b979202e7382701f74977fe3dd5c6f">PxBase</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#aea61d8192d25e64cefbbdc5f6d68a39b">PxRigidActor</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#abc3690608611b90d1efd033e804fd774">PxRigidActor</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#adf18dc769f8d1d3da7760097fa699acb">PxRigidBody</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a14d4e7068063768f6029a975ff5d41e4">PxRigidBody</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064">PxRigidDynamic</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef">PxRigidDynamic</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#a1130f92afcba590b04b1e51b86f5377a">release</a>()=0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxActor.html#ae24ff3f3ed0cb2a138b382fd0720b94f">setActorFlag</a>(PxActorFlag::Enum flag, bool value)=0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#af60720e190324e8ff36281a2360c6043">setActorFlags</a>(PxActorFlags inFlags)=0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#adb8f357ce212cdf05250237073278d0c">setAngularDamping</a>(PxReal angDamp)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#ad49850630db14af26e019d2550ecfd27">setAngularVelocity</a>(const PxVec3 &amp;angVel, bool autowake=true)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxBase.html#a1dd20bb574075df904e1878f2e771ef9">setBaseFlag</a>(PxBaseFlag::Enum flag, bool value)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#afb391b692c561df0a54c413785e4ebd0">setBaseFlags</a>(PxBaseFlags inFlags)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#ab152773926fe7b222d61e982c3cb6adf">setCMassLocalPose</a>(const PxTransform &amp;pose)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#abfd510964f2287e81fe76a1a9e1725b5">setContactReportThreshold</a>(PxReal threshold)=0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxActor.html#a614c46687cf76eb219ce47927fc90824">setDominanceGroup</a>(PxDominanceGroup dominanceGroup)=0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#ae1cf856d4059ff6b422b2c58f91d242f">setForceAndTorque</a>(const PxVec3 &amp;force, const PxVec3 &amp;torque, PxForceMode::Enum mode=PxForceMode::eFORCE)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#a26994d9594ed9a04bd412bdeb2a55f3e">setGlobalPose</a>(const PxTransform &amp;pose, bool autowake=true)=0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#a7cfb0ebc9f4ae6ec44391b381fe9e371">setKinematicSurfaceVelocity</a>(const PxVec3 &amp;linearVelocity, const PxVec3 &amp;angularVelocity)=0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#ae251132385cc0b082b612d709aa4375d">clearTorque</a>(PxForceMode::Enum mode=PxForceMode::eFORCE)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#a467f9dc1517978b8d517936a475d3c4c">detachShape</a>(PxShape &amp;shape, bool wakeOnLostTouch=true)=0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#afb92fb228de76bbcd42210d060b558d1">getActorFlags</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxActor.html#aaedfdd3e9acc77afc2fc0ba51d968876">getAggregate</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#aaffbfa748d3a75e79a13dbca92aaa7ff">getAngularDamping</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#af91b92d6fcf47103b148337749aa93e0">getAngularVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#ad9b10efae7733ff9c8b8d3b7d90a9a5f">getBaseFlags</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#aafdbdab1865112b15201aeabb23877b4">getCMassLocalPose</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#ab4e0691a08e49631dc60014d49c7585f">getConcreteType</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527">getConcreteTypeName</a>() const</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#ac88777d5457f87247f5625094eb2be57">getConstraints</a>(PxConstraint **userBuffer, PxU32 bufferSize, PxU32 startIndex=0) const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#acc91baffa652090e523df5e6627686ed">getContactReportThreshold</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#a24602fee489b3e797ff0f82613eef755">getDominanceGroup</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#a6eb5e3d590e6087c930f8141d40de722">getGlobalPose</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a238775889dd7a555d700fd8cd65623e9">getInternalIslandNodeIndex</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a4c82bd72a216c0460887cf94184560bd">getInvMass</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#a3b6ed86370cfd04958ca7e1c637e310b">getKinematicTarget</a>(PxTransform &amp;target) const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#ad7fb785dde2e8e0954db807c7ec69a26">getLinearDamping</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a04bab22deecb716e2cdd7a64b5cfaee7">getLinearVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a2b1c475ca9cc6aebc168ac58256b7284">getMass</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a063cce94190de44d86a15c1b49dd7f71">getMassSpaceInertiaTensor</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a63830b4cd24cd6e5e9f5bb2d6a1d3dc3">getMassSpaceInvInertiaTensor</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a8052320915132f7d7f72ede4543b64ad">getMaxAngularVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a2f9e9e7946ab265013229612aad4c3c7">getMaxContactImpulse</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a3304d26a39bbdd99dc0ad51d1d99aec4">getMaxDepenetrationVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a00f7fee01b5d357fdeb840ee4322c7a4">getMaxLinearVelocity</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#abae787a717c8468dbf67345dde14ba85">getMinCCDAdvanceCoefficient</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxActor.html#a1ae86d992066567c5d6bca5fe2dc656e">getName</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#ad941cfa63351e7ff6095a4afbe2d9172">getNbConstraints</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#ae5c3534de94642c5980d29f4c49bc112">getNbShapes</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#a660828fb83c91f0ada2ca43e06aa2b35">getOwnerClient</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a2ef3c66318db72f2dd2c1b0d20513f18">getRigidBodyFlags</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#aa88e055440d6c1cc846e32dbed017345">getRigidDynamicLockFlags</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxActor.html#a2bf2e48b1960ef8e4fc1a437473bd7a3">getScene</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#a22ef86619d7a6de688b9ef2b0ad7bcec">getShapes</a>(PxShape **userBuffer, PxU32 bufferSize, PxU32 startIndex=0) const =0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#a1a0232d5a392e204e6b133fd2ab0879e">getSleepThreshold</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#a2fcdafcbc0dd5a691d502b2f7b03a7d7">getSolverIterationCounts</a>(PxU32 &amp;minPositionIters, PxU32 &amp;minVelocityIters) const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#a5f9d98c9bd3b9ab0be7ce1b3ec4a4574">getStabilizationThreshold</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#a76a3489218f85ce9fe3ffb883ba8ce63">getType</a>() const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#a88dc31fa470c5f37e3e9683534ad83aa">getWakeCounter</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#acab89ce3ee6d27b580063d131a417721">getWorldBounds</a>(float inflation=1.01f) const =0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxBase.html#a4bc720adef86ea68bfb5845f0e104d23">is</a>()</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#a85aa50ac0e5670a2be64411fea62b5e3">is</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1">isKindOf</a>(const char *name) const</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#a38952fcb80e99f1f5dbf774a75ac2c3d">isReleasable</a>() const</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#aaf537a84cd9b119ed5f77888c600f2ae">isSleeping</a>() const =0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#aff7105f65905ecb9f6f7993ba28b712d">mBaseFlags</a></td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxBase.html#a1abe9d19726e1f8f8e56abeb2269b5d4">mConcreteType</a></td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#aae00aa2067a2fe268b999aad04f27c28">putToSleep</a>()=0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxActor.html#a05d86747e2fc14c89497fa06c8f0f252">PxActor</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#a1780362a59125a7d22619937b2d7a511">PxActor</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxBase.html#a263f1ef821c74058ce9335e06cb842a4">PxBase</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#a58b979202e7382701f74977fe3dd5c6f">PxBase</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#aea61d8192d25e64cefbbdc5f6d68a39b">PxRigidActor</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#abc3690608611b90d1efd033e804fd774">PxRigidActor</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#adf18dc769f8d1d3da7760097fa699acb">PxRigidBody</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#a14d4e7068063768f6029a975ff5d41e4">PxRigidBody</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064">PxRigidDynamic</a>(PxType concreteType, PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef">PxRigidDynamic</a>(PxBaseFlags baseFlags)</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidActor.html#a1130f92afcba590b04b1e51b86f5377a">release</a>()=0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#ae24ff3f3ed0cb2a138b382fd0720b94f">setActorFlag</a>(PxActorFlag::Enum flag, bool value)=0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxActor.html#af60720e190324e8ff36281a2360c6043">setActorFlags</a>(PxActorFlags inFlags)=0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#adb8f357ce212cdf05250237073278d0c">setAngularDamping</a>(PxReal angDamp)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#ad49850630db14af26e019d2550ecfd27">setAngularVelocity</a>(const PxVec3 &amp;angVel, bool autowake=true)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxBase.html#a1dd20bb574075df904e1878f2e771ef9">setBaseFlag</a>(PxBaseFlag::Enum flag, bool value)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxBase.html#afb391b692c561df0a54c413785e4ebd0">setBaseFlags</a>(PxBaseFlags inFlags)</td><td class="entry"><a class="el" href="classPxBase.html">PxBase</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#ab152773926fe7b222d61e982c3cb6adf">setCMassLocalPose</a>(const PxTransform &amp;pose)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#abfd510964f2287e81fe76a1a9e1725b5">setContactReportThreshold</a>(PxReal threshold)=0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxActor.html#a614c46687cf76eb219ce47927fc90824">setDominanceGroup</a>(PxDominanceGroup dominanceGroup)=0</td><td class="entry"><a class="el" href="classPxActor.html">PxActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr><td class="entry"><a class="el" href="classPxRigidBody.html#ae1cf856d4059ff6b422b2c58f91d242f">setForceAndTorque</a>(const PxVec3 &amp;force, const PxVec3 &amp;torque, PxForceMode::Enum mode=PxForceMode::eFORCE)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classPxRigidActor.html#a26994d9594ed9a04bd412bdeb2a55f3e">setGlobalPose</a>(const PxTransform &amp;pose, bool autowake=true)=0</td><td class="entry"><a class="el" href="classPxRigidActor.html">PxRigidActor</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
   <tr><td class="entry"><a class="el" href="classPxRigidDynamic.html#a4464d188e7a1e94582c9cf35da9bbc93">setKinematicTarget</a>(const PxTransform &amp;destination)=0</td><td class="entry"><a class="el" href="classPxRigidDynamic.html">PxRigidDynamic</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="classPxRigidBody.html#ac81d09f36de3f2d1984709b559bb74ff">setLinearDamping</a>(PxReal linDamp)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
   <tr><td class="entry"><a class="el" href="classPxRigidBody.html#a0aed51d5ddcf81b09a104ad7f0f30c05">setLinearVelocity</a>(const PxVec3 &amp;linVel, bool autowake=true)=0</td><td class="entry"><a class="el" href="classPxRigidBody.html">PxRigidBody</a></td><td class="entry"><span class="mlabel">pure virtual</span></td></tr>
diff --git a/physx/documentation/PhysXAPI/files/classPxRigidDynamic.html b/physx/documentation/PhysXAPI/files/classPxRigidDynamic.html
index 93a4f0845..c2bd87651 100644
--- a/physx/documentation/PhysXAPI/files/classPxRigidDynamic.html
+++ b/physx/documentation/PhysXAPI/files/classPxRigidDynamic.html
@@ -117,12 +117,6 @@
 <tr class="memitem:a3b6ed86370cfd04958ca7e1c637e310b"><td class="memItemLeft" align="right" valign="top">virtual bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classPxRigidDynamic.html#a3b6ed86370cfd04958ca7e1c637e310b">getKinematicTarget</a> (<a class="el" href="classPxTransform.html">PxTransform</a> &amp;target) const =0</td></tr>
 <tr class="memdesc:a3b6ed86370cfd04958ca7e1c637e310b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Get target pose of a kinematically controlled dynamic actor.  <a href="#a3b6ed86370cfd04958ca7e1c637e310b">More...</a><br /></td></tr>
 <tr class="separator:a3b6ed86370cfd04958ca7e1c637e310b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a7cfb0ebc9f4ae6ec44391b381fe9e371"><td class="memItemLeft" align="right" valign="top">virtual void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classPxRigidDynamic.html#a7cfb0ebc9f4ae6ec44391b381fe9e371">setKinematicSurfaceVelocity</a> (const <a class="el" href="classPxVec3.html">PxVec3</a> &amp;linearVelocity, const <a class="el" href="classPxVec3.html">PxVec3</a> &amp;angularVelocity)=0</td></tr>
-<tr class="memdesc:a7cfb0ebc9f4ae6ec44391b381fe9e371"><td class="mdescLeft">&#160;</td><td class="mdescRight">Sets a surface velocity on a kinematic actor. This velocity is responded to through the solver but the kinematic actor will not move.  <a href="#a7cfb0ebc9f4ae6ec44391b381fe9e371">More...</a><br /></td></tr>
-<tr class="separator:a7cfb0ebc9f4ae6ec44391b381fe9e371"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae6df52fe5966a6b0f421735147b7bf6c"><td class="memItemLeft" align="right" valign="top">virtual void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classPxRigidDynamic.html#ae6df52fe5966a6b0f421735147b7bf6c">clearKinematicSurfaceVelocity</a> ()=0</td></tr>
-<tr class="memdesc:ae6df52fe5966a6b0f421735147b7bf6c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Clears kinematic surface velocity.  <a href="#ae6df52fe5966a6b0f421735147b7bf6c">More...</a><br /></td></tr>
-<tr class="separator:ae6df52fe5966a6b0f421735147b7bf6c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="inherit_header pub_methods_classPxRigidBody"><td colspan="2" onclick="javascript:toggleInherit('pub_methods_classPxRigidBody')"><img src="closed.png" alt="-"/>&#160;Public Member Functions inherited from <a class="el" href="classPxRigidBody.html">PxRigidBody</a></td></tr>
 <tr class="memitem:ab152773926fe7b222d61e982c3cb6adf inherit pub_methods_classPxRigidBody"><td class="memItemLeft" align="right" valign="top">virtual void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classPxRigidBody.html#ab152773926fe7b222d61e982c3cb6adf">setCMassLocalPose</a> (const <a class="el" href="classPxTransform.html">PxTransform</a> &amp;pose)=0</td></tr>
 <tr class="memdesc:ab152773926fe7b222d61e982c3cb6adf inherit pub_methods_classPxRigidBody"><td class="mdescLeft">&#160;</td><td class="mdescRight">Sets the pose of the center of mass relative to the actor.  <a href="classPxRigidBody.html#ab152773926fe7b222d61e982c3cb6adf">More...</a><br /></td></tr>
@@ -527,34 +521,6 @@ <h2 class="memtitle"><span class="permalink"><a href="#a9380b2f317161026b39c6a29
 </div>
 </div>
 <h2 class="groupheader">Member Function Documentation</h2>
-<a id="ae6df52fe5966a6b0f421735147b7bf6c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ae6df52fe5966a6b0f421735147b7bf6c">&#9670;&nbsp;</a></span>clearKinematicSurfaceVelocity()</h2>
-
-<div class="memitem">
-<div class="memproto">
-<table class="mlabels">
-  <tr>
-  <td class="mlabels-left">
-      <table class="memname">
-        <tr>
-          <td class="memname">virtual void PxRigidDynamic::clearKinematicSurfaceVelocity </td>
-          <td>(</td>
-          <td class="paramname"></td><td>)</td>
-          <td></td>
-        </tr>
-      </table>
-  </td>
-  <td class="mlabels-right">
-<span class="mlabels"><span class="mlabel">pure virtual</span></span>  </td>
-  </tr>
-</table>
-</div><div class="memdoc">
-
-<p>Clears kinematic surface velocity. </p>
-<p>Only has effect on kinematic actors. Will raise a warning if it is called on non-kinematic actors. </p>
-
-</div>
-</div>
 <a id="a43150714d9e16cd056149bad325f0527"></a>
 <h2 class="memtitle"><span class="permalink"><a href="#a43150714d9e16cd056149bad325f0527">&#9670;&nbsp;</a></span>getConcreteTypeName()</h2>
 
@@ -957,53 +923,6 @@ <h2 class="memtitle"><span class="permalink"><a href="#abfd510964f2287e81fe76a1a
 </dl>
 <dl class="section see"><dt>See also</dt><dd><a class="el" href="classPxRigidDynamic.html#acc91baffa652090e523df5e6627686ed" title="Retrieves the force threshold for contact reports. ">getContactReportThreshold</a> <a class="el" href="structPxPairFlag.html" title="Collection of flags describing the actions to take for a collision pair. ">PxPairFlag</a> </dd></dl>
 
-</div>
-</div>
-<a id="a7cfb0ebc9f4ae6ec44391b381fe9e371"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7cfb0ebc9f4ae6ec44391b381fe9e371">&#9670;&nbsp;</a></span>setKinematicSurfaceVelocity()</h2>
-
-<div class="memitem">
-<div class="memproto">
-<table class="mlabels">
-  <tr>
-  <td class="mlabels-left">
-      <table class="memname">
-        <tr>
-          <td class="memname">virtual void PxRigidDynamic::setKinematicSurfaceVelocity </td>
-          <td>(</td>
-          <td class="paramtype">const <a class="el" href="classPxVec3.html">PxVec3</a> &amp;&#160;</td>
-          <td class="paramname"><em>linearVelocity</em>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const <a class="el" href="classPxVec3.html">PxVec3</a> &amp;&#160;</td>
-          <td class="paramname"><em>angularVelocity</em>&#160;</td>
-        </tr>
-        <tr>
-          <td></td>
-          <td>)</td>
-          <td></td><td></td>
-        </tr>
-      </table>
-  </td>
-  <td class="mlabels-right">
-<span class="mlabels"><span class="mlabel">pure virtual</span></span>  </td>
-  </tr>
-</table>
-</div><div class="memdoc">
-
-<p>Sets a surface velocity on a kinematic actor. This velocity is responded to through the solver but the kinematic actor will not move. </p>
-<p>Only has effect on kinematic actors. Will raise a warning if it is called on non-kinematic actors.</p>
-<dl class="params"><dt>Parameters</dt><dd>
-  <table class="params">
-    <tr><td class="paramdir">[in]</td><td class="paramname">linearVelocity</td><td>The target linear velocity </td></tr>
-    <tr><td class="paramdir">[in]</td><td class="paramname">angularVelocity</td><td>The target angular velocity</td></tr>
-  </table>
-  </dd>
-</dl>
-<p>These velocities will be retained until either a kinematic target is set of clearKinematicSurfaceVelocity is set. </p>
-
 </div>
 </div>
 <a id="a4464d188e7a1e94582c9cf35da9bbc93"></a>
diff --git a/physx/documentation/PhysXAPI/files/classPxRigidDynamic.js b/physx/documentation/PhysXAPI/files/classPxRigidDynamic.js
index 834cebc9e..bdbaf49e3 100644
--- a/physx/documentation/PhysXAPI/files/classPxRigidDynamic.js
+++ b/physx/documentation/PhysXAPI/files/classPxRigidDynamic.js
@@ -3,7 +3,6 @@ var classPxRigidDynamic =
     [ "PxRigidDynamic", "classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064", null ],
     [ "PxRigidDynamic", "classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef", null ],
     [ "~PxRigidDynamic", "classPxRigidDynamic.html#a9380b2f317161026b39c6a296ba9e273", null ],
-    [ "clearKinematicSurfaceVelocity", "classPxRigidDynamic.html#ae6df52fe5966a6b0f421735147b7bf6c", null ],
     [ "getConcreteTypeName", "classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527", null ],
     [ "getContactReportThreshold", "classPxRigidDynamic.html#acc91baffa652090e523df5e6627686ed", null ],
     [ "getKinematicTarget", "classPxRigidDynamic.html#a3b6ed86370cfd04958ca7e1c637e310b", null ],
@@ -16,7 +15,6 @@ var classPxRigidDynamic =
     [ "isSleeping", "classPxRigidDynamic.html#aaf537a84cd9b119ed5f77888c600f2ae", null ],
     [ "putToSleep", "classPxRigidDynamic.html#aae00aa2067a2fe268b999aad04f27c28", null ],
     [ "setContactReportThreshold", "classPxRigidDynamic.html#abfd510964f2287e81fe76a1a9e1725b5", null ],
-    [ "setKinematicSurfaceVelocity", "classPxRigidDynamic.html#a7cfb0ebc9f4ae6ec44391b381fe9e371", null ],
     [ "setKinematicTarget", "classPxRigidDynamic.html#a4464d188e7a1e94582c9cf35da9bbc93", null ],
     [ "setRigidDynamicLockFlag", "classPxRigidDynamic.html#a7cf3e84117da3ec5b499262a3a9f5521", null ],
     [ "setRigidDynamicLockFlags", "classPxRigidDynamic.html#a16f9f0dfeae6e7877bcebca80df42c92", null ],
diff --git a/physx/documentation/PhysXAPI/files/functions_c.html b/physx/documentation/PhysXAPI/files/functions_c.html
index bcad99b02..ce649d1a3 100644
--- a/physx/documentation/PhysXAPI/files/functions_c.html
+++ b/physx/documentation/PhysXAPI/files/functions_c.html
@@ -123,9 +123,6 @@ <h3><a id="index_c"></a>- c -</h3><ul>
 <li>clearForce()
 : <a class="el" href="classPxRigidBody.html#aefe5174aa160a07d488de2a18ba61f94">PxRigidBody</a>
 </li>
-<li>clearKinematicSurfaceVelocity()
-: <a class="el" href="classPxRigidDynamic.html#ae6df52fe5966a6b0f421735147b7bf6c">PxRigidDynamic</a>
-</li>
 <li>clearTessFlag()
 : <a class="el" href="structPxHeightFieldSample.html#aa28e24b4763400eb5bd84378a23e086f">PxHeightFieldSample</a>
 </li>
@@ -428,7 +425,7 @@ <h3><a id="index_c"></a>- c -</h3><ul>
 : <a class="el" href="classPxScene.html#a285849425521dd0344f55b4a7c0a834a">PxScene</a>
 </li>
 <li>createCollection()
-: <a class="el" href="classPxCollectionExt.html#a39dbd32faa242e01117cd4ecee3fd2e7">PxCollectionExt</a>
+: <a class="el" href="classPxCollectionExt.html#ab3e2ad3cfe56c756d78d3a071a064ecf">PxCollectionExt</a>
 </li>
 <li>createCollectionFromBinary()
 : <a class="el" href="classPxSerialization.html#a6c64dc9a55f7ae9ffc8c1802855134ed">PxSerialization</a>
@@ -453,7 +450,7 @@ <h3><a id="index_c"></a>- c -</h3><ul>
 : <a class="el" href="classPxArticulation.html#a795fd4cd7bb40e05c9e9bfd6f0d73a70">PxArticulation</a>
 </li>
 <li>createExclusiveShape()
-: <a class="el" href="classPxRigidActorExt.html#ae708f5ca3fc5f1ce3db14032a89f62b9">PxRigidActorExt</a>
+: <a class="el" href="classPxRigidActorExt.html#a83136f2febcf60ac70f9da3664d1880f">PxRigidActorExt</a>
 </li>
 <li>createHeightField()
 : <a class="el" href="classPxCooking.html#a0a3d0ec6145be91dd6707535724d8ffb">PxCooking</a>
@@ -511,7 +508,7 @@ <h3><a id="index_c"></a>- c -</h3><ul>
 , <a class="el" href="classPxPhysics.html#a29dcbbe128cef11eaf4ce5696c216dfd">PxPhysics</a>
 </li>
 <li>cross()
-: <a class="el" href="structPxExtendedVec3.html#ac2c9851ce34c21f697801eead6e083e2">PxExtendedVec3</a>
+: <a class="el" href="structPxExtendedVec3.html#a9eb2896468ae9bb29e755ce9cabebc09">PxExtendedVec3</a>
 , <a class="el" href="classPxVec3.html#adc865d00a152264bd992dc0656ec0532">PxVec3</a>
 </li>
 <li>currPtr
diff --git a/physx/documentation/PhysXAPI/files/functions_e.html b/physx/documentation/PhysXAPI/files/functions_e.html
index d14533807..6f4f88965 100644
--- a/physx/documentation/PhysXAPI/files/functions_e.html
+++ b/physx/documentation/PhysXAPI/files/functions_e.html
@@ -1052,8 +1052,7 @@ <h3><a id="index_e"></a>- e -</h3><ul>
 : <a class="el" href="structPxVehicleTypes.html#a8e3dab2de8d47cbd9d22c8e27af4bacba55b0f71d4462431b819f71266e0044e6">PxVehicleTypes</a>
 </li>
 <li>eNONE
-: <a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6">PxArticulationJointDriveType</a>
-, <a class="el" href="structPxConstraintSolveHint.html#afa69114b7731ed70ff8db90bc171f97eaf766284eef4df1c7f5998017fbe767ef">PxConstraintSolveHint</a>
+: <a class="el" href="structPxConstraintSolveHint.html#afa69114b7731ed70ff8db90bc171f97eaf766284eef4df1c7f5998017fbe767ef">PxConstraintSolveHint</a>
 , <a class="el" href="structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9cab6869486b64cea747bc34cec4403c261">PxControllerDebugRenderFlag</a>
 , <a class="el" href="structPxConverterReportMode.html#aabe051209d39df379f39bbe45bf4f1a3a5ebeeface34f3fdc582dffcfb718ec74">PxConverterReportMode</a>
 , <a class="el" href="structPxPruningStructureType.html#a058454782f6ed1cc953b8d6561b636d9a667e19208a1fd5b2215e452ac9754b6e">PxPruningStructureType</a>
@@ -1477,9 +1476,6 @@ <h3><a id="index_e"></a>- e -</h3><ul>
 <li>eTARGET
 : <a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e">PxArticulationJointDriveType</a>
 </li>
-<li>eTARGETVELOCITY
-: <a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e">PxArticulationJointDriveType</a>
-</li>
 <li>eTEMPORAL_BV
 : <a class="el" href="structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9caebb20da3030ba4d86cc768d774fa21ae">PxControllerDebugRenderFlag</a>
 </li>
diff --git a/physx/documentation/PhysXAPI/files/functions_eval_e.html b/physx/documentation/PhysXAPI/files/functions_eval_e.html
index a1cb5871a..a422c5353 100644
--- a/physx/documentation/PhysXAPI/files/functions_eval_e.html
+++ b/physx/documentation/PhysXAPI/files/functions_eval_e.html
@@ -1040,8 +1040,7 @@ <h3><a id="index_e"></a>- e -</h3><ul>
 : <a class="el" href="structPxVehicleTypes.html#a8e3dab2de8d47cbd9d22c8e27af4bacba55b0f71d4462431b819f71266e0044e6">PxVehicleTypes</a>
 </li>
 <li>eNONE
-: <a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6">PxArticulationJointDriveType</a>
-, <a class="el" href="structPxConstraintSolveHint.html#afa69114b7731ed70ff8db90bc171f97eaf766284eef4df1c7f5998017fbe767ef">PxConstraintSolveHint</a>
+: <a class="el" href="structPxConstraintSolveHint.html#afa69114b7731ed70ff8db90bc171f97eaf766284eef4df1c7f5998017fbe767ef">PxConstraintSolveHint</a>
 , <a class="el" href="structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9cab6869486b64cea747bc34cec4403c261">PxControllerDebugRenderFlag</a>
 , <a class="el" href="structPxConverterReportMode.html#aabe051209d39df379f39bbe45bf4f1a3a5ebeeface34f3fdc582dffcfb718ec74">PxConverterReportMode</a>
 , <a class="el" href="structPxPruningStructureType.html#a058454782f6ed1cc953b8d6561b636d9a667e19208a1fd5b2215e452ac9754b6e">PxPruningStructureType</a>
@@ -1365,9 +1364,6 @@ <h3><a id="index_e"></a>- e -</h3><ul>
 <li>eTARGET
 : <a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e">PxArticulationJointDriveType</a>
 </li>
-<li>eTARGETVELOCITY
-: <a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e">PxArticulationJointDriveType</a>
-</li>
 <li>eTEMPORAL_BV
 : <a class="el" href="structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9caebb20da3030ba4d86cc768d774fa21ae">PxControllerDebugRenderFlag</a>
 </li>
diff --git a/physx/documentation/PhysXAPI/files/functions_func_c.html b/physx/documentation/PhysXAPI/files/functions_func_c.html
index d161badbc..870260f0d 100644
--- a/physx/documentation/PhysXAPI/files/functions_func_c.html
+++ b/physx/documentation/PhysXAPI/files/functions_func_c.html
@@ -111,9 +111,6 @@ <h3><a id="index_c"></a>- c -</h3><ul>
 <li>clearForce()
 : <a class="el" href="classPxRigidBody.html#aefe5174aa160a07d488de2a18ba61f94">PxRigidBody</a>
 </li>
-<li>clearKinematicSurfaceVelocity()
-: <a class="el" href="classPxRigidDynamic.html#ae6df52fe5966a6b0f421735147b7bf6c">PxRigidDynamic</a>
-</li>
 <li>clearTessFlag()
 : <a class="el" href="structPxHeightFieldSample.html#aa28e24b4763400eb5bd84378a23e086f">PxHeightFieldSample</a>
 </li>
@@ -273,7 +270,7 @@ <h3><a id="index_c"></a>- c -</h3><ul>
 : <a class="el" href="classPxScene.html#a285849425521dd0344f55b4a7c0a834a">PxScene</a>
 </li>
 <li>createCollection()
-: <a class="el" href="classPxCollectionExt.html#a39dbd32faa242e01117cd4ecee3fd2e7">PxCollectionExt</a>
+: <a class="el" href="classPxCollectionExt.html#ab3e2ad3cfe56c756d78d3a071a064ecf">PxCollectionExt</a>
 </li>
 <li>createCollectionFromBinary()
 : <a class="el" href="classPxSerialization.html#a6c64dc9a55f7ae9ffc8c1802855134ed">PxSerialization</a>
@@ -298,7 +295,7 @@ <h3><a id="index_c"></a>- c -</h3><ul>
 : <a class="el" href="classPxArticulation.html#a795fd4cd7bb40e05c9e9bfd6f0d73a70">PxArticulation</a>
 </li>
 <li>createExclusiveShape()
-: <a class="el" href="classPxRigidActorExt.html#ae708f5ca3fc5f1ce3db14032a89f62b9">PxRigidActorExt</a>
+: <a class="el" href="classPxRigidActorExt.html#a83136f2febcf60ac70f9da3664d1880f">PxRigidActorExt</a>
 </li>
 <li>createHeightField()
 : <a class="el" href="classPxCooking.html#a0a3d0ec6145be91dd6707535724d8ffb">PxCooking</a>
@@ -356,7 +353,7 @@ <h3><a id="index_c"></a>- c -</h3><ul>
 , <a class="el" href="classPxPhysics.html#a29dcbbe128cef11eaf4ce5696c216dfd">PxPhysics</a>
 </li>
 <li>cross()
-: <a class="el" href="structPxExtendedVec3.html#a77717e0c9d3089c740ca47499cb4663d">PxExtendedVec3</a>
+: <a class="el" href="structPxExtendedVec3.html#af3f6403e17d5c180a05206477fe6d879">PxExtendedVec3</a>
 , <a class="el" href="classPxVec3.html#adc865d00a152264bd992dc0656ec0532">PxVec3</a>
 </li>
 </ul>
diff --git a/physx/documentation/PhysXAPI/files/functions_func_p.html b/physx/documentation/PhysXAPI/files/functions_func_p.html
index 6700131f6..b8fcaa0f5 100644
--- a/physx/documentation/PhysXAPI/files/functions_func_p.html
+++ b/physx/documentation/PhysXAPI/files/functions_func_p.html
@@ -737,13 +737,13 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxVehicleWheelsDynData.html#a471098b8f71edd2aa6826422c245d091">PxVehicleWheelsDynData</a>
 </li>
 <li>PxVehicleWheelsSimData()
-: <a class="el" href="classPxVehicleWheelsSimData.html#a6739c902ff9828abceace89afd02fb81">PxVehicleWheelsSimData</a>
+: <a class="el" href="classPxVehicleWheelsSimData.html#a9fbfa747a9ef9fe0cc1deef89a5b1238">PxVehicleWheelsSimData</a>
 </li>
 <li>PxWheelQueryResult()
 : <a class="el" href="structPxWheelQueryResult.html#a161b38c47540b17befebc36c02c8e996">PxWheelQueryResult</a>
 </li>
 <li>PxXmlMiscParameter()
-: <a class="el" href="structPxSerialization_1_1PxXmlMiscParameter.html#aa23353942908b5783b0d7aebe4ce3205">PxSerialization::PxXmlMiscParameter</a>
+: <a class="el" href="structPxSerialization_1_1PxXmlMiscParameter.html#ab7f05bbfdf68b9cd2445508f40d9a6f1">PxSerialization::PxXmlMiscParameter</a>
 </li>
 </ul>
 </div><!-- contents -->
diff --git a/physx/documentation/PhysXAPI/files/functions_func_s.html b/physx/documentation/PhysXAPI/files/functions_func_s.html
index b2259d04b..b0bb4c8b7 100644
--- a/physx/documentation/PhysXAPI/files/functions_func_s.html
+++ b/physx/documentation/PhysXAPI/files/functions_func_s.html
@@ -174,6 +174,9 @@ <h3><a id="index_s"></a>- s -</h3><ul>
 <li>setAntiRollBarData()
 : <a class="el" href="classPxVehicleWheelsSimData.html#ae8706a8f39a630ba469414fb814fe206">PxVehicleWheelsSimData</a>
 </li>
+<li>setArticulationFlag()
+: <a class="el" href="classPxArticulationReducedCoordinate.html#aea7152a77f9446960d65359e482595c4">PxArticulationReducedCoordinate</a>
+</li>
 <li>setArticulationFlags()
 : <a class="el" href="classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564">PxArticulationReducedCoordinate</a>
 </li>
@@ -493,9 +496,6 @@ <h3><a id="index_s"></a>- s -</h3><ul>
 <li>setJointType()
 : <a class="el" href="classPxArticulationJointReducedCoordinate.html#ad6983182f5ea8c1e043c0f7e1ec3835b">PxArticulationJointReducedCoordinate</a>
 </li>
-<li>setKinematicSurfaceVelocity()
-: <a class="el" href="classPxRigidDynamic.html#a7cfb0ebc9f4ae6ec44391b381fe9e371">PxRigidDynamic</a>
-</li>
 <li>setKinematicTarget()
 : <a class="el" href="classPxRigidDynamic.html#a4464d188e7a1e94582c9cf35da9bbc93">PxRigidDynamic</a>
 </li>
diff --git a/physx/documentation/PhysXAPI/files/functions_h.html b/physx/documentation/PhysXAPI/files/functions_h.html
index 860a62aba..43229ed79 100644
--- a/physx/documentation/PhysXAPI/files/functions_h.html
+++ b/physx/documentation/PhysXAPI/files/functions_h.html
@@ -126,9 +126,6 @@ <h3><a id="index_h"></a>- h -</h3><ul>
 <li>hasNextPatch()
 : <a class="el" href="structPxContactStreamIterator.html#ac4823a352cbda39e7e434df43f5838f6">PxContactStreamIterator</a>
 </li>
-<li>hasSurfaceVelocity
-: <a class="el" href="structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f">PxSolverBodyData</a>
-</li>
 <li>heapCapacity
 : <a class="el" href="structPxgDynamicsMemoryConfig.html#a548423b552a06d77a3c0066d768a4dd8">PxgDynamicsMemoryConfig</a>
 </li>
@@ -137,14 +134,13 @@ <h3><a id="index_h"></a>- h -</h3><ul>
 , <a class="el" href="structPxHeightFieldSample.html#acfdb9fa8a4f56b5c3354cd552768a026">PxHeightFieldSample</a>
 </li>
 <li>heightField()
-: <a class="el" href="classPxGeometryHolder.html#a94cd64bea37591f56a230b22a8272042">PxGeometryHolder</a>
+: <a class="el" href="classPxGeometryHolder.html#aa93f73423d7bd5d0a528df4ba8c51ada">PxGeometryHolder</a>
 </li>
 <li>heightfield
 : <a class="el" href="classPxGeometryHolder.html#a13086e315b9865701c5d4c03031723ca">PxGeometryHolder</a>
 </li>
-<li>heightField()
-: <a class="el" href="classPxGeometryHolder.html#aa93f73423d7bd5d0a528df4ba8c51ada">PxGeometryHolder</a>
-, <a class="el" href="classPxHeightFieldGeometry.html#ade90caef2f95d56481d625ffba6ad918">PxHeightFieldGeometry</a>
+<li>heightField
+: <a class="el" href="classPxHeightFieldGeometry.html#ade90caef2f95d56481d625ffba6ad918">PxHeightFieldGeometry</a>
 </li>
 <li>heightFieldFlags
 : <a class="el" href="classPxHeightFieldGeometry.html#aa86c6a6354ec839a1300f7bd4e5141ea">PxHeightFieldGeometry</a>
diff --git a/physx/documentation/PhysXAPI/files/functions_p.html b/physx/documentation/PhysXAPI/files/functions_p.html
index a9a67e8aa..cf8caddaf 100644
--- a/physx/documentation/PhysXAPI/files/functions_p.html
+++ b/physx/documentation/PhysXAPI/files/functions_p.html
@@ -95,6 +95,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classphysx_1_1Gu_1_1ContactBuffer.html#a4f8fabbb2c9690d8fb30f8b6b1459294">physx::Gu::ContactBuffer</a>
 , <a class="el" href="structPxBatchQueryResult.html#ad0b4a79dc77732dc7bbe5c56f50bd4a6">PxBatchQueryResult&lt; HitType &gt;</a>
 , <a class="el" href="structPxRigidBodyData.html#a1bb93cacad36fcc17492afe83718f8c7">PxRigidBodyData</a>
+, <a class="el" href="structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7">PxSolverBodyData</a>
 , <a class="el" href="structPxSolverContactDesc.html#a1dd628330696744829667e0c883ed7be">PxSolverContactDesc</a>
 , <a class="el" href="classPxVehicleChassisData.html#a7e37b2a81f834c76e9b00bf9b7358421">PxVehicleChassisData</a>
 </li>
@@ -351,7 +352,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxBoxController.html#a5182b7d6bd38b10d31ecdc6453b0ecbe">PxBoxController</a>
 </li>
 <li>PxBoxControllerDesc()
-: <a class="el" href="group__character.html#gac1aba7d97780ef07724bbb1ec1a7514c">PxBoxControllerDesc</a>
+: <a class="el" href="group__character.html#ga5d8e29dacee8a0a6543029435c0d5dc6">PxBoxControllerDesc</a>
 </li>
 <li>PxBoxGeometry()
 : <a class="el" href="classPxBoxGeometry.html#aab5633a2c4280cc591bf9f13902405c2">PxBoxGeometry</a>
@@ -584,10 +585,10 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="structPxLocationHit.html#a9dce7a1e378082e86859350601591072">PxLocationHit</a>
 </li>
 <li>PxMassProperties()
-: <a class="el" href="classPxMassProperties.html#acc8eadc90e93df0e25d4dfb63988a596">PxMassProperties</a>
+: <a class="el" href="classPxMassProperties.html#a5a7eadca391b0cd981c0b35e08c82662">PxMassProperties</a>
 </li>
 <li>PxMat33()
-: <a class="el" href="classPxMat33.html#a92525fe88727f40493acfd0857680fc4">PxMat33</a>
+: <a class="el" href="classPxMat33.html#a3264dcb23397993b4a861b9e4ddf55ff">PxMat33</a>
 </li>
 <li>PxMat44()
 : <a class="el" href="classPxMat44.html#a7f9ea125461778278eb8718761d65870">PxMat44</a>
@@ -632,13 +633,13 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classphysx_1_1PxProfileScoped.html#aa3d0dccafd91ed3513dbf1d79f748b95">physx::PxProfileScoped</a>
 </li>
 <li>PxPruningStructure()
-: <a class="el" href="classPxPruningStructure.html#a034b569b8e1db318efcb97fa3dc03020">PxPruningStructure</a>
+: <a class="el" href="classPxPruningStructure.html#a2123cf6d0ec8f856c4b8cdf7c3693326">PxPruningStructure</a>
 </li>
 <li>PxQuat()
 : <a class="el" href="classPxQuat.html#afac7f63b57840965157cf720bb6faddc">PxQuat</a>
 </li>
 <li>PxQueryCache()
-: <a class="el" href="structPxQueryCache.html#aec86c57516de749a1e2b658a8cb5e5bd">PxQueryCache</a>
+: <a class="el" href="structPxQueryCache.html#a810ecb74536e940be1719d60aedfd3cd">PxQueryCache</a>
 </li>
 <li>PxQueryFilterData()
 : <a class="el" href="structPxQueryFilterData.html#a589e393317001f2effd196324ee459ed">PxQueryFilterData</a>
@@ -662,16 +663,16 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxRevoluteJoint.html#a298f032a11365475f16d6d2b42c5dc61">PxRevoluteJoint</a>
 </li>
 <li>PxRigidActor()
-: <a class="el" href="classPxRigidActor.html#abc3690608611b90d1efd033e804fd774">PxRigidActor</a>
+: <a class="el" href="classPxRigidActor.html#aea61d8192d25e64cefbbdc5f6d68a39b">PxRigidActor</a>
 </li>
 <li>PxRigidBody()
 : <a class="el" href="classPxRigidBody.html#adf18dc769f8d1d3da7760097fa699acb">PxRigidBody</a>
 </li>
 <li>PxRigidDynamic()
-: <a class="el" href="classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef">PxRigidDynamic</a>
+: <a class="el" href="classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064">PxRigidDynamic</a>
 </li>
 <li>PxRigidStatic()
-: <a class="el" href="classPxRigidStatic.html#a33f405d68b5ec1e8e60d0b361151645c">PxRigidStatic</a>
+: <a class="el" href="classPxRigidStatic.html#a7881e50c535f44a6d69f40af3baa7ffc">PxRigidStatic</a>
 </li>
 <li>PxScene()
 : <a class="el" href="classPxScene.html#a37b5f1aed7edbae16640e0555d9271ae">PxScene</a>
@@ -695,7 +696,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxSerializerDefaultAdapter.html#a3a59a446ea4fc277d0c224881c2f8658">PxSerializerDefaultAdapter&lt; T &gt;</a>
 </li>
 <li>PxShape()
-: <a class="el" href="classPxShape.html#aa7a1ba5182584faa6bddb15e4b3465c3">PxShape</a>
+: <a class="el" href="classPxShape.html#a74ff05bd6c8ac9db9fcae9be23c08d82">PxShape</a>
 </li>
 <li>PxSimpleTriangleMesh()
 : <a class="el" href="group__geomutils.html#ga9acd5763a76bcf9f41b42c3a30a6421a">PxSimpleTriangleMesh</a>
@@ -707,7 +708,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="structPxSolverBody.html#afa5845fa664df156d2a25300ea00593e">PxSolverBody</a>
 </li>
 <li>PxSphereGeometry()
-: <a class="el" href="classPxSphereGeometry.html#af190d00c7a0b5748e8159eba87cdb23a">PxSphereGeometry</a>
+: <a class="el" href="classPxSphereGeometry.html#ab0c426a8db79da6808e364de8d213c28">PxSphereGeometry</a>
 </li>
 <li>PxSphericalJoint()
 : <a class="el" href="classPxSphericalJoint.html#ab06e4205a73a38b86f803b33596ff2b0">PxSphericalJoint</a>
@@ -719,7 +720,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="structPxStridedData.html#af327f6129229e98cb13b949b3fa2a26d">PxStridedData</a>
 </li>
 <li>PxStrideIterator()
-: <a class="el" href="classPxStrideIterator.html#a83f07f56ecd22231423a7a6115dcec5f">PxStrideIterator&lt; T &gt;</a>
+: <a class="el" href="classPxStrideIterator.html#a47b65f7b217b7b58e41c60eba2036c6a">PxStrideIterator&lt; T &gt;</a>
 </li>
 <li>PxSweepBufferN()
 : <a class="el" href="structPxSweepBufferN.html#ab8be2772b4fe811563efc01824201a61">PxSweepBufferN&lt; N &gt;</a>
@@ -740,13 +741,13 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="group__common.html#gaf2bc9f0c0e1ee44a548900a13bb1136c">PxTolerancesScale</a>
 </li>
 <li>PxTransform()
-: <a class="el" href="classPxTransform.html#acb9832826ac6c8d150200524f7e7324d">PxTransform</a>
+: <a class="el" href="classPxTransform.html#a94418383f43ad184a40806d343858f80">PxTransform</a>
 </li>
 <li>PxTriangle()
-: <a class="el" href="classPxTriangle.html#aaf7b90f828324f6ec1d48ef6b1776728">PxTriangle</a>
+: <a class="el" href="classPxTriangle.html#a4c5c6f1913b2e0c3bb2fc82a5d3a5971">PxTriangle</a>
 </li>
 <li>PxTriangleMesh()
-: <a class="el" href="classPxTriangleMesh.html#aadd4279bc98db8ec4d1da6c1485fcb07">PxTriangleMesh</a>
+: <a class="el" href="classPxTriangleMesh.html#ab6401ba6b8f477e3c95a79da42f4e3b0">PxTriangleMesh</a>
 </li>
 <li>PxTriangleMeshDesc()
 : <a class="el" href="group__cooking.html#ga15d3050ac143dfb67ad43a8682c7b569">PxTriangleMeshDesc</a>
@@ -761,28 +762,28 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="structPxTypedStridedData.html#a00c6a63c63d01bdca92b06f60e6909bb">PxTypedStridedData&lt; TDataType &gt;</a>
 </li>
 <li>PxVec2()
-: <a class="el" href="classPxVec2.html#a96f72810f06447c2ad7a99889d718298">PxVec2</a>
+: <a class="el" href="classPxVec2.html#a4b60f4c08b8019e8c28967ed33d436ce">PxVec2</a>
 </li>
 <li>PxVec3()
-: <a class="el" href="classPxVec3.html#a639685f39203ac441edd2158ef1a1549">PxVec3</a>
+: <a class="el" href="classPxVec3.html#a864f52fe3c9c9f72f5796a7cb64b2628">PxVec3</a>
 </li>
 <li>PxVec4()
-: <a class="el" href="classPxVec4.html#a719a2783335631f273eb46c6f4a61925">PxVec4</a>
+: <a class="el" href="classPxVec4.html#afd19aec6c7311f6e2fd21b13cda157bd">PxVec4</a>
 </li>
 <li>PxVehicleAckermannGeometryData()
-: <a class="el" href="classPxVehicleAckermannGeometryData.html#a6d15ee641598033fef887c399f547422">PxVehicleAckermannGeometryData</a>
+: <a class="el" href="classPxVehicleAckermannGeometryData.html#a67df50348629357c2034e8517817e896">PxVehicleAckermannGeometryData</a>
 </li>
 <li>PxVehicleAntiRollBarData()
 : <a class="el" href="classPxVehicleAntiRollBarData.html#ae926b0985fe5243a1e671405d19e9dac">PxVehicleAntiRollBarData</a>
 </li>
 <li>PxVehicleAutoBoxData()
-: <a class="el" href="classPxVehicleAutoBoxData.html#a4f2313daab10500b71271d46ebbd0fc5">PxVehicleAutoBoxData</a>
+: <a class="el" href="classPxVehicleAutoBoxData.html#a7cb4c73cc1997ba805a79cb72f230d36">PxVehicleAutoBoxData</a>
 </li>
 <li>PxVehicleChassisData()
 : <a class="el" href="classPxVehicleChassisData.html#aedff02e1e05309618636a393743265f4">PxVehicleChassisData</a>
 </li>
 <li>PxVehicleClutchData()
-: <a class="el" href="classPxVehicleClutchData.html#a5f50479c769bfc6ae880b92036abce8d">PxVehicleClutchData</a>
+: <a class="el" href="classPxVehicleClutchData.html#a963c71a977c0bfbb465fd75d1e106dc0">PxVehicleClutchData</a>
 </li>
 <li>PxVehicleConcurrentUpdateData()
 : <a class="el" href="structPxVehicleConcurrentUpdateData.html#a66a66af954cf0dc1f5fc2eec992de647">PxVehicleConcurrentUpdateData</a>
@@ -797,7 +798,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxVehicleDifferential4WData.html#a87b843efc25143193fea2e7112aadc41">PxVehicleDifferential4WData</a>
 </li>
 <li>PxVehicleDifferentialNWData()
-: <a class="el" href="classPxVehicleDifferentialNWData.html#aa363bb07cd4533cfee8a59bc14d03c10">PxVehicleDifferentialNWData</a>
+: <a class="el" href="classPxVehicleDifferentialNWData.html#adf2433f5f82e770290a30dfe1eda3a81">PxVehicleDifferentialNWData</a>
 </li>
 <li>PxVehicleDrivableSurfaceToTireFrictionPairs()
 : <a class="el" href="classPxVehicleDrivableSurfaceToTireFrictionPairs.html#a6838139851fe46e93a55cc3fb4f55239">PxVehicleDrivableSurfaceToTireFrictionPairs</a>
@@ -816,7 +817,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxVehicleDrive4WRawInputData.html#af897772ee3457a2100945890f3d025ef">PxVehicleDrive4WRawInputData</a>
 </li>
 <li>PxVehicleDriveDynData()
-: <a class="el" href="classPxVehicleDriveDynData.html#a5395d429ef9607844d63447c2b89c146">PxVehicleDriveDynData</a>
+: <a class="el" href="classPxVehicleDriveDynData.html#af38233e891d581d123bcae2724d57dc7">PxVehicleDriveDynData</a>
 </li>
 <li>PxVehicleDriveNW()
 : <a class="el" href="classPxVehicleDriveNW.html#a71ec3a0d4cda296f589ea83179dd7a24">PxVehicleDriveNW</a>
@@ -836,7 +837,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxVehicleAckermannGeometryData.html#abe694ed054b9f8747c22bcd3fe251b98">PxVehicleAckermannGeometryData</a>
 , <a class="el" href="classPxVehicleChassisData.html#abe694ed054b9f8747c22bcd3fe251b98">PxVehicleChassisData</a>
 , <a class="el" href="classPxVehicleDifferential4WData.html#abe694ed054b9f8747c22bcd3fe251b98">PxVehicleDifferential4WData</a>
-, <a class="el" href="classPxVehicleDriveSimData4W.html#ab79ed54dbd9ead2aeef21ee3c4e270a9">PxVehicleDriveSimData4W</a>
+, <a class="el" href="classPxVehicleDriveSimData4W.html#adb99a930a7426f4835c48574ee77e947">PxVehicleDriveSimData4W</a>
 </li>
 <li>PxVehicleDriveSimDataNW
 : <a class="el" href="classPxVehicleDifferentialNWData.html#a966a53de4d7949668cafafe064045841">PxVehicleDifferentialNWData</a>
@@ -844,7 +845,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 </li>
 <li>PxVehicleDriveTank
 : <a class="el" href="classPxVehicleDriveSimData.html#a384e496b32aa3c624ceed503417f3595">PxVehicleDriveSimData</a>
-, <a class="el" href="classPxVehicleDriveTank.html#a1d3db837be4782115495f7e5bfcd0a29">PxVehicleDriveTank</a>
+, <a class="el" href="classPxVehicleDriveTank.html#aaf73a5da1f780e4f37b669dfb3e1dd24">PxVehicleDriveTank</a>
 , <a class="el" href="classPxVehicleWheelsDynData.html#a384e496b32aa3c624ceed503417f3595">PxVehicleWheelsDynData</a>
 , <a class="el" href="classPxVehicleWheelsSimData.html#a384e496b32aa3c624ceed503417f3595">PxVehicleWheelsSimData</a>
 </li>
@@ -855,7 +856,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxVehicleEngineData.html#aa24a9523ad4a01e87df3a6d1d7c168a1">PxVehicleEngineData</a>
 </li>
 <li>PxVehicleGearsData()
-: <a class="el" href="classPxVehicleGearsData.html#a197ff974ac25dfbad8c8f323a74a8a0d">PxVehicleGearsData</a>
+: <a class="el" href="classPxVehicleGearsData.html#aaa179d92d69320b45210a1f1bc851c59">PxVehicleGearsData</a>
 </li>
 <li>PxVehicleNoDrive()
 : <a class="el" href="classPxVehicleNoDrive.html#a6ba06186cf9355dd9198beef75352afc">PxVehicleNoDrive</a>
@@ -868,7 +869,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxVehicleTireData.html#a4f7cde0fb3943e130fe35a786f649d1d">PxVehicleTireData</a>
 </li>
 <li>PxVehicleTireLoadFilterData()
-: <a class="el" href="classPxVehicleTireLoadFilterData.html#a4b81e279ec039f3da510ba6939ccdf1b">PxVehicleTireLoadFilterData</a>
+: <a class="el" href="classPxVehicleTireLoadFilterData.html#ab856f54f8e45dc86b9b1a396ccfa05ff">PxVehicleTireLoadFilterData</a>
 </li>
 <li>PxVehicleUpdate
 : <a class="el" href="structPxVehicleConcurrentUpdateData.html#aa960a335429c764ff7e258a0ec3ab5f0">PxVehicleConcurrentUpdateData</a>
@@ -890,7 +891,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classPxVehicleWheelData.html#a0041dd83bf7afcd53489fa1fa9727496">PxVehicleWheelData</a>
 </li>
 <li>PxVehicleWheels()
-: <a class="el" href="classPxVehicleWheels.html#afb89c7e2d53b98c310d59443213fff80">PxVehicleWheels</a>
+: <a class="el" href="classPxVehicleWheels.html#a0a75ee48309f1052d10acdc9b727ab06">PxVehicleWheels</a>
 , <a class="el" href="classPxVehicleWheelsDynData.html#ab770b7d25d1a8fee9b2a5ea18d57f345">PxVehicleWheelsDynData</a>
 , <a class="el" href="classPxVehicleWheelsSimData.html#ab770b7d25d1a8fee9b2a5ea18d57f345">PxVehicleWheelsSimData</a>
 </li>
@@ -905,7 +906,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 <li>PxVehicleWheelsSimData
 : <a class="el" href="classPxVehicleAntiRollBarData.html#acecdfc4ceea29cbb9a3fd946c3848bf4">PxVehicleAntiRollBarData</a>
 , <a class="el" href="classPxVehicleTireLoadFilterData.html#acecdfc4ceea29cbb9a3fd946c3848bf4">PxVehicleTireLoadFilterData</a>
-, <a class="el" href="classPxVehicleWheelsSimData.html#a9fbfa747a9ef9fe0cc1deef89a5b1238">PxVehicleWheelsSimData</a>
+, <a class="el" href="classPxVehicleWheelsSimData.html#a6739c902ff9828abceace89afd02fb81">PxVehicleWheelsSimData</a>
 </li>
 <li>PxWheelQueryResult()
 : <a class="el" href="structPxWheelQueryResult.html#a161b38c47540b17befebc36c02c8e996">PxWheelQueryResult</a>
diff --git a/physx/documentation/PhysXAPI/files/functions_s.html b/physx/documentation/PhysXAPI/files/functions_s.html
index 7e9099d9b..a41fb8073 100644
--- a/physx/documentation/PhysXAPI/files/functions_s.html
+++ b/physx/documentation/PhysXAPI/files/functions_s.html
@@ -220,6 +220,9 @@ <h3><a id="index_s"></a>- s -</h3><ul>
 <li>setAntiRollBarData()
 : <a class="el" href="classPxVehicleWheelsSimData.html#ae8706a8f39a630ba469414fb814fe206">PxVehicleWheelsSimData</a>
 </li>
+<li>setArticulationFlag()
+: <a class="el" href="classPxArticulationReducedCoordinate.html#aea7152a77f9446960d65359e482595c4">PxArticulationReducedCoordinate</a>
+</li>
 <li>setArticulationFlags()
 : <a class="el" href="classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564">PxArticulationReducedCoordinate</a>
 </li>
@@ -539,9 +542,6 @@ <h3><a id="index_s"></a>- s -</h3><ul>
 <li>setJointType()
 : <a class="el" href="classPxArticulationJointReducedCoordinate.html#ad6983182f5ea8c1e043c0f7e1ec3835b">PxArticulationJointReducedCoordinate</a>
 </li>
-<li>setKinematicSurfaceVelocity()
-: <a class="el" href="classPxRigidDynamic.html#a7cfb0ebc9f4ae6ec44391b381fe9e371">PxRigidDynamic</a>
-</li>
 <li>setKinematicTarget()
 : <a class="el" href="classPxRigidDynamic.html#a4464d188e7a1e94582c9cf35da9bbc93">PxRigidDynamic</a>
 </li>
diff --git a/physx/documentation/PhysXAPI/files/functions_vars_h.html b/physx/documentation/PhysXAPI/files/functions_vars_h.html
index 8092836a3..284cffd7b 100644
--- a/physx/documentation/PhysXAPI/files/functions_vars_h.html
+++ b/physx/documentation/PhysXAPI/files/functions_vars_h.html
@@ -111,9 +111,6 @@ <h3><a id="index_h"></a>- h -</h3><ul>
 <li>hasMaxImpulse
 : <a class="el" href="structPxSolverContactDesc.html#ac19bc948958e2f2de3ca214afad28b11">PxSolverContactDesc</a>
 </li>
-<li>hasSurfaceVelocity
-: <a class="el" href="structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f">PxSolverBodyData</a>
-</li>
 <li>heapCapacity
 : <a class="el" href="structPxgDynamicsMemoryConfig.html#a548423b552a06d77a3c0066d768a4dd8">PxgDynamicsMemoryConfig</a>
 </li>
diff --git a/physx/documentation/PhysXAPI/files/functions_vars_p.html b/physx/documentation/PhysXAPI/files/functions_vars_p.html
index 774e0a232..912f39c88 100644
--- a/physx/documentation/PhysXAPI/files/functions_vars_p.html
+++ b/physx/documentation/PhysXAPI/files/functions_vars_p.html
@@ -92,6 +92,7 @@ <h3><a id="index_p"></a>- p -</h3><ul>
 : <a class="el" href="classphysx_1_1Gu_1_1ContactBuffer.html#a4f8fabbb2c9690d8fb30f8b6b1459294">physx::Gu::ContactBuffer</a>
 , <a class="el" href="structPxBatchQueryResult.html#ad0b4a79dc77732dc7bbe5c56f50bd4a6">PxBatchQueryResult&lt; HitType &gt;</a>
 , <a class="el" href="structPxRigidBodyData.html#a1bb93cacad36fcc17492afe83718f8c7">PxRigidBodyData</a>
+, <a class="el" href="structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7">PxSolverBodyData</a>
 , <a class="el" href="structPxSolverContactDesc.html#a1dd628330696744829667e0c883ed7be">PxSolverContactDesc</a>
 , <a class="el" href="classPxVehicleChassisData.html#a7e37b2a81f834c76e9b00bf9b7358421">PxVehicleChassisData</a>
 </li>
diff --git a/physx/documentation/PhysXAPI/files/group__physics.js b/physx/documentation/PhysXAPI/files/group__physics.js
index 7bc5e6cda..f9de6f88b 100644
--- a/physx/documentation/PhysXAPI/files/group__physics.js
+++ b/physx/documentation/PhysXAPI/files/group__physics.js
@@ -129,10 +129,8 @@ var group__physics =
     ] ],
     [ "PxArticulationJointDriveType", "structPxArticulationJointDriveType.html", [
       [ "Enum", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2", [
-        [ "eNONE", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6", null ],
         [ "eTARGET", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e", null ],
-        [ "eERROR", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372", null ],
-        [ "eTARGETVELOCITY", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e", null ]
+        [ "eERROR", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372", null ]
       ] ]
     ] ],
     [ "PxArticulationAxis", "structPxArticulationAxis.html", [
@@ -318,6 +316,7 @@ var group__physics =
       [ "release", "classPxArticulationReducedCoordinate.html#a15d00fc95ae61c2906977453e7597d9b", null ],
       [ "releaseCache", "classPxArticulationReducedCoordinate.html#ae3b3c188ffd64c4ba7cfd7508efd63bf", null ],
       [ "removeLoopJoint", "classPxArticulationReducedCoordinate.html#a024c2e5000ad1033a868a73b6a025c03", null ],
+      [ "setArticulationFlag", "classPxArticulationReducedCoordinate.html#aea7152a77f9446960d65359e482595c4", null ],
       [ "setArticulationFlags", "classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564", null ],
       [ "teleportRootLink", "classPxArticulationReducedCoordinate.html#a2d067a41dc5bcbbdbf9a298d9064b310", null ],
       [ "unpackJointData", "classPxArticulationReducedCoordinate.html#a10d36a5ac6bb414b2240774cabdc4695", null ],
@@ -856,7 +855,6 @@ var group__physics =
       [ "PxRigidDynamic", "classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064", null ],
       [ "PxRigidDynamic", "classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef", null ],
       [ "~PxRigidDynamic", "classPxRigidDynamic.html#a9380b2f317161026b39c6a296ba9e273", null ],
-      [ "clearKinematicSurfaceVelocity", "classPxRigidDynamic.html#ae6df52fe5966a6b0f421735147b7bf6c", null ],
       [ "getConcreteTypeName", "classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527", null ],
       [ "getContactReportThreshold", "classPxRigidDynamic.html#acc91baffa652090e523df5e6627686ed", null ],
       [ "getKinematicTarget", "classPxRigidDynamic.html#a3b6ed86370cfd04958ca7e1c637e310b", null ],
@@ -869,7 +867,6 @@ var group__physics =
       [ "isSleeping", "classPxRigidDynamic.html#aaf537a84cd9b119ed5f77888c600f2ae", null ],
       [ "putToSleep", "classPxRigidDynamic.html#aae00aa2067a2fe268b999aad04f27c28", null ],
       [ "setContactReportThreshold", "classPxRigidDynamic.html#abfd510964f2287e81fe76a1a9e1725b5", null ],
-      [ "setKinematicSurfaceVelocity", "classPxRigidDynamic.html#a7cfb0ebc9f4ae6ec44391b381fe9e371", null ],
       [ "setKinematicTarget", "classPxRigidDynamic.html#a4464d188e7a1e94582c9cf35da9bbc93", null ],
       [ "setRigidDynamicLockFlag", "classPxRigidDynamic.html#a7cf3e84117da3ec5b499262a3a9f5521", null ],
       [ "setRigidDynamicLockFlags", "classPxRigidDynamic.html#a16f9f0dfeae6e7877bcebca80df42c92", null ],
diff --git a/physx/documentation/PhysXAPI/files/navtreedata.js b/physx/documentation/PhysXAPI/files/navtreedata.js
index 01e612ee6..ec10fbe23 100644
--- a/physx/documentation/PhysXAPI/files/navtreedata.js
+++ b/physx/documentation/PhysXAPI/files/navtreedata.js
@@ -61,27 +61,27 @@ var NAVTREEINDEX =
 "GuContactBuffer_8h.html",
 "PxIO_8h.html",
 "PxWindowsIntrinsics_8h.html#a5d4ea7807df2fcf1968275bc06e79319",
-"classPxArticulationReducedCoordinate.html#afc82b045f2afb76484ea967515c01067",
-"classPxContactSet.html#a6b38910a19843d40816bc7807c482bff",
-"classPxDeserializationContext.html#a5c9fff58c41cd679b10359b836851263",
-"classPxJoint.html#a1da1a62c8aa565a92ff25c83825d1657",
-"classPxObstacleContext.html#aaf653d99de6f092896cc1baa2a4e99c2",
-"classPxRigidBody.html#aafdbdab1865112b15201aeabb23877b4",
-"classPxSerialization.html#a6c64dc9a55f7ae9ffc8c1802855134ed",
-"classPxTriangleMesh.html#a261ef2fa9709a163408b41d2c34dda71",
-"classPxVehicleDrive.html#a255e6465bfe1f298cb22f24ede46869c",
-"classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a981cc327b2fa9709f58cea31945e610c",
-"classPxVehicleWheelsSimData.html#acd51a38c9e5cf37b1c0b5feed7b03c04",
-"globals_func_b.html",
-"group__extensions.html#gaab84d6896f2bf7d17487799f5135cd7c",
+"classPxArticulationReducedCoordinate.html#af8b5217bd48e6d1ccc8ba3e495c66df7",
+"classPxContactSet.html#a6535e80819a6d2a5bbe38db6aa86684e",
+"classPxDeserializationContext.html#a393b7ca347c967d330791d19b2e37006",
+"classPxJoint.html#a1cfe86fe5c8131cea1b9b5ff9df7a014",
+"classPxObstacleContext.html#a9390f860fb489f283c9f872af2777123",
+"classPxRigidBody.html#a9e6f3afd71605e037a5de47955d941e0",
+"classPxSerialization.html#a84d55e6e21727087653a70cc20557765",
+"classPxTriangleMesh.html#a611c8453a6b6256a63f1450aa8cf237a",
+"classPxVehicleDrive.html#a2af80cf3e4d767f1710a8f2e244cb9d5",
+"classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7aa1c63bf522cfa13f31b1432c97ea5cac",
+"classPxVehicleWheelsSimData.html#acefb8071acd7ce982e2eeee33ef758da",
+"globals_func_c.html",
+"group__extensions.html#gaade73cffb9e3e77ae5457c3d393d481e",
 "group__foundation.html#ga7c72d4a195ff9acade3d092124a45c53",
-"group__geomutils.html#ga9bff69ddd57aac22aad966a4b30983e7",
+"group__geomutils.html#gaa4aeb7bd1ccfedd348599061276f6d8e",
 "group__vehicle.html#ga946f4d23cff6fdf36dca694641e20779",
-"structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427a91151f48e43ade5a5d7153f1b3160696",
-"structPxConvexMeshCookingResult.html#ac1c7b1cbfc80b0af00bec95faf1ebf42affbc9585c9485ff2719bbfdd27df3b90",
-"structPxHitCallback.html#a9c6fc25eb66c22dad2a35d5630d4ec68",
-"structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596",
-"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a7e60ef3f98ed6b6a2d76920c75a968c7"
+"structPxConstraintInfo.html#a073f7587ffb496c4bdc470eb34032fd2",
+"structPxConvexMeshCookingType.html#a5a2ae723aca74c185675cd7ba2c9c115a18916173251aae02128c79b69af46906",
+"structPxHitFlag.html",
+"structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596a983c62745e575428d8b0aadefc23e8bc",
+"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7aa54dcdaac81056baa6b9257d27df1b83"
 ];
 
 var SYNCONMSG = 'click to disable panel synchronisation';
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex10.js b/physx/documentation/PhysXAPI/files/navtreeindex10.js
index afc48594c..6ccdd8da6 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex10.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex10.js
@@ -1,6 +1,5 @@
 var NAVTREEINDEX10 =
 {
-"classPxTriangleMesh.html#a261ef2fa9709a163408b41d2c34dda71":[2,7,29,13],
 "classPxTriangleMesh.html#a611c8453a6b6256a63f1450aa8cf237a":[2,7,29,5],
 "classPxTriangleMesh.html#a6b36d40812d5cb70507472cd4e147ae0":[2,7,29,11],
 "classPxTriangleMesh.html#a984f1536be90cc0581052c035b4a424d":[2,7,29,3],
@@ -249,5 +248,6 @@ var NAVTREEINDEX10 =
 "classPxVehicleDrivableSurfaceToTireFrictionPairs.html#aecff4e0404b9c0458f701754a9a7e1bb":[2,11,33,3],
 "classPxVehicleDrive.html":[2,11,16],
 "classPxVehicleDrive.html#a02bbb8b1aad17272d533b7b3a41f016f":[2,11,16,12],
-"classPxVehicleDrive.html#a251914d9b502687f2778e3f096dd0578":[2,11,16,7]
+"classPxVehicleDrive.html#a251914d9b502687f2778e3f096dd0578":[2,11,16,7],
+"classPxVehicleDrive.html#a255e6465bfe1f298cb22f24ede46869c":[2,11,16,8]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex11.js b/physx/documentation/PhysXAPI/files/navtreeindex11.js
index 1f7008317..5fc117c8d 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex11.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex11.js
@@ -1,6 +1,5 @@
 var NAVTREEINDEX11 =
 {
-"classPxVehicleDrive.html#a255e6465bfe1f298cb22f24ede46869c":[2,11,16,8],
 "classPxVehicleDrive.html#a2af80cf3e4d767f1710a8f2e244cb9d5":[2,11,16,4],
 "classPxVehicleDrive.html#a2d655bfda0d779fd6ede50fb25b2093e":[2,11,16,14],
 "classPxVehicleDrive.html#a36c21a889ee14e4221a1493a9ea5e9a3":[2,11,16,2],
@@ -249,5 +248,6 @@ var NAVTREEINDEX11 =
 "classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a7a82e798063cbad1d60dfd26b4aea512":[2,11,2,0,19],
 "classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a7ec5ab02dd6f186a8453486e1f1b007d":[2,11,2,0,9],
 "classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a80583db4ae487006afc324f9216023d1":[2,11,2,0,22],
-"classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a8e6929a2b7556b69d30cbbb068c4cfcc":[2,11,2,0,28]
+"classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a8e6929a2b7556b69d30cbbb068c4cfcc":[2,11,2,0,28],
+"classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a981cc327b2fa9709f58cea31945e610c":[2,11,2,0,5]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex12.js b/physx/documentation/PhysXAPI/files/navtreeindex12.js
index c72997915..bd2fac1e7 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex12.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex12.js
@@ -1,6 +1,5 @@
 var NAVTREEINDEX12 =
 {
-"classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a981cc327b2fa9709f58cea31945e610c":[2,11,2,0,5],
 "classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7aa1c63bf522cfa13f31b1432c97ea5cac":[2,11,2,0,12],
 "classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7aa3cb1739bffe703f71b3f7b70fae4187":[2,11,2,0,1],
 "classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7ab75cfff4be6edd9d869f51b1a1fccf0a":[2,11,2,0,23],
@@ -249,5 +248,6 @@ var NAVTREEINDEX12 =
 "classPxVehicleWheelsSimData.html#ac9ab652fed581f0351acb66a769935e2":[2,11,44,45],
 "classPxVehicleWheelsSimData.html#ac9bac9ed3b593410b1dbe391a90f4f6f":[2,11,44,40],
 "classPxVehicleWheelsSimData.html#acbbac4e2b9c31e2955957b5868bd6e8b":[2,11,44,36],
-"classPxVehicleWheelsSimData.html#acbe6956ae50dfadcd9264ed9830da207":[2,11,44,53]
+"classPxVehicleWheelsSimData.html#acbe6956ae50dfadcd9264ed9830da207":[2,11,44,53],
+"classPxVehicleWheelsSimData.html#acd51a38c9e5cf37b1c0b5feed7b03c04":[2,11,44,55]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex13.js b/physx/documentation/PhysXAPI/files/navtreeindex13.js
index a098f8e06..7c5a28e2c 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex13.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex13.js
@@ -1,6 +1,5 @@
 var NAVTREEINDEX13 =
 {
-"classPxVehicleWheelsSimData.html#acd51a38c9e5cf37b1c0b5feed7b03c04":[2,11,44,55],
 "classPxVehicleWheelsSimData.html#acefb8071acd7ce982e2eeee33ef758da":[2,11,44,7],
 "classPxVehicleWheelsSimData.html#acfc01818dd853e393bf3d9ea4370441c":[2,11,44,58],
 "classPxVehicleWheelsSimData.html#ad196c5b249c937b0ff7ccc433dee4dc4":[2,11,44,13],
@@ -249,5 +248,6 @@ var NAVTREEINDEX13 =
 "globals_eval.html":[4,1,5],
 "globals_f.html":[4,1,0,6],
 "globals_func.html":[4,1,1],
-"globals_func.html":[4,1,1,0]
+"globals_func.html":[4,1,1,0],
+"globals_func_b.html":[4,1,1,1]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex14.js b/physx/documentation/PhysXAPI/files/navtreeindex14.js
index 7420f3c7d..3558f112d 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex14.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex14.js
@@ -1,6 +1,5 @@
 var NAVTREEINDEX14 =
 {
-"globals_func_b.html":[4,1,1,1],
 "globals_func_c.html":[4,1,1,2],
 "globals_func_f.html":[4,1,1,3],
 "globals_func_g.html":[4,1,1,4],
@@ -38,16 +37,16 @@ var NAVTREEINDEX14 =
 "group__Serializers.html#gaf191e298bb6922cf8a727058a7980552":[4,0,127,2],
 "group__Serializers.html#gaf191e298bb6922cf8a727058a7980552":[2,6,3],
 "group__character.html":[2,0],
-"group__character.html#ga030da33b9be17c7cd67c6be5904d4860":[2,0,50],
 "group__character.html#ga030da33b9be17c7cd67c6be5904d4860":[2,0,3,2],
+"group__character.html#ga030da33b9be17c7cd67c6be5904d4860":[2,0,50],
 "group__character.html#ga05750d6d68cb164b2de6c80e93ca29d1":[4,0,66,4],
 "group__character.html#ga05750d6d68cb164b2de6c80e93ca29d1":[2,0,56],
 "group__character.html#ga08c251badc674d74a5f9abb07ae5ad8f":[2,0,38],
 "group__character.html#ga08c251badc674d74a5f9abb07ae5ad8f":[2,0,0,3],
 "group__character.html#ga10a3f936afc539f09d7a4f1c8ec53b8f":[4,0,45,0],
 "group__character.html#ga10a3f936afc539f09d7a4f1c8ec53b8f":[2,0,28],
-"group__character.html#ga2404e3aa455180820b7fa4900d75f18e":[2,0,36],
 "group__character.html#ga2404e3aa455180820b7fa4900d75f18e":[4,0,44,0],
+"group__character.html#ga2404e3aa455180820b7fa4900d75f18e":[2,0,36],
 "group__character.html#ga3144636d9207dbed885c9579898987a5":[2,0,52],
 "group__character.html#ga3144636d9207dbed885c9579898987a5":[2,0,17,2],
 "group__character.html#ga3cb7e2aaf5144c3ed9bbf504c2fd66db":[2,0,35],
@@ -58,12 +57,12 @@ var NAVTREEINDEX14 =
 "group__character.html#ga4a13c0d87b6c72ec7bf5e701157c358b":[2,0,53],
 "group__character.html#ga55edbbb581808808c8e160e9608d8790":[2,0,33],
 "group__character.html#ga55edbbb581808808c8e160e9608d8790":[4,0,45,1],
-"group__character.html#ga5d8e29dacee8a0a6543029435c0d5dc6":[2,0,47],
 "group__character.html#ga5d8e29dacee8a0a6543029435c0d5dc6":[2,0,0,0],
+"group__character.html#ga5d8e29dacee8a0a6543029435c0d5dc6":[2,0,47],
 "group__character.html#ga749a16c7bcdb8341d346abe43c023c12":[2,0,51],
 "group__character.html#ga749a16c7bcdb8341d346abe43c023c12":[2,0,17,0],
-"group__character.html#ga81a73ed06438569e730320f322376ae8":[2,0,3,5],
 "group__character.html#ga81a73ed06438569e730320f322376ae8":[2,0,45],
+"group__character.html#ga81a73ed06438569e730320f322376ae8":[2,0,3,5],
 "group__character.html#ga95e4b49e470d870c151c3d727a273676":[2,0,44],
 "group__character.html#ga95e4b49e470d870c151c3d727a273676":[2,0,0,5],
 "group__character.html#ga9a157f74e3a2d7c0756bd527ca69d387":[2,0,49],
@@ -101,18 +100,18 @@ var NAVTREEINDEX14 =
 "group__common.html":[2,1],
 "group__common.html#ga091cebe75e114b79165f382dffe878c8":[2,1,19,1],
 "group__common.html#ga091cebe75e114b79165f382dffe878c8":[2,1,43],
-"group__common.html#ga19403877bf7ce42d7240e4e4c758c016":[4,0,110,4],
 "group__common.html#ga19403877bf7ce42d7240e4e4c758c016":[2,1,40],
-"group__common.html#ga1e3721588799f5fbb18d76e4aab7d018":[2,1,36],
+"group__common.html#ga19403877bf7ce42d7240e4e4c758c016":[4,0,110,4],
 "group__common.html#ga1e3721588799f5fbb18d76e4aab7d018":[4,0,14,0],
-"group__common.html#ga27914dcd6d0448671eebfeca012cd0b5":[4,0,140,4],
+"group__common.html#ga1e3721588799f5fbb18d76e4aab7d018":[2,1,36],
 "group__common.html#ga27914dcd6d0448671eebfeca012cd0b5":[2,1,33],
-"group__common.html#ga2fda99afe264006a8ef8a699aa1328aa":[2,1,30],
+"group__common.html#ga27914dcd6d0448671eebfeca012cd0b5":[4,0,140,4],
 "group__common.html#ga2fda99afe264006a8ef8a699aa1328aa":[4,0,140,1],
+"group__common.html#ga2fda99afe264006a8ef8a699aa1328aa":[2,1,30],
 "group__common.html#ga4636d12a5a01930fa258136f3f93366f":[2,1,28],
 "group__common.html#ga4636d12a5a01930fa258136f3f93366f":[4,0,110,1],
-"group__common.html#ga70054743832670870a1ce9f619907c77":[4,0,140,3],
 "group__common.html#ga70054743832670870a1ce9f619907c77":[2,1,32],
+"group__common.html#ga70054743832670870a1ce9f619907c77":[4,0,140,3],
 "group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4":[2,1,27],
 "group__common.html#ga87ae1d60bdf83754e2fe5065aab40ec4":[4,0,110,0],
 "group__common.html#ga8ca0e9fc5693e347b3ee5735fc637fc2":[2,1,31],
@@ -127,35 +126,35 @@ var NAVTREEINDEX14 =
 "group__common.html#gaa3979f24ebe59e173e262c7155b1a489":[2,1,44],
 "group__common.html#gaa59252c69aa987bd926da9bbf5092367":[4,0,140,5],
 "group__common.html#gaa59252c69aa987bd926da9bbf5092367":[2,1,34],
-"group__common.html#gab2ccfb663643cd2d66b59908189d88cd":[2,1,39],
 "group__common.html#gab2ccfb663643cd2d66b59908189d88cd":[4,0,140,8],
+"group__common.html#gab2ccfb663643cd2d66b59908189d88cd":[2,1,39],
 "group__common.html#gabbba9e3f699e63445326263daec187e8":[2,1,26],
 "group__common.html#gabbba9e3f699e63445326263daec187e8":[4,0,140,0],
 "group__common.html#gabe7c1438b281afb1d13fdc127cc84e68":[2,1,29],
 "group__common.html#gabe7c1438b281afb1d13fdc127cc84e68":[4,0,110,2],
 "group__common.html#gac1fb4b256a5d900d394e89db170a2b79":[2,1,41],
-"group__common.html#gac1fb4b256a5d900d394e89db170a2b79":[4,0,14,1],
 "group__common.html#gac1fb4b256a5d900d394e89db170a2b79":[4,0,140,9],
+"group__common.html#gac1fb4b256a5d900d394e89db170a2b79":[4,0,14,1],
 "group__common.html#gac1fb4b256a5d900d394e89db170a2b79":[2,1,42],
 "group__common.html#gac816bc62a68a52f01bf21f963295e822":[2,1,38],
 "group__common.html#gac816bc62a68a52f01bf21f963295e822":[4,0,110,3],
 "group__common.html#gade7a77d4ae9d14e363364a9d72cf4932":[4,0,98,0],
 "group__common.html#gade7a77d4ae9d14e363364a9d72cf4932":[2,1,45],
-"group__common.html#gaeeccb7f7eed92f30cbbece3c45d2f5dd":[2,1,46],
 "group__common.html#gaeeccb7f7eed92f30cbbece3c45d2f5dd":[4,0,98,1],
+"group__common.html#gaeeccb7f7eed92f30cbbece3c45d2f5dd":[2,1,46],
 "group__common.html#gaf2bc9f0c0e1ee44a548900a13bb1136c":[2,1,19,0],
 "group__common.html#gaf2bc9f0c0e1ee44a548900a13bb1136c":[2,1,47],
 "group__cooking.html":[2,5],
-"group__cooking.html#ga0a0051ee56690c138fac87d780cca9b4":[4,0,50,0],
 "group__cooking.html#ga0a0051ee56690c138fac87d780cca9b4":[2,5,15],
+"group__cooking.html#ga0a0051ee56690c138fac87d780cca9b4":[4,0,50,0],
 "group__cooking.html#ga15d3050ac143dfb67ad43a8682c7b569":[2,5,13,0],
 "group__cooking.html#ga15d3050ac143dfb67ad43a8682c7b569":[2,5,22],
 "group__cooking.html#ga24e3eb40c6f7fbaa1b0435ee158e9f89":[2,5,3,0],
 "group__cooking.html#ga24e3eb40c6f7fbaa1b0435ee158e9f89":[2,5,19],
 "group__cooking.html#ga2b98217f34d8856ebe839821740ac438":[2,5,5,0],
 "group__cooking.html#ga2b98217f34d8856ebe839821740ac438":[2,5,20],
-"group__cooking.html#ga36d3375ae49e62c3842c3fbe0c620651":[4,0,47,0],
 "group__cooking.html#ga36d3375ae49e62c3842c3fbe0c620651":[2,5,14],
+"group__cooking.html#ga36d3375ae49e62c3842c3fbe0c620651":[4,0,47,0],
 "group__cooking.html#ga59f5c9eccaa06ab91f827e10e7a1333a":[2,5,13,1],
 "group__cooking.html#ga59f5c9eccaa06ab91f827e10e7a1333a":[2,5,17],
 "group__cooking.html#ga6155ac1dfe72113670b79dbf363b21d2":[2,5,5,1],
@@ -185,8 +184,8 @@ var NAVTREEINDEX14 =
 "group__extensions.html#ga13ef0a91d385e0f45c1b73b3ffafa8dc":[4,0,60,6],
 "group__extensions.html#ga17f2eab8a94d35778f2555a85c34ea7f":[2,3,93],
 "group__extensions.html#ga17f2eab8a94d35778f2555a85c34ea7f":[4,0,70,0],
-"group__extensions.html#ga29519a6806bb5661fe1d0e0c78389520":[2,3,69],
 "group__extensions.html#ga29519a6806bb5661fe1d0e0c78389520":[4,0,145,1],
+"group__extensions.html#ga29519a6806bb5661fe1d0e0c78389520":[2,3,69],
 "group__extensions.html#ga2ddebd682bdb0bdb1db7ba113a93afe4":[2,3,62],
 "group__extensions.html#ga2ddebd682bdb0bdb1db7ba113a93afe4":[4,0,139,4],
 "group__extensions.html#ga34084817afc2cfd9f1bb9108c5244b08":[2,3,103],
@@ -221,8 +220,8 @@ var NAVTREEINDEX14 =
 "group__extensions.html#ga587ba12f90f77543c3e4452abeb0f22f":[4,0,60,0],
 "group__extensions.html#ga5b375e0249df691cb6c9375fe3ce3eda":[2,3,80],
 "group__extensions.html#ga5b375e0249df691cb6c9375fe3ce3eda":[4,0,145,9],
-"group__extensions.html#ga5d942bf8c3069024e50a8dd2254c390c":[2,3,81],
 "group__extensions.html#ga5d942bf8c3069024e50a8dd2254c390c":[4,0,55,1],
+"group__extensions.html#ga5d942bf8c3069024e50a8dd2254c390c":[2,3,81],
 "group__extensions.html#ga63356d306e103b6bf13dc7bf2ae233f6":[2,3,77],
 "group__extensions.html#ga63356d306e103b6bf13dc7bf2ae233f6":[4,0,145,6],
 "group__extensions.html#ga64679ee1d097cd1699fd8d3413ac2932":[2,3,106],
@@ -249,5 +248,6 @@ var NAVTREEINDEX14 =
 "group__extensions.html#ga8ced9dfabbe8811f9c5ea8933e1298f7":[4,0,60,11],
 "group__extensions.html#ga8e55355887d57086f097b0248b7ea768":[2,3,55],
 "group__extensions.html#ga8e55355887d57086f097b0248b7ea768":[4,0,61,0],
-"group__extensions.html#gaab84d6896f2bf7d17487799f5135cd7c":[2,3,84]
+"group__extensions.html#gaab84d6896f2bf7d17487799f5135cd7c":[2,3,84],
+"group__extensions.html#gaab84d6896f2bf7d17487799f5135cd7c":[4,0,56,2]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex15.js b/physx/documentation/PhysXAPI/files/navtreeindex15.js
index be8a48095..32ffca1c9 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex15.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex15.js
@@ -1,14 +1,13 @@
 var NAVTREEINDEX15 =
 {
-"group__extensions.html#gaab84d6896f2bf7d17487799f5135cd7c":[4,0,56,2],
 "group__extensions.html#gaade73cffb9e3e77ae5457c3d393d481e":[2,3,87],
 "group__extensions.html#gaade73cffb9e3e77ae5457c3d393d481e":[4,0,56,5],
-"group__extensions.html#gab5878e8b784c6c17d26a20bc202c4c0e":[4,0,60,3],
 "group__extensions.html#gab5878e8b784c6c17d26a20bc202c4c0e":[2,3,96],
-"group__extensions.html#gabbffb883dd38c52aef27f7ea084a1cde":[2,3,78],
+"group__extensions.html#gab5878e8b784c6c17d26a20bc202c4c0e":[4,0,60,3],
 "group__extensions.html#gabbffb883dd38c52aef27f7ea084a1cde":[4,0,145,7],
-"group__extensions.html#gac1bd1279e83e31a8b0c8dd0deaa20b79":[4,0,56,4],
+"group__extensions.html#gabbffb883dd38c52aef27f7ea084a1cde":[2,3,78],
 "group__extensions.html#gac1bd1279e83e31a8b0c8dd0deaa20b79":[2,3,86],
+"group__extensions.html#gac1bd1279e83e31a8b0c8dd0deaa20b79":[4,0,56,4],
 "group__extensions.html#gaced14ebcd90bf2d866b38867cde959ae":[2,3,50],
 "group__extensions.html#gaced14ebcd90bf2d866b38867cde959ae":[4,0,141,1],
 "group__extensions.html#gacef7efd5d06512fbf61e9810f46805c1":[2,3,68],
@@ -17,8 +16,8 @@ var NAVTREEINDEX15 =
 "group__extensions.html#gad2b49dd51647b7228668827c689bbb32":[4,0,145,5],
 "group__extensions.html#gad874c3dcee0d1f3ec91e1880a1f09582":[2,3,65],
 "group__extensions.html#gad874c3dcee0d1f3ec91e1880a1f09582":[4,0,57,0],
-"group__extensions.html#gada9542f6f10912eb3f3f89c1ea2f87e6":[4,0,145,2],
 "group__extensions.html#gada9542f6f10912eb3f3f89c1ea2f87e6":[2,3,70],
+"group__extensions.html#gada9542f6f10912eb3f3f89c1ea2f87e6":[4,0,145,2],
 "group__extensions.html#gadabf6cf5b61c51816897119d793d16b7":[2,3,53],
 "group__extensions.html#gadabf6cf5b61c51816897119d793d16b7":[4,0,55,0],
 "group__extensions.html#gadf1d4e04d0c9587b111c3737c9789658":[2,3,72],
@@ -33,14 +32,14 @@ var NAVTREEINDEX15 =
 "group__extensions.html#gaea2576f54650c42280d79ce79ea1979a":[4,0,57,1],
 "group__extensions.html#gaef94272b3a37bbcce4008c1911a06b5c":[2,3,101],
 "group__extensions.html#gaef94272b3a37bbcce4008c1911a06b5c":[4,0,115,1],
-"group__extensions.html#gaf17024df6afd1cbab816914f25b33214":[2,3,104],
 "group__extensions.html#gaf17024df6afd1cbab816914f25b33214":[4,0,60,7],
+"group__extensions.html#gaf17024df6afd1cbab816914f25b33214":[2,3,104],
 "group__extensions.html#gaf61e11222319b3a664d8127c8b64bb98":[2,3,95],
 "group__extensions.html#gaf61e11222319b3a664d8127c8b64bb98":[4,0,60,2],
 "group__extensions.html#gaf683a32f0fc263a83be447fac2879dac":[2,3,56],
 "group__extensions.html#gaf683a32f0fc263a83be447fac2879dac":[4,0,115,0],
-"group__extensions.html#gaf7580a5a315137094cb676a7874c978a":[2,3,91],
 "group__extensions.html#gaf7580a5a315137094cb676a7874c978a":[4,0,63,1],
+"group__extensions.html#gaf7580a5a315137094cb676a7874c978a":[2,3,91],
 "group__extensions.html#gaf9d442ec851f833a9276ddb7ea46dd23":[2,3,59],
 "group__extensions.html#gaf9d442ec851f833a9276ddb7ea46dd23":[4,0,139,1],
 "group__extensions.html#gaf9d523f77e7143a4c7e49424d9f0081e":[2,3,51],
@@ -54,56 +53,56 @@ var NAVTREEINDEX15 =
 "group__foundation.html#ga030b6ea7a6da25c914f73cb33a45a191":[2,4,183],
 "group__foundation.html#ga0326e4ec6e9082ac70f32df5edc6eb5f":[4,0,3,1],
 "group__foundation.html#ga0326e4ec6e9082ac70f32df5edc6eb5f":[2,4,126],
-"group__foundation.html#ga0535d4e635ffe422695adcf66644e2ae":[2,4,7,17],
 "group__foundation.html#ga0535d4e635ffe422695adcf66644e2ae":[2,4,144],
-"group__foundation.html#ga060890e60cdbd06a2819e6d4a1f2d3be":[2,4,135],
+"group__foundation.html#ga0535d4e635ffe422695adcf66644e2ae":[2,4,7,17],
 "group__foundation.html#ga060890e60cdbd06a2819e6d4a1f2d3be":[2,4,7,8],
-"group__foundation.html#ga06a5a07559f3f1444b68770d472f29c9":[2,4,215],
+"group__foundation.html#ga060890e60cdbd06a2819e6d4a1f2d3be":[2,4,135],
 "group__foundation.html#ga06a5a07559f3f1444b68770d472f29c9":[2,4,7,28],
-"group__foundation.html#ga07e921227c6d6fe3c514eae0655c1dff":[2,4,7,20],
+"group__foundation.html#ga06a5a07559f3f1444b68770d472f29c9":[2,4,215],
 "group__foundation.html#ga07e921227c6d6fe3c514eae0655c1dff":[2,4,147],
+"group__foundation.html#ga07e921227c6d6fe3c514eae0655c1dff":[2,4,7,20],
 "group__foundation.html#ga09414aeb1a283b875edb1fa83e551aca":[2,4,33],
 "group__foundation.html#ga09414aeb1a283b875edb1fa83e551aca":[4,0,13,3],
 "group__foundation.html#ga0953977efc7599071aefadc35e842f97":[4,0,97,3],
 "group__foundation.html#ga0953977efc7599071aefadc35e842f97":[2,4,159],
-"group__foundation.html#ga0a511de410e7de7a8625fb572f88bd89":[2,4,107],
 "group__foundation.html#ga0a511de410e7de7a8625fb572f88bd89":[4,0,114,56],
+"group__foundation.html#ga0a511de410e7de7a8625fb572f88bd89":[2,4,107],
 "group__foundation.html#ga0d68edb1f62d21d1570dee43f2d4ebaa":[2,4,7,3],
 "group__foundation.html#ga0d68edb1f62d21d1570dee43f2d4ebaa":[2,4,130],
 "group__foundation.html#ga0e6a6baa9078ed0b45a03fb4404c9bf5":[4,0,147,5],
 "group__foundation.html#ga0e6a6baa9078ed0b45a03fb4404c9bf5":[2,4,65],
-"group__foundation.html#ga0f79a2074a05144416be06f4c45a324c":[4,0,147,2],
 "group__foundation.html#ga0f79a2074a05144416be06f4c45a324c":[2,4,51],
+"group__foundation.html#ga0f79a2074a05144416be06f4c45a324c":[4,0,147,2],
 "group__foundation.html#ga0fb72a858fd527761dbe042cb4adbbe6":[4,0,73,2],
 "group__foundation.html#ga0fb72a858fd527761dbe042cb4adbbe6":[2,4,178],
 "group__foundation.html#ga103f28ff1f24bc29c4167ca49b13c200":[4,0,97,24],
 "group__foundation.html#ga103f28ff1f24bc29c4167ca49b13c200":[2,4,191],
-"group__foundation.html#ga1145ffab2cc9a77ca65b59ba4126342f":[2,4,100],
 "group__foundation.html#ga1145ffab2cc9a77ca65b59ba4126342f":[4,0,114,49],
+"group__foundation.html#ga1145ffab2cc9a77ca65b59ba4126342f":[2,4,100],
 "group__foundation.html#ga11d5e4e77acad728886b380b5f3806f0":[4,0,97,31],
 "group__foundation.html#ga11d5e4e77acad728886b380b5f3806f0":[2,4,204],
-"group__foundation.html#ga125e6c8496174f5aee6b53c2ecd9ba75":[2,4,7,26],
 "group__foundation.html#ga125e6c8496174f5aee6b53c2ecd9ba75":[2,4,213],
+"group__foundation.html#ga125e6c8496174f5aee6b53c2ecd9ba75":[2,4,7,26],
 "group__foundation.html#ga14bf03e830d32c8da6a9a21ad60d0445":[4,0,73,3],
 "group__foundation.html#ga14bf03e830d32c8da6a9a21ad60d0445":[2,4,200],
 "group__foundation.html#ga155d9a6a01e4c17d2ecb801e5c17c17f":[2,4,17,6],
 "group__foundation.html#ga155d9a6a01e4c17d2ecb801e5c17c17f":[2,4,208],
-"group__foundation.html#ga16358e92b0083fddc587efc45050f425":[2,4,138],
 "group__foundation.html#ga16358e92b0083fddc587efc45050f425":[2,4,7,11],
+"group__foundation.html#ga16358e92b0083fddc587efc45050f425":[2,4,138],
 "group__foundation.html#ga17a46314a8577460d3840466cb12a331":[4,0,97,35],
 "group__foundation.html#ga17a46314a8577460d3840466cb12a331":[2,4,218],
-"group__foundation.html#ga19ecbe58dd6039373c15c8452550bd07":[2,4,190],
 "group__foundation.html#ga19ecbe58dd6039373c15c8452550bd07":[4,0,97,23],
-"group__foundation.html#ga1afd8c76d275a787af09ebfceaf7fc30":[2,4,69],
+"group__foundation.html#ga19ecbe58dd6039373c15c8452550bd07":[2,4,190],
 "group__foundation.html#ga1afd8c76d275a787af09ebfceaf7fc30":[4,0,147,9],
-"group__foundation.html#ga2184ad1a4338eacc3e80cfaa1d2024e5":[2,4,176],
+"group__foundation.html#ga1afd8c76d275a787af09ebfceaf7fc30":[2,4,69],
 "group__foundation.html#ga2184ad1a4338eacc3e80cfaa1d2024e5":[4,0,13,4],
-"group__foundation.html#ga226013a71f304c8c670ef781aa62b145":[4,0,147,6],
+"group__foundation.html#ga2184ad1a4338eacc3e80cfaa1d2024e5":[2,4,176],
 "group__foundation.html#ga226013a71f304c8c670ef781aa62b145":[2,4,66],
+"group__foundation.html#ga226013a71f304c8c670ef781aa62b145":[4,0,147,6],
 "group__foundation.html#ga23fec839e8daf10494d43640dca49e09":[4,0,97,32],
 "group__foundation.html#ga23fec839e8daf10494d43640dca49e09":[2,4,205],
-"group__foundation.html#ga2440789bba543f4ab246a7dea35d7d95":[4,0,147,3],
 "group__foundation.html#ga2440789bba543f4ab246a7dea35d7d95":[2,4,52],
+"group__foundation.html#ga2440789bba543f4ab246a7dea35d7d95":[4,0,147,3],
 "group__foundation.html#ga250e461f84db027c14d3f1e33b8adc9c":[4,0,97,28],
 "group__foundation.html#ga250e461f84db027c14d3f1e33b8adc9c":[2,4,201],
 "group__foundation.html#ga2520ecfd21a5e8e15d5ae3d1f9bcfd1c":[4,0,18,1],
@@ -114,56 +113,56 @@ var NAVTREEINDEX15 =
 "group__foundation.html#ga289f38577d111c621dfc38704bf887ac":[2,4,154],
 "group__foundation.html#ga2ac82f7e080e1411a86f98758a0f3875":[4,0,114,41],
 "group__foundation.html#ga2ac82f7e080e1411a86f98758a0f3875":[2,4,87],
-"group__foundation.html#ga2af6846d9d49e433af97059f0291eef1":[2,4,82],
 "group__foundation.html#ga2af6846d9d49e433af97059f0291eef1":[4,0,114,37],
+"group__foundation.html#ga2af6846d9d49e433af97059f0291eef1":[2,4,82],
 "group__foundation.html#ga2b3efd786b831b6cfd30e51dcd3c5ca2":[4,0,114,43],
 "group__foundation.html#ga2b3efd786b831b6cfd30e51dcd3c5ca2":[2,4,89],
-"group__foundation.html#ga2d783f32606b1458efd0fe0ee50c2b03":[2,4,7,4],
 "group__foundation.html#ga2d783f32606b1458efd0fe0ee50c2b03":[2,4,131],
+"group__foundation.html#ga2d783f32606b1458efd0fe0ee50c2b03":[2,4,7,4],
 "group__foundation.html#ga2dd18616fa4eec4df42b5f3505dd2a64":[4,0,114,51],
 "group__foundation.html#ga2dd18616fa4eec4df42b5f3505dd2a64":[2,4,102],
 "group__foundation.html#ga2e96f326a9b29ecb2cbe00f2f7662cf0":[4,0,97,27],
 "group__foundation.html#ga2e96f326a9b29ecb2cbe00f2f7662cf0":[2,4,195],
 "group__foundation.html#ga2f06b47e22c6e93701d766444c67f3a6":[4,0,73,1],
 "group__foundation.html#ga2f06b47e22c6e93701d766444c67f3a6":[2,4,177],
-"group__foundation.html#ga305a9ab26a10d521e58782103206edf5":[4,0,114,42],
 "group__foundation.html#ga305a9ab26a10d521e58782103206edf5":[2,4,88],
-"group__foundation.html#ga315fba4871d4de90d9e36b992b43aaf6":[4,0,13,5],
+"group__foundation.html#ga305a9ab26a10d521e58782103206edf5":[4,0,114,42],
 "group__foundation.html#ga315fba4871d4de90d9e36b992b43aaf6":[2,4,196],
-"group__foundation.html#ga317798739f43b557779c6d40806038b6":[2,4,72],
+"group__foundation.html#ga315fba4871d4de90d9e36b992b43aaf6":[4,0,13,5],
 "group__foundation.html#ga317798739f43b557779c6d40806038b6":[4,0,147,12],
+"group__foundation.html#ga317798739f43b557779c6d40806038b6":[2,4,72],
 "group__foundation.html#ga32315930deaa61e428aa2e0cda454189":[2,4,141],
 "group__foundation.html#ga32315930deaa61e428aa2e0cda454189":[2,4,7,14],
-"group__foundation.html#ga34f70075ac4a52e2d7fb8afc03afc6e2":[2,4,211],
 "group__foundation.html#ga34f70075ac4a52e2d7fb8afc03afc6e2":[2,4,7,24],
-"group__foundation.html#ga3773ce018f51c7f8f2c061faab2d411b":[2,4,224],
+"group__foundation.html#ga34f70075ac4a52e2d7fb8afc03afc6e2":[2,4,211],
 "group__foundation.html#ga3773ce018f51c7f8f2c061faab2d411b":[4,0,97,41],
-"group__foundation.html#ga382d2b60e6161c1dcb3801bbf8e1b4a3":[2,4,37],
+"group__foundation.html#ga3773ce018f51c7f8f2c061faab2d411b":[2,4,224],
 "group__foundation.html#ga382d2b60e6161c1dcb3801bbf8e1b4a3":[4,0,114,11],
+"group__foundation.html#ga382d2b60e6161c1dcb3801bbf8e1b4a3":[2,4,37],
 "group__foundation.html#ga39f6354f5e10029a058e2141956ed9ef":[4,0,114,54],
 "group__foundation.html#ga39f6354f5e10029a058e2141956ed9ef":[2,4,105],
-"group__foundation.html#ga3ad36c60750fd7773dd217aa681ee0d1":[2,4,207],
 "group__foundation.html#ga3ad36c60750fd7773dd217aa681ee0d1":[4,0,97,34],
+"group__foundation.html#ga3ad36c60750fd7773dd217aa681ee0d1":[2,4,207],
 "group__foundation.html#ga3b15bb2f7697b2bed10eb5477909fbe4":[2,4,101],
 "group__foundation.html#ga3b15bb2f7697b2bed10eb5477909fbe4":[4,0,114,50],
 "group__foundation.html#ga3b7937d1226ad04a370386837b538fe5":[4,0,99,2],
 "group__foundation.html#ga3b7937d1226ad04a370386837b538fe5":[2,4,188],
 "group__foundation.html#ga3b820cf45fee058a4083d955daa02154":[4,0,97,9],
 "group__foundation.html#ga3b820cf45fee058a4083d955daa02154":[2,4,165],
-"group__foundation.html#ga3c2a546bd10918cd8cf26d27200081f1":[2,4,7,2],
 "group__foundation.html#ga3c2a546bd10918cd8cf26d27200081f1":[2,4,129],
-"group__foundation.html#ga3fc2611ba27e5701f2a1cf14afd1dd7b":[4,0,147,13],
+"group__foundation.html#ga3c2a546bd10918cd8cf26d27200081f1":[2,4,7,2],
 "group__foundation.html#ga3fc2611ba27e5701f2a1cf14afd1dd7b":[2,4,74],
-"group__foundation.html#ga40e7fc272047a5ab6870668b81aca1a3":[4,0,114,52],
+"group__foundation.html#ga3fc2611ba27e5701f2a1cf14afd1dd7b":[4,0,147,13],
 "group__foundation.html#ga40e7fc272047a5ab6870668b81aca1a3":[2,4,103],
+"group__foundation.html#ga40e7fc272047a5ab6870668b81aca1a3":[4,0,114,52],
 "group__foundation.html#ga41ea3463989964c5a4e21d68a9d3884a":[2,4,80],
 "group__foundation.html#ga41ea3463989964c5a4e21d68a9d3884a":[4,0,114,35],
-"group__foundation.html#ga41f92d88ba8f7b8ef321aa090b0e270d":[4,0,114,63],
 "group__foundation.html#ga41f92d88ba8f7b8ef321aa090b0e270d":[2,4,114],
-"group__foundation.html#ga42b3e630801b256ea6a118cea57757d3":[4,0,114,48],
+"group__foundation.html#ga41f92d88ba8f7b8ef321aa090b0e270d":[4,0,114,63],
 "group__foundation.html#ga42b3e630801b256ea6a118cea57757d3":[2,4,99],
-"group__foundation.html#ga4330ceada999f3f6d10f7bd4b3dd3c2f":[4,0,114,5],
+"group__foundation.html#ga42b3e630801b256ea6a118cea57757d3":[4,0,114,48],
 "group__foundation.html#ga4330ceada999f3f6d10f7bd4b3dd3c2f":[2,4,29],
+"group__foundation.html#ga4330ceada999f3f6d10f7bd4b3dd3c2f":[4,0,114,5],
 "group__foundation.html#ga437cafb349b7bc7d1d912ab27392bfe9":[4,0,97,5],
 "group__foundation.html#ga437cafb349b7bc7d1d912ab27392bfe9":[2,4,161],
 "group__foundation.html#ga43c56deefc64fae5fe077a232a10056c":[2,4,49],
@@ -172,26 +171,26 @@ var NAVTREEINDEX15 =
 "group__foundation.html#ga43eec71a9b57bd336f2208cb33dba42e":[2,4,113],
 "group__foundation.html#ga44a06101ddffac9a2f95183b14d64f99":[2,4,148],
 "group__foundation.html#ga44a06101ddffac9a2f95183b14d64f99":[2,4,7,21],
-"group__foundation.html#ga481340bf67229603a2c5b9257874934a":[4,0,114,29],
 "group__foundation.html#ga481340bf67229603a2c5b9257874934a":[2,4,58],
-"group__foundation.html#ga4a83a657390b324a460f14684e5accee":[4,0,13,2],
+"group__foundation.html#ga481340bf67229603a2c5b9257874934a":[4,0,114,29],
 "group__foundation.html#ga4a83a657390b324a460f14684e5accee":[2,4,32],
+"group__foundation.html#ga4a83a657390b324a460f14684e5accee":[4,0,13,2],
 "group__foundation.html#ga4c63f0632cd5766a4233686020a33159":[4,0,147,17],
 "group__foundation.html#ga4c63f0632cd5766a4233686020a33159":[2,4,78],
-"group__foundation.html#ga4dadf48c1125f80e543dbe94d656d372":[2,4,146],
 "group__foundation.html#ga4dadf48c1125f80e543dbe94d656d372":[2,4,7,19],
-"group__foundation.html#ga4edfc95b25e96e7d308d83347c93a69f":[4,0,109,1],
+"group__foundation.html#ga4dadf48c1125f80e543dbe94d656d372":[2,4,146],
 "group__foundation.html#ga4edfc95b25e96e7d308d83347c93a69f":[2,4,92],
+"group__foundation.html#ga4edfc95b25e96e7d308d83347c93a69f":[4,0,109,1],
 "group__foundation.html#ga4f7abce9f24240386fb46d29e4332d3d":[4,0,114,69],
 "group__foundation.html#ga4f7abce9f24240386fb46d29e4332d3d":[2,4,120],
-"group__foundation.html#ga50d4d7247809d26270bd731bdc394793":[2,4,157],
 "group__foundation.html#ga50d4d7247809d26270bd731bdc394793":[4,0,97,1],
-"group__foundation.html#ga5121e2d88c11a94738ee44b0b6b339a2":[2,4,118],
+"group__foundation.html#ga50d4d7247809d26270bd731bdc394793":[2,4,157],
 "group__foundation.html#ga5121e2d88c11a94738ee44b0b6b339a2":[4,0,114,67],
+"group__foundation.html#ga5121e2d88c11a94738ee44b0b6b339a2":[2,4,118],
 "group__foundation.html#ga5332a7e82c543612744a0c821cf48b78":[2,4,26],
 "group__foundation.html#ga5332a7e82c543612744a0c821cf48b78":[4,0,13,0],
-"group__foundation.html#ga559a6fbf1f502edf3e25e3b90647f111":[4,0,114,68],
 "group__foundation.html#ga559a6fbf1f502edf3e25e3b90647f111":[2,4,119],
+"group__foundation.html#ga559a6fbf1f502edf3e25e3b90647f111":[4,0,114,68],
 "group__foundation.html#ga56576468182972ec8511c7dd20f47a4d":[4,0,97,30],
 "group__foundation.html#ga56576468182972ec8511c7dd20f47a4d":[2,4,203],
 "group__foundation.html#ga56b7f4f37ac451a47f3d7d661069aa49":[4,0,114,38],
@@ -200,34 +199,34 @@ var NAVTREEINDEX15 =
 "group__foundation.html#ga571ac4b7ead96ee2a8ad83d1ebe1aec9":[4,0,114,14],
 "group__foundation.html#ga5875f14ce7524d2a21d3bb23a039d1ec":[2,4,70],
 "group__foundation.html#ga5875f14ce7524d2a21d3bb23a039d1ec":[4,0,147,10],
-"group__foundation.html#ga58dd174408338f37c62dfe6ac1baec60":[4,0,114,55],
 "group__foundation.html#ga58dd174408338f37c62dfe6ac1baec60":[2,4,106],
+"group__foundation.html#ga58dd174408338f37c62dfe6ac1baec60":[4,0,114,55],
 "group__foundation.html#ga5bd19e27244d543c2e0a4d9c04338406":[4,0,147,16],
 "group__foundation.html#ga5bd19e27244d543c2e0a4d9c04338406":[2,4,77],
-"group__foundation.html#ga5bd657340cb0dc207107cb1742d1135b":[2,4,96],
 "group__foundation.html#ga5bd657340cb0dc207107cb1742d1135b":[4,0,114,46],
+"group__foundation.html#ga5bd657340cb0dc207107cb1742d1135b":[2,4,96],
 "group__foundation.html#ga5be743320fae5abf91e51efebced816e":[2,4,45],
 "group__foundation.html#ga5be743320fae5abf91e51efebced816e":[4,0,114,19],
-"group__foundation.html#ga5bf7ba25f65def2a2cdf3e17eac11129":[2,4,197],
 "group__foundation.html#ga5bf7ba25f65def2a2cdf3e17eac11129":[4,0,189,0],
+"group__foundation.html#ga5bf7ba25f65def2a2cdf3e17eac11129":[2,4,197],
 "group__foundation.html#ga5f35262177498e8a11a4f3e5ba83e5e8":[4,0,114,13],
 "group__foundation.html#ga5f35262177498e8a11a4f3e5ba83e5e8":[2,4,39],
-"group__foundation.html#ga5ff30cdf70b6b77fc0a17427ad5ef7de":[4,0,13,1],
 "group__foundation.html#ga5ff30cdf70b6b77fc0a17427ad5ef7de":[2,4,27],
-"group__foundation.html#ga65237bda8f886bb466e89d01a1ba0a77":[4,0,114,39],
+"group__foundation.html#ga5ff30cdf70b6b77fc0a17427ad5ef7de":[4,0,13,1],
 "group__foundation.html#ga65237bda8f886bb466e89d01a1ba0a77":[2,4,85],
-"group__foundation.html#ga65a319adb859941e5bd98575280b8d19":[2,4,35],
+"group__foundation.html#ga65237bda8f886bb466e89d01a1ba0a77":[4,0,114,39],
 "group__foundation.html#ga65a319adb859941e5bd98575280b8d19":[4,0,114,9],
+"group__foundation.html#ga65a319adb859941e5bd98575280b8d19":[2,4,35],
 "group__foundation.html#ga6952091abd80c0c93a99d331b6c97482":[2,4,43],
 "group__foundation.html#ga6952091abd80c0c93a99d331b6c97482":[4,0,114,17],
-"group__foundation.html#ga69c3fa9b2882d545ebdb965086853c62":[4,0,114,57],
 "group__foundation.html#ga69c3fa9b2882d545ebdb965086853c62":[2,4,108],
+"group__foundation.html#ga69c3fa9b2882d545ebdb965086853c62":[4,0,114,57],
 "group__foundation.html#ga6a774eed3cad34b0f636332a3d28c6bb":[2,4,53],
 "group__foundation.html#ga6a774eed3cad34b0f636332a3d28c6bb":[4,0,114,24],
 "group__foundation.html#ga6c2e80e9cab7b72710ac62e537ad60df":[4,0,99,1],
 "group__foundation.html#ga6c2e80e9cab7b72710ac62e537ad60df":[2,4,187],
-"group__foundation.html#ga6e444d58dbf1459261fadc9b79594f63":[2,4,41],
 "group__foundation.html#ga6e444d58dbf1459261fadc9b79594f63":[4,0,114,15],
+"group__foundation.html#ga6e444d58dbf1459261fadc9b79594f63":[2,4,41],
 "group__foundation.html#ga70160ca5bbafef277790b3e0f12baf38":[4,0,3,2],
 "group__foundation.html#ga70160ca5bbafef277790b3e0f12baf38":[2,4,127],
 "group__foundation.html#ga7141493e2ce93a7c29947147874eb372":[2,4,153],
@@ -236,18 +235,19 @@ var NAVTREEINDEX15 =
 "group__foundation.html#ga7160eeaa47f99f991d9b1f113b398143":[4,0,114,3],
 "group__foundation.html#ga732674527c19934c396385ac405993b6":[4,0,114,26],
 "group__foundation.html#ga732674527c19934c396385ac405993b6":[2,4,55],
-"group__foundation.html#ga73b08402e151455258acc7ec628a277b":[4,0,114,32],
 "group__foundation.html#ga73b08402e151455258acc7ec628a277b":[2,4,61],
+"group__foundation.html#ga73b08402e151455258acc7ec628a277b":[4,0,114,32],
 "group__foundation.html#ga75378be4130e1bdda740c22c07314640":[4,0,109,2],
 "group__foundation.html#ga75378be4130e1bdda740c22c07314640":[2,4,93],
 "group__foundation.html#ga7539e08056ea09ff1c341b866b7103f4":[4,0,147,4],
 "group__foundation.html#ga7539e08056ea09ff1c341b866b7103f4":[2,4,64],
 "group__foundation.html#ga79b5721890bfcd38f4890f55ea0e5eac":[2,4,217],
 "group__foundation.html#ga79b5721890bfcd38f4890f55ea0e5eac":[2,4,7,30],
-"group__foundation.html#ga7b6fbb539e9185279ced84864179180b":[4,0,114,16],
 "group__foundation.html#ga7b6fbb539e9185279ced84864179180b":[2,4,42],
+"group__foundation.html#ga7b6fbb539e9185279ced84864179180b":[4,0,114,16],
 "group__foundation.html#ga7b8deaee36a7d45581d4706ae6c9905b":[4,0,97,33],
 "group__foundation.html#ga7b8deaee36a7d45581d4706ae6c9905b":[2,4,206],
 "group__foundation.html#ga7bd33cbf0ea64aee9a79758f70050cdf":[2,4,79],
-"group__foundation.html#ga7bd33cbf0ea64aee9a79758f70050cdf":[4,0,147,18]
+"group__foundation.html#ga7bd33cbf0ea64aee9a79758f70050cdf":[4,0,147,18],
+"group__foundation.html#ga7c72d4a195ff9acade3d092124a45c53":[2,4,216]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex16.js b/physx/documentation/PhysXAPI/files/navtreeindex16.js
index 064d10080..812bbb160 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex16.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex16.js
@@ -1,7 +1,6 @@
 var NAVTREEINDEX16 =
 {
 "group__foundation.html#ga7c72d4a195ff9acade3d092124a45c53":[2,4,7,29],
-"group__foundation.html#ga7c72d4a195ff9acade3d092124a45c53":[2,4,216],
 "group__foundation.html#ga7e2f610707f2d4fd2a7f65c1af4fe022":[2,4,142],
 "group__foundation.html#ga7e2f610707f2d4fd2a7f65c1af4fe022":[2,4,7,15],
 "group__foundation.html#ga7ffc101412f847b7a5d40d9e0c7a0028":[2,4,7,6],
@@ -16,48 +15,48 @@ var NAVTREEINDEX16 =
 "group__foundation.html#ga8853e1b0f46c6dd7043b948747a19a19":[4,0,114,0],
 "group__foundation.html#ga8901e378c5ba8f305e8714d5758cd9dd":[2,4,84],
 "group__foundation.html#ga8901e378c5ba8f305e8714d5758cd9dd":[4,0,147,19],
-"group__foundation.html#ga8c8d56ad8584332dc0d55c7936fb0af3":[4,0,18,0],
 "group__foundation.html#ga8c8d56ad8584332dc0d55c7936fb0af3":[2,4,122],
+"group__foundation.html#ga8c8d56ad8584332dc0d55c7936fb0af3":[4,0,18,0],
 "group__foundation.html#ga8d26d765b23de0bca18dd327cde2d8f8":[2,4,117],
 "group__foundation.html#ga8d26d765b23de0bca18dd327cde2d8f8":[4,0,114,66],
 "group__foundation.html#ga903d59eeddf0328e54f1caf8efddf2fc":[4,0,97,14],
 "group__foundation.html#ga903d59eeddf0328e54f1caf8efddf2fc":[2,4,171],
-"group__foundation.html#ga927c1801f7f1134c1344caaff90bd632":[4,0,114,60],
 "group__foundation.html#ga927c1801f7f1134c1344caaff90bd632":[2,4,111],
-"group__foundation.html#ga939562c42c614caac1aaba467f77d510":[2,4,186],
+"group__foundation.html#ga927c1801f7f1134c1344caaff90bd632":[4,0,114,60],
 "group__foundation.html#ga939562c42c614caac1aaba467f77d510":[4,0,99,0],
+"group__foundation.html#ga939562c42c614caac1aaba467f77d510":[2,4,186],
 "group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a":[2,4,23],
 "group__foundation.html#ga95024dd88a7efd73c060616238ccbe8a":[4,0,114,1],
-"group__foundation.html#ga9564f55a7dca0986938b8a03e78d6f9f":[2,4,169],
 "group__foundation.html#ga9564f55a7dca0986938b8a03e78d6f9f":[4,0,97,12],
-"group__foundation.html#ga95a5fd55fa4db02a70d5f0226098771a":[2,4,139],
+"group__foundation.html#ga9564f55a7dca0986938b8a03e78d6f9f":[2,4,169],
 "group__foundation.html#ga95a5fd55fa4db02a70d5f0226098771a":[2,4,7,12],
-"group__foundation.html#ga966bf639353d2b617e4925513859c1ff":[4,0,114,70],
+"group__foundation.html#ga95a5fd55fa4db02a70d5f0226098771a":[2,4,139],
 "group__foundation.html#ga966bf639353d2b617e4925513859c1ff":[2,4,121],
+"group__foundation.html#ga966bf639353d2b617e4925513859c1ff":[4,0,114,70],
 "group__foundation.html#ga9727af514012bc32f852e20278a7b04e":[4,0,114,72],
 "group__foundation.html#ga9727af514012bc32f852e20278a7b04e":[2,4,155],
 "group__foundation.html#ga98f571cc01fc3aa0e73e215c27ab8069":[4,0,97,29],
 "group__foundation.html#ga98f571cc01fc3aa0e73e215c27ab8069":[2,4,202],
-"group__foundation.html#ga99db209a98702e2d6673d967bac37d07":[4,0,114,7],
 "group__foundation.html#ga99db209a98702e2d6673d967bac37d07":[2,4,31],
+"group__foundation.html#ga99db209a98702e2d6673d967bac37d07":[4,0,114,7],
 "group__foundation.html#ga9aad3b08e5e15565ff39638b753f4a92":[2,4,95],
 "group__foundation.html#ga9aad3b08e5e15565ff39638b753f4a92":[4,0,114,45],
 "group__foundation.html#ga9c45606642752cb98a361b359482c476":[4,0,97,11],
 "group__foundation.html#ga9c45606642752cb98a361b359482c476":[2,4,168],
 "group__foundation.html#ga9cf210fa59d59f641438771f3c24102f":[4,0,97,8],
 "group__foundation.html#ga9cf210fa59d59f641438771f3c24102f":[2,4,164],
-"group__foundation.html#ga9d5b26e75bb13bebe3d08bc49b2212d5":[2,4,136],
 "group__foundation.html#ga9d5b26e75bb13bebe3d08bc49b2212d5":[2,4,7,9],
+"group__foundation.html#ga9d5b26e75bb13bebe3d08bc49b2212d5":[2,4,136],
 "group__foundation.html#ga9deda8eaad9681fc9eb456adc5225668":[4,0,73,0],
 "group__foundation.html#ga9deda8eaad9681fc9eb456adc5225668":[2,4,172],
-"group__foundation.html#ga9e31f59de4ce4f4ad1f1e51c401ac5fd":[2,4,7,25],
 "group__foundation.html#ga9e31f59de4ce4f4ad1f1e51c401ac5fd":[2,4,212],
+"group__foundation.html#ga9e31f59de4ce4f4ad1f1e51c401ac5fd":[2,4,7,25],
 "group__foundation.html#gaa014664f566202f7be1b2caa5625ffd6":[4,0,97,21],
 "group__foundation.html#gaa014664f566202f7be1b2caa5625ffd6":[2,4,184],
-"group__foundation.html#gaa36637513874db07a2b4839703eef2ee":[4,0,18,2],
 "group__foundation.html#gaa36637513874db07a2b4839703eef2ee":[2,4,124],
-"group__foundation.html#gaa6237ff50213fc9872d4569a649dcd53":[2,4,62],
+"group__foundation.html#gaa36637513874db07a2b4839703eef2ee":[4,0,18,2],
 "group__foundation.html#gaa6237ff50213fc9872d4569a649dcd53":[4,0,114,33],
+"group__foundation.html#gaa6237ff50213fc9872d4569a649dcd53":[2,4,62],
 "group__foundation.html#gaa645944027e6f4e405de626910e82d8d":[4,0,114,2],
 "group__foundation.html#gaa645944027e6f4e405de626910e82d8d":[2,4,24],
 "group__foundation.html#gaa689e47bbd54cfc6a4a1ce5bc7f1a2f6":[4,0,97,19],
@@ -66,16 +65,16 @@ var NAVTREEINDEX16 =
 "group__foundation.html#gaa68c4b18475815da696850a588b3ac8b":[4,0,114,34],
 "group__foundation.html#gaa7415102b45174d3eda4d4f2f12127bc":[2,4,149],
 "group__foundation.html#gaa7415102b45174d3eda4d4f2f12127bc":[4,0,170,0],
-"group__foundation.html#gaaa541ee81b5c7958c1367b710cfa99ed":[4,0,114,53],
 "group__foundation.html#gaaa541ee81b5c7958c1367b710cfa99ed":[2,4,104],
+"group__foundation.html#gaaa541ee81b5c7958c1367b710cfa99ed":[4,0,114,53],
 "group__foundation.html#gaacbb9e1f38be71e22df1584a37c56693":[2,4,21],
 "group__foundation.html#gaacbb9e1f38be71e22df1584a37c56693":[4,0,147,0],
-"group__foundation.html#gaace09d9150d8d87671f893afe548ebac":[4,0,114,40],
 "group__foundation.html#gaace09d9150d8d87671f893afe548ebac":[2,4,86],
+"group__foundation.html#gaace09d9150d8d87671f893afe548ebac":[4,0,114,40],
 "group__foundation.html#gaadbc7232bb8117014d359b6e63967170":[2,4,223],
 "group__foundation.html#gaadbc7232bb8117014d359b6e63967170":[4,0,97,40],
-"group__foundation.html#gaaf594b4102e909f7eef735d845735cb7":[4,0,189,1],
 "group__foundation.html#gaaf594b4102e909f7eef735d845735cb7":[2,4,198],
+"group__foundation.html#gaaf594b4102e909f7eef735d845735cb7":[4,0,189,1],
 "group__foundation.html#gab12cc8cbd08c9c19ab0d40b7f789012e":[2,4,38],
 "group__foundation.html#gab12cc8cbd08c9c19ab0d40b7f789012e":[4,0,114,12],
 "group__foundation.html#gab14d7af80f3dc4ff432087381fcefdb9":[2,4,48],
@@ -86,8 +85,8 @@ var NAVTREEINDEX16 =
 "group__foundation.html#gab6253e62a4ebc9be4560c54756aea29a":[2,4,162],
 "group__foundation.html#gab6fc70b79caa835455b6141018a7f9be":[4,0,154,0],
 "group__foundation.html#gab6fc70b79caa835455b6141018a7f9be":[2,4,152],
-"group__foundation.html#gab8347121772157d35897ab50e7aa0674":[2,4,140],
 "group__foundation.html#gab8347121772157d35897ab50e7aa0674":[2,4,7,13],
+"group__foundation.html#gab8347121772157d35897ab50e7aa0674":[2,4,140],
 "group__foundation.html#gab842ec4658d565159e71b949786ddf2e":[2,4,7,5],
 "group__foundation.html#gab842ec4658d565159e71b949786ddf2e":[2,4,132],
 "group__foundation.html#gaba043bbb6945b5a380bd3808badacdfb":[2,4,145],
@@ -96,34 +95,34 @@ var NAVTREEINDEX16 =
 "group__foundation.html#gabb5e8a2eee08bb16174c87dedbe49498":[2,4,173],
 "group__foundation.html#gabcd49a429b8119de54375b3538ae8a8d":[4,0,97,38],
 "group__foundation.html#gabcd49a429b8119de54375b3538ae8a8d":[2,4,221],
-"group__foundation.html#gabe2aa1202597c18f98b9e7cefdf807a8":[2,4,28],
 "group__foundation.html#gabe2aa1202597c18f98b9e7cefdf807a8":[4,0,114,4],
-"group__foundation.html#gabefdbdd6927f15e90168c46824f7bd38":[2,4,98],
+"group__foundation.html#gabe2aa1202597c18f98b9e7cefdf807a8":[2,4,28],
 "group__foundation.html#gabefdbdd6927f15e90168c46824f7bd38":[4,0,114,47],
-"group__foundation.html#gac10bd69c94c165a52f6de39678ef9d4c":[2,4,128],
+"group__foundation.html#gabefdbdd6927f15e90168c46824f7bd38":[2,4,98],
 "group__foundation.html#gac10bd69c94c165a52f6de39678ef9d4c":[4,0,3,3],
+"group__foundation.html#gac10bd69c94c165a52f6de39678ef9d4c":[2,4,128],
 "group__foundation.html#gac1b800617c25ba00871816b8a0756d89":[4,0,19,0],
 "group__foundation.html#gac1b800617c25ba00871816b8a0756d89":[2,4,63],
-"group__foundation.html#gac1ba7551ef78900c6a103ad63fc712ae":[2,4,199],
 "group__foundation.html#gac1ba7551ef78900c6a103ad63fc712ae":[4,0,189,2],
-"group__foundation.html#gac2271cd3b97db6a3e1ba0222d1b4d07e":[4,0,147,20],
+"group__foundation.html#gac1ba7551ef78900c6a103ad63fc712ae":[2,4,199],
 "group__foundation.html#gac2271cd3b97db6a3e1ba0222d1b4d07e":[2,4,97],
+"group__foundation.html#gac2271cd3b97db6a3e1ba0222d1b4d07e":[4,0,147,20],
 "group__foundation.html#gac40d28c5e85426df7a98ff9bd597810c":[4,0,114,58],
 "group__foundation.html#gac40d28c5e85426df7a98ff9bd597810c":[2,4,109],
 "group__foundation.html#gac4b372e9d2d91a0a86d40f7486936993":[2,4,115],
 "group__foundation.html#gac4b372e9d2d91a0a86d40f7486936993":[4,0,114,64],
-"group__foundation.html#gac9e21d1f7a04c78926a89d936c97546f":[2,4,90],
 "group__foundation.html#gac9e21d1f7a04c78926a89d936c97546f":[4,0,114,44],
+"group__foundation.html#gac9e21d1f7a04c78926a89d936c97546f":[2,4,90],
 "group__foundation.html#gaca8cf9822157157ba95923e39999da82":[4,0,147,7],
 "group__foundation.html#gaca8cf9822157157ba95923e39999da82":[2,4,67],
-"group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec":[4,0,114,28],
 "group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec":[2,4,57],
+"group__foundation.html#gacb03347b642a2a5bdea1f9b305a6fbec":[4,0,114,28],
 "group__foundation.html#gacb4cd96e4188d5cc377ab184d5eef081":[4,0,97,39],
 "group__foundation.html#gacb4cd96e4188d5cc377ab184d5eef081":[2,4,222],
-"group__foundation.html#gacc501523ad3ab46c5a061c4442d10fcf":[4,0,97,13],
 "group__foundation.html#gacc501523ad3ab46c5a061c4442d10fcf":[2,4,170],
-"group__foundation.html#gacce5749db3dcfb916e98c253374264ed":[4,0,3,0],
+"group__foundation.html#gacc501523ad3ab46c5a061c4442d10fcf":[4,0,97,13],
 "group__foundation.html#gacce5749db3dcfb916e98c253374264ed":[2,4,125],
+"group__foundation.html#gacce5749db3dcfb916e98c253374264ed":[4,0,3,0],
 "group__foundation.html#gacd512dc7f334b61b26c6ea08fd64cd8b":[2,4,7,1],
 "group__foundation.html#gacd512dc7f334b61b26c6ea08fd64cd8b":[2,4,167],
 "group__foundation.html#gacd945627f345ba03e83a30387ab99dea":[4,0,97,26],
@@ -136,10 +135,10 @@ var NAVTREEINDEX16 =
 "group__foundation.html#gad5970e1d62cfec4e6872517638e7ee61":[2,4,219],
 "group__foundation.html#gad6de0f4af98c5c5280b8fe695dc87318":[4,0,97,18],
 "group__foundation.html#gad6de0f4af98c5c5280b8fe695dc87318":[2,4,179],
-"group__foundation.html#gad76274b09d89544fc4133c8d915293ed":[2,4,60],
 "group__foundation.html#gad76274b09d89544fc4133c8d915293ed":[4,0,114,31],
-"group__foundation.html#gad7d515be586be72c5c485c5624783a97":[2,4,110],
+"group__foundation.html#gad76274b09d89544fc4133c8d915293ed":[2,4,60],
 "group__foundation.html#gad7d515be586be72c5c485c5624783a97":[4,0,114,59],
+"group__foundation.html#gad7d515be586be72c5c485c5624783a97":[2,4,110],
 "group__foundation.html#gadbc9efd74535a6b4f58f28058da1670a":[2,4,7,7],
 "group__foundation.html#gadbc9efd74535a6b4f58f28058da1670a":[2,4,134],
 "group__foundation.html#gaddaa9e455c2d0ae28111358f6c426056":[4,0,97,10],
@@ -152,62 +151,62 @@ var NAVTREEINDEX16 =
 "group__foundation.html#gadff4cde4e8b8443cfbe024b9fb52a891":[4,0,114,10],
 "group__foundation.html#gae0a6cff0842c2d8b1b78656bcd680e1c":[4,0,97,17],
 "group__foundation.html#gae0a6cff0842c2d8b1b78656bcd680e1c":[2,4,175],
-"group__foundation.html#gae34aa1b4d7a85dbe98e9ed2df79c17a0":[4,0,97,25],
 "group__foundation.html#gae34aa1b4d7a85dbe98e9ed2df79c17a0":[2,4,192],
+"group__foundation.html#gae34aa1b4d7a85dbe98e9ed2df79c17a0":[4,0,97,25],
 "group__foundation.html#gae43dc1649f209f1142a57cf14b200ad2":[4,0,97,20],
 "group__foundation.html#gae43dc1649f209f1142a57cf14b200ad2":[2,4,181],
 "group__foundation.html#gae61c2ba4eafd46db3a41bb5d86f3f4a8":[2,4,59],
 "group__foundation.html#gae61c2ba4eafd46db3a41bb5d86f3f4a8":[4,0,114,30],
 "group__foundation.html#gae6e47183b054e4c5d6a478a00fe91e27":[2,4,56],
 "group__foundation.html#gae6e47183b054e4c5d6a478a00fe91e27":[4,0,114,27],
-"group__foundation.html#gae7058022ea016a0af40eb7728a30c928":[2,4,54],
 "group__foundation.html#gae7058022ea016a0af40eb7728a30c928":[4,0,114,25],
-"group__foundation.html#gae776faf7f6d5b480bfff8c2d21b9c4ad":[4,0,171,0],
+"group__foundation.html#gae7058022ea016a0af40eb7728a30c928":[2,4,54],
 "group__foundation.html#gae776faf7f6d5b480bfff8c2d21b9c4ad":[2,4,151],
+"group__foundation.html#gae776faf7f6d5b480bfff8c2d21b9c4ad":[4,0,171,0],
 "group__foundation.html#gae83a283ec08df8515d40c81810628e87":[4,0,114,21],
 "group__foundation.html#gae83a283ec08df8515d40c81810628e87":[2,4,47],
-"group__foundation.html#gae8aa7aabc2f4d376cdac2931bea8014b":[2,4,50],
 "group__foundation.html#gae8aa7aabc2f4d376cdac2931bea8014b":[4,0,147,1],
-"group__foundation.html#gae942e76eff12ad0827a6d3870afb9bc9":[2,4,7,23],
+"group__foundation.html#gae8aa7aabc2f4d376cdac2931bea8014b":[2,4,50],
 "group__foundation.html#gae942e76eff12ad0827a6d3870afb9bc9":[2,4,210],
-"group__foundation.html#gaea189596f11f07c586d4757cc67e8f7f":[2,4,34],
+"group__foundation.html#gae942e76eff12ad0827a6d3870afb9bc9":[2,4,7,23],
 "group__foundation.html#gaea189596f11f07c586d4757cc67e8f7f":[4,0,114,8],
-"group__foundation.html#gaea3adf6a4e304f82c6646618ad633bd2":[2,4,163],
+"group__foundation.html#gaea189596f11f07c586d4757cc67e8f7f":[2,4,34],
 "group__foundation.html#gaea3adf6a4e304f82c6646618ad633bd2":[4,0,97,7],
+"group__foundation.html#gaea3adf6a4e304f82c6646618ad633bd2":[2,4,163],
 "group__foundation.html#gaec15ff1639b763560ddaffc82936ff03":[2,4,112],
 "group__foundation.html#gaec15ff1639b763560ddaffc82936ff03":[4,0,114,61],
 "group__foundation.html#gaee15b10ce840455479f928e5d9f2916b":[4,0,168,0],
 "group__foundation.html#gaee15b10ce840455479f928e5d9f2916b":[2,4,209],
-"group__foundation.html#gaee8f7127123da8f1a6aba82ee0bab638":[4,0,97,37],
 "group__foundation.html#gaee8f7127123da8f1a6aba82ee0bab638":[2,4,220],
+"group__foundation.html#gaee8f7127123da8f1a6aba82ee0bab638":[4,0,97,37],
 "group__foundation.html#gaeedfba7b2ad3d3be35867607faabb11d":[2,4,91],
 "group__foundation.html#gaeedfba7b2ad3d3be35867607faabb11d":[4,0,109,0],
-"group__foundation.html#gaf04275188bc676cf95b678f66f269d38":[2,4,137],
 "group__foundation.html#gaf04275188bc676cf95b678f66f269d38":[2,4,7,10],
+"group__foundation.html#gaf04275188bc676cf95b678f66f269d38":[2,4,137],
 "group__foundation.html#gaf478a0d9ce3c6ec717d8fc74684f6cee":[4,0,97,16],
 "group__foundation.html#gaf478a0d9ce3c6ec717d8fc74684f6cee":[2,4,174],
 "group__foundation.html#gaf4ea3337baa14716f97e5cae7f6047c2":[2,4,15,6],
 "group__foundation.html#gaf4ea3337baa14716f97e5cae7f6047c2":[2,4,193],
-"group__foundation.html#gaf5da2cfad63151151b35afeb43e5b922":[2,4,214],
 "group__foundation.html#gaf5da2cfad63151151b35afeb43e5b922":[2,4,7,27],
+"group__foundation.html#gaf5da2cfad63151151b35afeb43e5b922":[2,4,214],
 "group__foundation.html#gaf72e1dce9e874da7b92cbc0ea5e4e30d":[4,0,97,0],
 "group__foundation.html#gaf72e1dce9e874da7b92cbc0ea5e4e30d":[2,4,156],
-"group__foundation.html#gafb26faf6ab4449f92b90eaedb74d6f3b":[2,4,116],
 "group__foundation.html#gafb26faf6ab4449f92b90eaedb74d6f3b":[4,0,114,65],
+"group__foundation.html#gafb26faf6ab4449f92b90eaedb74d6f3b":[2,4,116],
 "group__foundation.html#gafb73e468103692a6e33a2bb86ef78293":[4,0,99,3],
 "group__foundation.html#gafb73e468103692a6e33a2bb86ef78293":[2,4,189],
 "group__foundation.html#gafce532687809e3b8d5362b34f9b44992":[2,4,143],
 "group__foundation.html#gafce532687809e3b8d5362b34f9b44992":[2,4,7,16],
-"group__foundation.html#gafd3aefa9b2e2bc6856ddff8bc1ff3770":[2,4,68],
 "group__foundation.html#gafd3aefa9b2e2bc6856ddff8bc1ff3770":[4,0,147,8],
+"group__foundation.html#gafd3aefa9b2e2bc6856ddff8bc1ff3770":[2,4,68],
 "group__foundation.html#gafd8f8779793627061774cfcfa1b9fbd7":[2,4,44],
 "group__foundation.html#gafd8f8779793627061774cfcfa1b9fbd7":[4,0,114,18],
 "group__foundation.html#gafed20db31b2806b91c6b6745bac2f9ae":[4,0,97,4],
 "group__foundation.html#gafed20db31b2806b91c6b6745bac2f9ae":[2,4,160],
 "group__foundation.html#gga0326e4ec6e9082ac70f32df5edc6eb5fac1f59d4e552bf7b85ed37efecfd8b8e6":[4,0,3,1,0],
 "group__foundation.html#gga0326e4ec6e9082ac70f32df5edc6eb5fac1f59d4e552bf7b85ed37efecfd8b8e6":[2,4,126,0],
-"group__foundation.html#gga70160ca5bbafef277790b3e0f12baf38a95e14c69af8048aedd218cd6e96ad50f":[2,4,127,0],
 "group__foundation.html#gga70160ca5bbafef277790b3e0f12baf38a95e14c69af8048aedd218cd6e96ad50f":[4,0,3,2,0],
+"group__foundation.html#gga70160ca5bbafef277790b3e0f12baf38a95e14c69af8048aedd218cd6e96ad50f":[2,4,127,0],
 "group__foundation.html#ggac10bd69c94c165a52f6de39678ef9d4ca0ea259a79d3556f2986a96b5fbf8d4bc":[4,0,3,3,0],
 "group__foundation.html#ggac10bd69c94c165a52f6de39678ef9d4ca0ea259a79d3556f2986a96b5fbf8d4bc":[2,4,128,0],
 "group__geomutils.html":[2,7],
@@ -229,25 +228,26 @@ var NAVTREEINDEX16 =
 "group__geomutils.html#ga42446c8ee79edb9cbaa5ecee4fee7d63":[2,7,50],
 "group__geomutils.html#ga43ad91d7d6b8e996338445fa5126bc06":[2,7,17,2],
 "group__geomutils.html#ga43ad91d7d6b8e996338445fa5126bc06":[2,7,48],
-"group__geomutils.html#ga44ff0ebd02fac58691bf604adfc3fc76":[2,7,47],
 "group__geomutils.html#ga44ff0ebd02fac58691bf604adfc3fc76":[2,7,1,3],
+"group__geomutils.html#ga44ff0ebd02fac58691bf604adfc3fc76":[2,7,47],
 "group__geomutils.html#ga5b7658a01ed66c48e8d7967fc99fcb61":[2,7,46],
 "group__geomutils.html#ga5b7658a01ed66c48e8d7967fc99fcb61":[2,7,3,2],
 "group__geomutils.html#ga5be09a494ce1f8aa8e8b986e82fc06d8":[4,0,101,1],
 "group__geomutils.html#ga5be09a494ce1f8aa8e8b986e82fc06d8":[2,7,36],
 "group__geomutils.html#ga60805c0463b00baf463d5281e034a141":[4,0,30,0],
 "group__geomutils.html#ga60805c0463b00baf463d5281e034a141":[2,7,58],
-"group__geomutils.html#ga657370d4666df870740a609fc2037f05":[4,0,75,9],
 "group__geomutils.html#ga657370d4666df870740a609fc2037f05":[2,7,53],
-"group__geomutils.html#ga8024f8608f335fb039e332a3e5afbe89":[4,0,113,0],
+"group__geomutils.html#ga657370d4666df870740a609fc2037f05":[4,0,75,9],
 "group__geomutils.html#ga8024f8608f335fb039e332a3e5afbe89":[2,7,55],
-"group__geomutils.html#ga880239cc33c00344cee0a4e32c573561":[2,7,22,1],
+"group__geomutils.html#ga8024f8608f335fb039e332a3e5afbe89":[4,0,113,0],
 "group__geomutils.html#ga880239cc33c00344cee0a4e32c573561":[2,7,45],
+"group__geomutils.html#ga880239cc33c00344cee0a4e32c573561":[2,7,22,1],
 "group__geomutils.html#ga913cfc3a0f052fb2ea2818e1a31131c2":[4,0,49,0],
 "group__geomutils.html#ga913cfc3a0f052fb2ea2818e1a31131c2":[2,7,39],
 "group__geomutils.html#ga93b547b1968a59b2d23c91aab533e2c8":[2,7,40],
 "group__geomutils.html#ga93b547b1968a59b2d23c91aab533e2c8":[4,0,84,0],
 "group__geomutils.html#ga9acd5763a76bcf9f41b42c3a30a6421a":[2,7,56],
 "group__geomutils.html#ga9acd5763a76bcf9f41b42c3a30a6421a":[2,7,24,0],
+"group__geomutils.html#ga9bff69ddd57aac22aad966a4b30983e7":[4,0,163,1],
 "group__geomutils.html#ga9bff69ddd57aac22aad966a4b30983e7":[2,7,43]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex17.js b/physx/documentation/PhysXAPI/files/navtreeindex17.js
index e454c535a..86e0868bb 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex17.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex17.js
@@ -1,6 +1,5 @@
 var NAVTREEINDEX17 =
 {
-"group__geomutils.html#ga9bff69ddd57aac22aad966a4b30983e7":[4,0,163,1],
 "group__geomutils.html#gaa4aeb7bd1ccfedd348599061276f6d8e":[2,7,44],
 "group__geomutils.html#gaa4aeb7bd1ccfedd348599061276f6d8e":[2,7,25,2],
 "group__geomutils.html#gaaaa6a4b8b94e1bcce0bedb92d873b57e":[4,0,101,0],
@@ -13,11 +12,11 @@ var NAVTREEINDEX17 =
 "group__geomutils.html#gad7702c73625ee858ad6458b97856d475":[4,0,113,1],
 "group__geomutils.html#gadd7ec6c255262ddceeb7e04600bc47b5":[2,7,13,2],
 "group__geomutils.html#gadd7ec6c255262ddceeb7e04600bc47b5":[2,7,60],
-"group__geomutils.html#gaec23462a84179db8a8ea77e5115d5fb1":[2,7,41],
 "group__geomutils.html#gaec23462a84179db8a8ea77e5115d5fb1":[4,0,146,0],
+"group__geomutils.html#gaec23462a84179db8a8ea77e5115d5fb1":[2,7,41],
 "group__immediatemode.html":[2,10],
-"group__immediatemode.html#ga06ca05e94083ad8df2c5377cf692b4bf":[2,10,2],
 "group__immediatemode.html#ga06ca05e94083ad8df2c5377cf692b4bf":[4,0,87,0],
+"group__immediatemode.html#ga06ca05e94083ad8df2c5377cf692b4bf":[2,10,2],
 "group__immediatemode.html#ga3dc6c7ca660d5352b3da3198ab5036c8":[4,0,87,3],
 "group__immediatemode.html#ga3dc6c7ca660d5352b3da3198ab5036c8":[2,10,5],
 "group__immediatemode.html#ga43ee6393438e91aa05bb2790559104b7":[4,0,87,6],
@@ -39,12 +38,12 @@ var NAVTREEINDEX17 =
 "group__physics.html#ga032d753d48dd9e14306d309c2b618410":[2,2,103],
 "group__physics.html#ga03a3549588909e6ca0a491af76146757":[4,0,102,7],
 "group__physics.html#ga03a3549588909e6ca0a491af76146757":[2,2,110],
-"group__physics.html#ga0f17b2c81d2d85ff9ff7f853464c38e7":[4,0,102,13],
 "group__physics.html#ga0f17b2c81d2d85ff9ff7f853464c38e7":[2,2,116],
-"group__physics.html#ga14481630a9253df1784a9cad4e94d904":[4,0,102,8],
+"group__physics.html#ga0f17b2c81d2d85ff9ff7f853464c38e7":[4,0,102,13],
 "group__physics.html#ga14481630a9253df1784a9cad4e94d904":[2,2,111],
-"group__physics.html#ga1619e335e650ffbd52cd4961bc0dea9a":[2,2,123],
+"group__physics.html#ga14481630a9253df1784a9cad4e94d904":[4,0,102,8],
 "group__physics.html#ga1619e335e650ffbd52cd4961bc0dea9a":[4,0,12,0],
+"group__physics.html#ga1619e335e650ffbd52cd4961bc0dea9a":[2,2,123],
 "group__physics.html#ga18053d8127ddb5ed5609e4c748b6ad0d":[4,0,143,0],
 "group__physics.html#ga18053d8127ddb5ed5609e4c748b6ad0d":[2,2,145],
 "group__physics.html#ga190471540583d37595e5532c7534f950":[4,0,102,11],
@@ -57,8 +56,8 @@ var NAVTREEINDEX17 =
 "group__physics.html#ga2ab6803c803ae6372791ff060068c6a1":[2,2,142],
 "group__physics.html#ga2cc87661da498e4f0c4ce26b6656e467":[4,0,102,4],
 "group__physics.html#ga2cc87661da498e4f0c4ce26b6656e467":[2,2,107],
-"group__physics.html#ga314967fa3f63ec7c0b84832edb318458":[4,0,102,14],
 "group__physics.html#ga314967fa3f63ec7c0b84832edb318458":[2,2,117],
+"group__physics.html#ga314967fa3f63ec7c0b84832edb318458":[4,0,102,14],
 "group__physics.html#ga355f2bbe3ef578293e64865eb7986752":[4,0,37,1],
 "group__physics.html#ga355f2bbe3ef578293e64865eb7986752":[2,2,127],
 "group__physics.html#ga39043f3971c7c14f163a41f41982701f":[4,0,102,5],
@@ -75,44 +74,44 @@ var NAVTREEINDEX17 =
 "group__physics.html#ga4d1a71cd5d63cff2bbd97d8bc8899708":[2,2,128],
 "group__physics.html#ga4d2907efdef5e5b7b1becd1ab59c2405":[4,0,9,0],
 "group__physics.html#ga4d2907efdef5e5b7b1becd1ab59c2405":[2,2,125],
-"group__physics.html#ga54a626a9a6d80543048bffc654814704":[2,2,144],
 "group__physics.html#ga54a626a9a6d80543048bffc654814704":[4,0,137,0],
+"group__physics.html#ga54a626a9a6d80543048bffc654814704":[2,2,144],
 "group__physics.html#ga5605c240ca14c7fbfabf545edb73bcdc":[4,0,102,6],
 "group__physics.html#ga5605c240ca14c7fbfabf545edb73bcdc":[2,2,109],
-"group__physics.html#ga59c318a035d7a555f3e2188d757059bf":[2,2,118],
 "group__physics.html#ga59c318a035d7a555f3e2188d757059bf":[4,0,102,15],
+"group__physics.html#ga59c318a035d7a555f3e2188d757059bf":[2,2,118],
 "group__physics.html#ga5c5117095910154a5e39fee736503e72":[4,0,102,10],
 "group__physics.html#ga5c5117095910154a5e39fee736503e72":[2,2,113],
 "group__physics.html#ga5d8a1dc3627cf1442f40d91a5ec6b4e3":[4,0,16,1],
 "group__physics.html#ga5d8a1dc3627cf1442f40d91a5ec6b4e3":[2,2,141],
-"group__physics.html#ga622bf3792efe0007282d836719e80093":[2,2,153],
 "group__physics.html#ga622bf3792efe0007282d836719e80093":[2,2,79,1],
-"group__physics.html#ga675117fb97324a28d3b982b47430ea02":[2,2,136],
+"group__physics.html#ga622bf3792efe0007282d836719e80093":[2,2,153],
 "group__physics.html#ga675117fb97324a28d3b982b47430ea02":[4,0,69,0],
+"group__physics.html#ga675117fb97324a28d3b982b47430ea02":[2,2,136],
 "group__physics.html#ga737af8e8faa77ff454b310957b681ad0":[2,2,96,1],
 "group__physics.html#ga737af8e8faa77ff454b310957b681ad0":[2,2,149],
-"group__physics.html#ga7b0c5783657e45e3fd752adfe3c1d069":[4,0,69,1],
 "group__physics.html#ga7b0c5783657e45e3fd752adfe3c1d069":[2,2,137],
+"group__physics.html#ga7b0c5783657e45e3fd752adfe3c1d069":[4,0,69,1],
 "group__physics.html#ga7c50eeff3e586897d4441a18e43b7237":[2,2,26,0],
 "group__physics.html#ga7c50eeff3e586897d4441a18e43b7237":[2,2,156],
 "group__physics.html#ga7d5660bc1b17b72651434edf7fa022a8":[4,0,102,1],
 "group__physics.html#ga7d5660bc1b17b72651434edf7fa022a8":[2,2,104],
 "group__physics.html#ga7f1b415cf346b53f1eb533e68f22ddb0":[4,0,37,4],
 "group__physics.html#ga7f1b415cf346b53f1eb533e68f22ddb0":[2,2,155],
-"group__physics.html#ga7faafa414d3b4a468091cdaf785a931e":[2,2,81,1],
 "group__physics.html#ga7faafa414d3b4a468091cdaf785a931e":[2,2,154],
-"group__physics.html#ga8039bfec65da68b2294a97175ddb2c66":[4,0,105,1],
+"group__physics.html#ga7faafa414d3b4a468091cdaf785a931e":[2,2,81,1],
 "group__physics.html#ga8039bfec65da68b2294a97175ddb2c66":[2,2,158],
+"group__physics.html#ga8039bfec65da68b2294a97175ddb2c66":[4,0,105,1],
 "group__physics.html#ga824f991be62d7c28eadf32316562408f":[4,0,105,2],
 "group__physics.html#ga824f991be62d7c28eadf32316562408f":[2,2,162],
-"group__physics.html#ga89ec6835295298336ceaca7069ba96e4":[4,0,16,2],
 "group__physics.html#ga89ec6835295298336ceaca7069ba96e4":[2,2,147],
+"group__physics.html#ga89ec6835295298336ceaca7069ba96e4":[4,0,16,2],
 "group__physics.html#ga8a5eb66875b992351ed7038b8001bbea":[4,0,102,2],
 "group__physics.html#ga8a5eb66875b992351ed7038b8001bbea":[2,2,105],
 "group__physics.html#ga8c083fd86a8c52ff269aa4dd3407127b":[2,2,166],
 "group__physics.html#ga8c083fd86a8c52ff269aa4dd3407127b":[2,2,81,0],
-"group__physics.html#ga90395861aa5abb531d3dd02790bc2b18":[4,0,92,0],
 "group__physics.html#ga90395861aa5abb531d3dd02790bc2b18":[2,2,132],
+"group__physics.html#ga90395861aa5abb531d3dd02790bc2b18":[4,0,92,0],
 "group__physics.html#ga90b8a1ec43be7a15eb0efd30404a8f72":[2,2,26,1],
 "group__physics.html#ga90b8a1ec43be7a15eb0efd30404a8f72":[2,2,152],
 "group__physics.html#ga92144d0904f8a696f85737c576b88d9f":[2,2,163],
@@ -123,16 +122,16 @@ var NAVTREEINDEX17 =
 "group__physics.html#gaa4275c674f70ce3f0be586339eb0b045":[2,2,131],
 "group__physics.html#gaa7375184ba494172fa7677dae44bd9a8":[2,2,169],
 "group__physics.html#gaa7375184ba494172fa7677dae44bd9a8":[2,2,81,2],
-"group__physics.html#gaa85dd505b96db854d4ffdbe24fcb9089":[4,0,37,3],
 "group__physics.html#gaa85dd505b96db854d4ffdbe24fcb9089":[2,2,129],
+"group__physics.html#gaa85dd505b96db854d4ffdbe24fcb9089":[4,0,37,3],
 "group__physics.html#gaacd9cb0f0e89fbbc09fec759b254d109":[4,0,96,0],
 "group__physics.html#gaacd9cb0f0e89fbbc09fec759b254d109":[2,2,138],
-"group__physics.html#gab35e15599f740af43f399ed4ad9f04e8":[2,2,143],
 "group__physics.html#gab35e15599f740af43f399ed4ad9f04e8":[4,0,134,0],
-"group__physics.html#gab6d54df954c875ccacf8bddba593cf83":[2,2,150],
+"group__physics.html#gab35e15599f740af43f399ed4ad9f04e8":[2,2,143],
 "group__physics.html#gab6d54df954c875ccacf8bddba593cf83":[2,2,96,2],
-"group__physics.html#gabd83d1895d4f68e1e3e0f406c4c8ce65":[4,0,69,5],
+"group__physics.html#gab6d54df954c875ccacf8bddba593cf83":[2,2,150],
 "group__physics.html#gabd83d1895d4f68e1e3e0f406c4c8ce65":[2,2,160],
+"group__physics.html#gabd83d1895d4f68e1e3e0f406c4c8ce65":[4,0,69,5],
 "group__physics.html#gacacbeccf757e60dbf45089ef382681d9":[2,2,140],
 "group__physics.html#gacacbeccf757e60dbf45089ef382681d9":[4,0,69,2],
 "group__physics.html#gad00ce21229da4335d2cbdee921317aa6":[4,0,37,19],
@@ -141,38 +140,38 @@ var NAVTREEINDEX17 =
 "group__physics.html#gad36024f1760b55b0947dc0d91e080bd7":[2,2,79,0],
 "group__physics.html#gad42d1fc82f3ab8db230e5050f2fe0cd0":[2,2,112],
 "group__physics.html#gad42d1fc82f3ab8db230e5050f2fe0cd0":[4,0,102,9],
-"group__physics.html#gad63c429157f9c8c87fcd0ea2f9b79f66":[2,2,157],
 "group__physics.html#gad63c429157f9c8c87fcd0ea2f9b79f66":[4,0,105,0],
-"group__physics.html#gadf1f469d767ae64422c20cce7c71a6b3":[2,2,115],
+"group__physics.html#gad63c429157f9c8c87fcd0ea2f9b79f66":[2,2,157],
 "group__physics.html#gadf1f469d767ae64422c20cce7c71a6b3":[4,0,102,12],
-"group__physics.html#gae0a66b10a979f7449050bc97669530b2":[2,2,139],
+"group__physics.html#gadf1f469d767ae64422c20cce7c71a6b3":[2,2,115],
 "group__physics.html#gae0a66b10a979f7449050bc97669530b2":[4,0,16,0],
+"group__physics.html#gae0a66b10a979f7449050bc97669530b2":[2,2,139],
 "group__physics.html#gae3a84455caaa6a7de67513ea29a315e1":[4,0,105,5],
 "group__physics.html#gae3a84455caaa6a7de67513ea29a315e1":[2,2,165],
 "group__physics.html#gae42707bb37470210b586a341ff585b78":[4,0,102,3],
 "group__physics.html#gae42707bb37470210b586a341ff585b78":[2,2,106],
 "group__physics.html#gae5d4ef1b3321702c4431da8fe72b8f7f":[4,0,148,2],
 "group__physics.html#gae5d4ef1b3321702c4431da8fe72b8f7f":[2,2,148],
-"group__physics.html#gae673c050f15b0fc912860adbb3394e34":[2,2,79,2],
 "group__physics.html#gae673c050f15b0fc912860adbb3394e34":[2,2,168],
+"group__physics.html#gae673c050f15b0fc912860adbb3394e34":[2,2,79,2],
 "group__physics.html#gae73039b8bd7e7f4a606acfba4811291c":[4,0,148,0],
 "group__physics.html#gae73039b8bd7e7f4a606acfba4811291c":[2,2,130],
 "group__physics.html#gae76639de8e4ab9d5e155f72b394812d2":[4,0,136,0],
 "group__physics.html#gae76639de8e4ab9d5e155f72b394812d2":[2,2,122],
 "group__physics.html#gaec480d86354043f6c7cb13b237914e7f":[2,2,170],
 "group__physics.html#gaec480d86354043f6c7cb13b237914e7f":[4,0,69,7],
-"group__physics.html#gaf2b7f07d3fa7fafaf3d85114fbb120ee":[4,0,36,0],
 "group__physics.html#gaf2b7f07d3fa7fafaf3d85114fbb120ee":[2,2,126],
+"group__physics.html#gaf2b7f07d3fa7fafaf3d85114fbb120ee":[4,0,36,0],
+"group__physics.html#gaf665154f3f66f7c4f65ca9015db8ee87":[4,0,4,1],
 "group__physics.html#gaf665154f3f66f7c4f65ca9015db8ee87":[4,0,136,1],
 "group__physics.html#gaf665154f3f66f7c4f65ca9015db8ee87":[2,2,134],
 "group__physics.html#gaf665154f3f66f7c4f65ca9015db8ee87":[2,2,135],
-"group__physics.html#gaf665154f3f66f7c4f65ca9015db8ee87":[4,0,4,1],
 "group__physics.html#gafa22d81f1318ef3c37f1d746ac686c22":[2,2,164],
 "group__physics.html#gafa22d81f1318ef3c37f1d746ac686c22":[4,0,105,4],
 "group__physics.html#gafb4c5337ad84e1f9eeaa8d3d33caa819":[4,0,62,0],
 "group__physics.html#gafb4c5337ad84e1f9eeaa8d3d33caa819":[2,2,133],
-"group__physics.html#gaffbcca4cdc677e1e2fe5cf379146727e":[4,0,102,16],
 "group__physics.html#gaffbcca4cdc677e1e2fe5cf379146727e":[2,2,119],
+"group__physics.html#gaffbcca4cdc677e1e2fe5cf379146727e":[4,0,102,16],
 "group__pvd.html":[2,8],
 "group__pvd.html#ga12ad7e982ca0e3d97f15c6bac4c479a2":[4,0,121,0],
 "group__pvd.html#ga12ad7e982ca0e3d97f15c6bac4c479a2":[2,8,9],
@@ -180,8 +179,8 @@ var NAVTREEINDEX17 =
 "group__pvd.html#ga23ddab69994886fb588a139635bff64b":[2,8,7],
 "group__pvd.html#ga4e8b7695b7145523688243b39da116c9":[4,0,119,0],
 "group__pvd.html#ga4e8b7695b7145523688243b39da116c9":[2,8,6],
-"group__pvd.html#gaba53e1ef15e10cade433600f8f7954c4":[2,8,10],
 "group__pvd.html#gaba53e1ef15e10cade433600f8f7954c4":[4,0,121,1],
+"group__pvd.html#gaba53e1ef15e10cade433600f8f7954c4":[2,8,10],
 "group__pvd.html#gaf0cb52973319101ff0f34f985c5093e7":[2,8,8],
 "group__pvd.html#gaf0cb52973319101ff0f34f985c5093e7":[4,0,119,1],
 "group__scenequery.html":[2,9],
@@ -199,27 +198,27 @@ var NAVTREEINDEX17 =
 "group__scenequery.html#ga8cbf76701572506b8120b8e54ac319c7":[2,9,18],
 "group__scenequery.html#ga9baa3a8cad31bbb8ef666fe01b3afeb7":[4,0,124,2],
 "group__scenequery.html#ga9baa3a8cad31bbb8ef666fe01b3afeb7":[2,9,21],
-"group__scenequery.html#gaa231e804efb78bc7d3405c9e4bd31502":[2,9,28],
 "group__scenequery.html#gaa231e804efb78bc7d3405c9e4bd31502":[4,0,123,4],
-"group__scenequery.html#gaacf083b040f6c841644782282171bd61":[4,0,123,2],
+"group__scenequery.html#gaa231e804efb78bc7d3405c9e4bd31502":[2,9,28],
 "group__scenequery.html#gaacf083b040f6c841644782282171bd61":[2,9,22],
+"group__scenequery.html#gaacf083b040f6c841644782282171bd61":[4,0,123,2],
 "group__scenequery.html#gab21a68ce9e5a18aa742111920b75a84c":[4,0,124,3],
 "group__scenequery.html#gab21a68ce9e5a18aa742111920b75a84c":[2,9,23],
-"group__scenequery.html#gab2564436bacfef3130eb9f0c9070656e":[2,9,19],
 "group__scenequery.html#gab2564436bacfef3130eb9f0c9070656e":[4,0,123,1],
+"group__scenequery.html#gab2564436bacfef3130eb9f0c9070656e":[2,9,19],
 "group__scenequery.html#gaecfed2b83a930b922b3d95e22ff30665":[2,9,25],
 "group__scenequery.html#gaecfed2b83a930b922b3d95e22ff30665":[4,0,124,5],
 "group__vehicle.html":[2,11],
-"group__vehicle.html#ga0c3a7dea79cf9968bd350a49ce0fd770":[2,11,47],
 "group__vehicle.html#ga0c3a7dea79cf9968bd350a49ce0fd770":[4,0,179,0],
+"group__vehicle.html#ga0c3a7dea79cf9968bd350a49ce0fd770":[2,11,47],
 "group__vehicle.html#ga1acc47dad2914768ee6074320b0a5643":[2,11,64],
 "group__vehicle.html#ga1acc47dad2914768ee6074320b0a5643":[4,0,182,0],
-"group__vehicle.html#ga2020b9fcb5092e2a2d81e82ba7461dfd":[4,0,182,3],
 "group__vehicle.html#ga2020b9fcb5092e2a2d81e82ba7461dfd":[2,11,71],
+"group__vehicle.html#ga2020b9fcb5092e2a2d81e82ba7461dfd":[4,0,182,3],
 "group__vehicle.html#ga2b7b927dd3d949d50a3f42c695264f43":[2,11,49],
 "group__vehicle.html#ga2b7b927dd3d949d50a3f42c695264f43":[4,0,180,0],
-"group__vehicle.html#ga2b7e00f8931f3b84d3aa6510d7a6cfaa":[4,0,184,5],
 "group__vehicle.html#ga2b7e00f8931f3b84d3aa6510d7a6cfaa":[2,11,61],
+"group__vehicle.html#ga2b7e00f8931f3b84d3aa6510d7a6cfaa":[4,0,184,5],
 "group__vehicle.html#ga2da633b926f231e27c27dcb877535bc3":[2,11,59],
 "group__vehicle.html#ga2da633b926f231e27c27dcb877535bc3":[4,0,184,3],
 "group__vehicle.html#ga34079f5978ce1dfcf2a991c0458108f2":[4,0,185,0],
@@ -230,24 +229,25 @@ var NAVTREEINDEX17 =
 "group__vehicle.html#ga3c9f3da897c9294c9164aa41b18150e3":[2,11,72],
 "group__vehicle.html#ga47aff43683966ca9d1118a1bf4a1f5c2":[4,0,182,5],
 "group__vehicle.html#ga47aff43683966ca9d1118a1bf4a1f5c2":[2,11,74],
-"group__vehicle.html#ga4eb698c19004af6a07c2b2a666f0c1ce":[2,11,67],
 "group__vehicle.html#ga4eb698c19004af6a07c2b2a666f0c1ce":[4,0,179,5],
-"group__vehicle.html#ga61b39294176ee679d4d1229d3ad95966":[2,11,58],
+"group__vehicle.html#ga4eb698c19004af6a07c2b2a666f0c1ce":[2,11,67],
 "group__vehicle.html#ga61b39294176ee679d4d1229d3ad95966":[4,0,184,2],
+"group__vehicle.html#ga61b39294176ee679d4d1229d3ad95966":[2,11,58],
 "group__vehicle.html#ga784f2317a01aa2367b6170bfabdc5097":[4,0,184,1],
 "group__vehicle.html#ga784f2317a01aa2367b6170bfabdc5097":[2,11,57],
 "group__vehicle.html#ga7c8330d753c24dc9a14d11a0355768a9":[2,11,66],
 "group__vehicle.html#ga7c8330d753c24dc9a14d11a0355768a9":[4,0,179,4],
 "group__vehicle.html#ga89b530b003b89280677e09327fbdb606":[4,0,184,6],
 "group__vehicle.html#ga89b530b003b89280677e09327fbdb606":[2,11,62],
+"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[2,11,50],
 "group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,178,0],
-"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,176,0],
-"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,187,0],
-"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,173,0],
 "group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,184,0],
 "group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,175,0],
+"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,176,0],
+"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,174,0],
+"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,173,0],
 "group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,177,0],
 "group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,181,0],
-"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[2,11,50],
-"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,174,0]
+"group__vehicle.html#ga8d4cfac998db5828bcfff2253c3d1fcc":[4,0,187,0],
+"group__vehicle.html#ga946f4d23cff6fdf36dca694641e20779":[4,0,185,4]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex18.js b/physx/documentation/PhysXAPI/files/navtreeindex18.js
index 31b4eae9e..c1c1cad67 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex18.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex18.js
@@ -1,29 +1,28 @@
 var NAVTREEINDEX18 =
 {
-"group__vehicle.html#ga946f4d23cff6fdf36dca694641e20779":[4,0,185,4],
 "group__vehicle.html#ga946f4d23cff6fdf36dca694641e20779":[2,11,73],
 "group__vehicle.html#ga9e1512107497cdfcc620b575d3c135e6":[2,11,51],
 "group__vehicle.html#ga9e1512107497cdfcc620b575d3c135e6":[4,0,179,2],
 "group__vehicle.html#gaa3f9a31ecbd63962ee372a8b922dd23c":[2,11,56],
 "group__vehicle.html#gaa3f9a31ecbd63962ee372a8b922dd23c":[4,0,185,3],
-"group__vehicle.html#gaa5280613d99e11cf175331dfff96df34":[4,0,185,1],
 "group__vehicle.html#gaa5280613d99e11cf175331dfff96df34":[2,11,54],
+"group__vehicle.html#gaa5280613d99e11cf175331dfff96df34":[4,0,185,1],
 "group__vehicle.html#gaa52f6479b78eccd706af05e1e0a0c43f":[4,0,179,1],
 "group__vehicle.html#gaa52f6479b78eccd706af05e1e0a0c43f":[2,11,48],
 "group__vehicle.html#gaa537690bce749bdce4e5eca80d43d53a":[4,0,184,4],
 "group__vehicle.html#gaa537690bce749bdce4e5eca80d43d53a":[2,11,60],
 "group__vehicle.html#gaa7aaf3de1f2da04c2617a36d053398ca":[4,0,179,3],
 "group__vehicle.html#gaa7aaf3de1f2da04c2617a36d053398ca":[2,11,52],
-"group__vehicle.html#gabfa604162177064bc35b16627f2053d1":[2,11,55],
 "group__vehicle.html#gabfa604162177064bc35b16627f2053d1":[4,0,185,2],
-"group__vehicle.html#gae62e4d3360ae24537d7fe2100e79f24c":[4,0,182,2],
+"group__vehicle.html#gabfa604162177064bc35b16627f2053d1":[2,11,55],
 "group__vehicle.html#gae62e4d3360ae24537d7fe2100e79f24c":[2,11,70],
-"group__vehicle.html#gae7390efc088e2d8624da2ba007bfbb6e":[2,11,63],
+"group__vehicle.html#gae62e4d3360ae24537d7fe2100e79f24c":[4,0,182,2],
 "group__vehicle.html#gae7390efc088e2d8624da2ba007bfbb6e":[4,0,183,0],
+"group__vehicle.html#gae7390efc088e2d8624da2ba007bfbb6e":[2,11,63],
 "group__vehicle.html#gaf1eb208ee9ffdbfa91d20ee3bb5087e0":[2,11,68],
 "group__vehicle.html#gaf1eb208ee9ffdbfa91d20ee3bb5087e0":[4,0,179,6],
-"group__vehicle.html#gaf9349122c5cf95fec09bd09ce5e540d2":[2,11,65],
 "group__vehicle.html#gaf9349122c5cf95fec09bd09ce5e540d2":[4,0,182,1],
+"group__vehicle.html#gaf9349122c5cf95fec09bd09ce5e540d2":[2,11,65],
 "hierarchy.html":[3,2],
 "index.html":[],
 "index.html#intro_sec":[0],
@@ -97,10 +96,8 @@ var NAVTREEINDEX18 =
 "structPxArticulationFlag.html#a89b0e25f041302ae04e5cc0b509a7609a625f41920c4a20ea5317fdc939a74674":[2,2,18,0,0],
 "structPxArticulationJointDriveType.html":[2,2,10],
 "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2":[2,2,10,0],
-"structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372":[2,2,10,0,2],
-"structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e":[2,2,10,0,3],
-"structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e":[2,2,10,0,1],
-"structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6":[2,2,10,0,0],
+"structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372":[2,2,10,0,1],
+"structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e":[2,2,10,0,0],
 "structPxArticulationJointType.html":[2,2,13],
 "structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0b":[2,2,13,0],
 "structPxArticulationJointType.html#a0df5bb8217acae97e63421bedbfe0a0ba20eb6c657a0b29898c2ab308b2e68ba3":[2,2,13,0,0],
@@ -249,5 +246,8 @@ var NAVTREEINDEX18 =
 "structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427a519966ef860b18d3609d7c08f66389ef":[2,2,32,0,8],
 "structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427a6ad8d7391857323f035ce2ebddee7595":[2,2,32,0,4],
 "structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427a7b8997cb72e8bfe7189e62aaa861a820":[2,2,32,0,9],
-"structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427a8e00c572e54f3575284ff41432f1bec6":[2,2,32,0,10]
+"structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427a8e00c572e54f3575284ff41432f1bec6":[2,2,32,0,10],
+"structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427a91151f48e43ade5a5d7153f1b3160696":[2,2,32,0,0],
+"structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427aef261aa0d46aba737d058fb32bc87c02":[2,2,32,0,5],
+"structPxConstraintInfo.html":[2,2,99]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex19.js b/physx/documentation/PhysXAPI/files/navtreeindex19.js
index 18e0e1b1c..a4cfdc1f4 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex19.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex19.js
@@ -1,8 +1,5 @@
 var NAVTREEINDEX19 =
 {
-"structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427a91151f48e43ade5a5d7153f1b3160696":[2,2,32,0,0],
-"structPxConstraintFlag.html#a86960e99b3e80ddb9e0ab51d7afc3427aef261aa0d46aba737d058fb32bc87c02":[2,2,32,0,5],
-"structPxConstraintInfo.html":[2,2,99],
 "structPxConstraintInfo.html#a073f7587ffb496c4bdc470eb34032fd2":[2,2,99,0],
 "structPxConstraintInfo.html#a324fadc19904fff635aa9d16da7a21f9":[2,2,99,4],
 "structPxConstraintInfo.html#a38e7eda9e38f5318b9d4dc27fa46316b":[2,2,99,2],
@@ -249,5 +246,8 @@ var NAVTREEINDEX19 =
 "structPxConvexMeshCookingResult.html#ac1c7b1cbfc80b0af00bec95faf1ebf42":[2,5,6,0],
 "structPxConvexMeshCookingResult.html#ac1c7b1cbfc80b0af00bec95faf1ebf42a01b3d1321ecf14bb165447a575dacfb5":[2,5,6,0,2],
 "structPxConvexMeshCookingResult.html#ac1c7b1cbfc80b0af00bec95faf1ebf42a286d9d3815141af3d638cda466ee4396":[2,5,6,0,3],
-"structPxConvexMeshCookingResult.html#ac1c7b1cbfc80b0af00bec95faf1ebf42aefc22064dc3a0abe956c25394a5261d2":[2,5,6,0,0]
+"structPxConvexMeshCookingResult.html#ac1c7b1cbfc80b0af00bec95faf1ebf42aefc22064dc3a0abe956c25394a5261d2":[2,5,6,0,0],
+"structPxConvexMeshCookingResult.html#ac1c7b1cbfc80b0af00bec95faf1ebf42affbc9585c9485ff2719bbfdd27df3b90":[2,5,6,0,1],
+"structPxConvexMeshCookingType.html":[2,5,7],
+"structPxConvexMeshCookingType.html#a5a2ae723aca74c185675cd7ba2c9c115":[2,5,7,0]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex2.js b/physx/documentation/PhysXAPI/files/navtreeindex2.js
index 4997796e2..9217b7590 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex2.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex2.js
@@ -225,10 +225,10 @@ var NAVTREEINDEX2 =
 "classPxArticulationReducedCoordinate.html#a0a5026aa7fa58fd96a3feb3808427b71":[2,2,22,6],
 "classPxArticulationReducedCoordinate.html#a0d9eeb50991fab399d0d7dbe756c1106":[2,2,22,3],
 "classPxArticulationReducedCoordinate.html#a10773e54bc023d0a4c6627c363fe1b23":[2,2,22,15],
-"classPxArticulationReducedCoordinate.html#a10d36a5ac6bb414b2240774cabdc4695":[2,2,22,29],
+"classPxArticulationReducedCoordinate.html#a10d36a5ac6bb414b2240774cabdc4695":[2,2,22,30],
 "classPxArticulationReducedCoordinate.html#a15d00fc95ae61c2906977453e7597d9b":[2,2,22,24],
 "classPxArticulationReducedCoordinate.html#a1a6a5ea9fbce5fab1389809cde5ce12a":[2,2,22,2],
-"classPxArticulationReducedCoordinate.html#a2d067a41dc5bcbbdbf9a298d9064b310":[2,2,22,28],
+"classPxArticulationReducedCoordinate.html#a2d067a41dc5bcbbdbf9a298d9064b310":[2,2,22,29],
 "classPxArticulationReducedCoordinate.html#a3202bc9b19b72c88c7ea32421dce3cc9":[2,2,22,0],
 "classPxArticulationReducedCoordinate.html#a32973e41c51f9dd6ed0168c401d620d7":[2,2,22,23],
 "classPxArticulationReducedCoordinate.html#a3e8a26a6e87d34c31441d766b0d6a153":[2,2,22,17],
@@ -237,17 +237,17 @@ var NAVTREEINDEX2 =
 "classPxArticulationReducedCoordinate.html#a51b6d2bcecd81b268219e7a5d1d8f10c":[2,2,22,1],
 "classPxArticulationReducedCoordinate.html#a5a611783b278756d562f4dfe0626af18":[2,2,22,21],
 "classPxArticulationReducedCoordinate.html#a86107c6d84d3132b6a46bb9b2b48666e":[2,2,22,18],
-"classPxArticulationReducedCoordinate.html#a919526ace70d5aaa4fb34ad0fcc98fd0":[2,2,22,30],
+"classPxArticulationReducedCoordinate.html#a919526ace70d5aaa4fb34ad0fcc98fd0":[2,2,22,31],
 "classPxArticulationReducedCoordinate.html#a92e7a37bc7b058364398265b80996607":[2,2,22,10],
 "classPxArticulationReducedCoordinate.html#a95e6f19bc92d7003e4d735a26b6ebb46":[2,2,22,8],
 "classPxArticulationReducedCoordinate.html#a96a853b57a61275d4c67828f34e3321d":[2,2,22,19],
 "classPxArticulationReducedCoordinate.html#a99f7f83876a3414eb386c83e173f9364":[2,2,22,16],
 "classPxArticulationReducedCoordinate.html#aa89014cf296753fd9ddffbd62ef46c4c":[2,2,22,22],
 "classPxArticulationReducedCoordinate.html#ab14e3d8440c6dc754fa3318a7c91991d":[2,2,22,4],
-"classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564":[2,2,22,27],
+"classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564":[2,2,22,28],
 "classPxArticulationReducedCoordinate.html#abe5b98d8b80aa4781ad1d5e9baffad25":[2,2,22,5],
 "classPxArticulationReducedCoordinate.html#ad6baf5f760905b778c6d5478b9c63302":[2,2,22,13],
 "classPxArticulationReducedCoordinate.html#ae3b3c188ffd64c4ba7cfd7508efd63bf":[2,2,22,25],
-"classPxArticulationReducedCoordinate.html#af1947172f99feef066872b12d675d046":[2,2,22,7],
-"classPxArticulationReducedCoordinate.html#af8b5217bd48e6d1ccc8ba3e495c66df7":[2,2,22,9]
+"classPxArticulationReducedCoordinate.html#aea7152a77f9446960d65359e482595c4":[2,2,22,27],
+"classPxArticulationReducedCoordinate.html#af1947172f99feef066872b12d675d046":[2,2,22,7]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex20.js b/physx/documentation/PhysXAPI/files/navtreeindex20.js
index 78ca3b74e..2f0cf25f1 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex20.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex20.js
@@ -1,8 +1,5 @@
 var NAVTREEINDEX20 =
 {
-"structPxConvexMeshCookingResult.html#ac1c7b1cbfc80b0af00bec95faf1ebf42affbc9585c9485ff2719bbfdd27df3b90":[2,5,6,0,1],
-"structPxConvexMeshCookingType.html":[2,5,7],
-"structPxConvexMeshCookingType.html#a5a2ae723aca74c185675cd7ba2c9c115":[2,5,7,0],
 "structPxConvexMeshCookingType.html#a5a2ae723aca74c185675cd7ba2c9c115a18916173251aae02128c79b69af46906":[2,5,7,0,0],
 "structPxConvexMeshGeometryFlag.html":[2,7,6],
 "structPxConvexMeshGeometryFlag.html#a4d8558cca9e3f622b336550b96c25662":[2,7,6,0],
@@ -249,5 +246,8 @@ var NAVTREEINDEX20 =
 "structPxHitCallback.html#a63b2851eaf99b190fc468e00e8d1e55d":[2,9,12,5],
 "structPxHitCallback.html#a7fbb5959c9fcc98834745644422210c3":[2,9,12,1],
 "structPxHitCallback.html#a86b66684b448ec43083edf3aa1983234":[2,9,12,2],
-"structPxHitCallback.html#a8ff687b42ac0273e730c714bf66dae49":[2,9,12,3]
+"structPxHitCallback.html#a8ff687b42ac0273e730c714bf66dae49":[2,9,12,3],
+"structPxHitCallback.html#a9c6fc25eb66c22dad2a35d5630d4ec68":[2,9,12,4],
+"structPxHitCallback.html#aa3fa478af9db3e043a540fca72a2f468":[2,9,12,9],
+"structPxHitCallback.html#ac20c6d1c2dd520d771da9b63f7f4f363":[2,9,12,6]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex21.js b/physx/documentation/PhysXAPI/files/navtreeindex21.js
index 4742f9678..4ae6df250 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex21.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex21.js
@@ -1,8 +1,5 @@
 var NAVTREEINDEX21 =
 {
-"structPxHitCallback.html#a9c6fc25eb66c22dad2a35d5630d4ec68":[2,9,12,4],
-"structPxHitCallback.html#aa3fa478af9db3e043a540fca72a2f468":[2,9,12,9],
-"structPxHitCallback.html#ac20c6d1c2dd520d771da9b63f7f4f363":[2,9,12,6],
 "structPxHitFlag.html":[2,9,5],
 "structPxHitFlag.html#a44c173f6ddf0522ffbd8fa3c585e6b64":[2,9,5,0],
 "structPxHitFlag.html#a44c173f6ddf0522ffbd8fa3c585e6b64a21e4769575db55be01d8f91e06c59b1a":[2,9,5,0,0],
@@ -249,5 +246,8 @@ var NAVTREEINDEX21 =
 "structPxRigidBodyData.html#aa523e1704f34568a599f4744f0b6cbbc":[2,10,0,7],
 "structPxRigidBodyData.html#adbfbf35988384b74c8c9fa4280a7c45b":[2,10,0,4],
 "structPxRigidBodyData.html#afe4e0a693288590b7ce8383bc3e2cd79":[2,10,0,10],
-"structPxRigidBodyFlag.html":[2,2,65]
+"structPxRigidBodyFlag.html":[2,2,65],
+"structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596":[2,2,65,0],
+"structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596a37775d7c189d9f499fdfda010f73fb95":[2,2,65,0,3],
+"structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596a686cdd889652a435c19d8c663d809d9e":[2,2,65,0,2]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex22.js b/physx/documentation/PhysXAPI/files/navtreeindex22.js
index 4a2144968..d44bf67c3 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex22.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex22.js
@@ -1,8 +1,5 @@
 var NAVTREEINDEX22 =
 {
-"structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596":[2,2,65,0],
-"structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596a37775d7c189d9f499fdfda010f73fb95":[2,2,65,0,3],
-"structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596a686cdd889652a435c19d8c663d809d9e":[2,2,65,0,2],
 "structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596a983c62745e575428d8b0aadefc23e8bc":[2,2,65,0,1],
 "structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596ab2bb4cc4246807076a2103d72bd69e00":[2,2,65,0,6],
 "structPxRigidBodyFlag.html#a5fd4878ae66a98c030a9d976e8ba8596abd76e6985e9db78efb7a66148ea4c212":[2,2,65,0,0],
@@ -60,16 +57,16 @@ var NAVTREEINDEX22 =
 "structPxSolverBody.html#abe151efeabf33d524e5589fc9897fbd7":[3,0,284,5],
 "structPxSolverBody.html#afa5845fa664df156d2a25300ea00593e":[3,0,284,0],
 "structPxSolverBodyData.html":[3,0,285],
-"structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415":[3,0,285,6],
+"structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415":[3,0,285,5],
 "structPxSolverBodyData.html#a1d32e78a191473936a6693b40185ad91":[3,0,285,3],
 "structPxSolverBodyData.html#a2ab7bc060f7946ae4f3bb1da17018326":[3,0,285,2],
 "structPxSolverBodyData.html#a5bee162de56be8899aa26c179e753919":[3,0,285,1],
-"structPxSolverBodyData.html#a668762cdcd8004c19e033952be2206b6":[3,0,285,5],
+"structPxSolverBodyData.html#a668762cdcd8004c19e033952be2206b6":[3,0,285,4],
 "structPxSolverBodyData.html#a74e634af20acbf0a37246a5ec44f2248":[3,0,285,10],
-"structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f":[3,0,285,4],
+"structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7":[3,0,285,8],
 "structPxSolverBodyData.html#aa443999488f683690658da520f1726c9":[3,0,285,0],
-"structPxSolverBodyData.html#ac01024b5307ea57e79c7891d89906296":[3,0,285,8],
-"structPxSolverBodyData.html#ac974d6181623739bdc087b02c1dd311b":[3,0,285,7],
+"structPxSolverBodyData.html#ac01024b5307ea57e79c7891d89906296":[3,0,285,7],
+"structPxSolverBodyData.html#ac974d6181623739bdc087b02c1dd311b":[3,0,285,6],
 "structPxSolverBodyData.html#ade2d671c9125af9907938a425f6b9af2":[3,0,285,9],
 "structPxSolverBodyData.html#aeaab06a3dfac94c9ccde5e40f966f8af":[3,0,285,11],
 "structPxSolverConstraintDesc.html":[3,0,286],
@@ -249,5 +246,8 @@ var NAVTREEINDEX22 =
 "structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a5c6c2015046be8d796e0103206aa59ff":[2,11,24,0,3],
 "structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a5cb2b2d8c58576c6a746d8ce8749808e":[2,11,24,0,5],
 "structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a6efcc9cfe91fb42e555ff21923ba0328":[2,11,24,0,2],
-"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a74a5e899e35c8aabf662a0696ae881f2":[2,11,24,0,15]
+"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a74a5e899e35c8aabf662a0696ae881f2":[2,11,24,0,15],
+"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a7e60ef3f98ed6b6a2d76920c75a968c7":[2,11,24,0,18],
+"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a958d5a4ff912514b9aea476d68aef4ec":[2,11,24,0,17],
+"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7aa352c090f43ae152036629a0a2fb5703":[2,11,24,0,4]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex23.js b/physx/documentation/PhysXAPI/files/navtreeindex23.js
index 8647f4c98..73188d632 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex23.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex23.js
@@ -1,8 +1,5 @@
 var NAVTREEINDEX23 =
 {
-"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a7e60ef3f98ed6b6a2d76920c75a968c7":[2,11,24,0,18],
-"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7a958d5a4ff912514b9aea476d68aef4ec":[2,11,24,0,17],
-"structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7aa352c090f43ae152036629a0a2fb5703":[2,11,24,0,4],
 "structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7aa54dcdaac81056baa6b9257d27df1b83":[2,11,24,0,19],
 "structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7ab02c19296f2de9225bfeec08c33d5147":[2,11,24,0,1],
 "structPxVehicleDriveTankWheelOrder.html#ad777dedf3db655580fb92de098646cb7ab3f9967f4d119ccce0e77d6ed93e207f":[2,11,24,0,12],
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex3.js b/physx/documentation/PhysXAPI/files/navtreeindex3.js
index 5bc11125f..4d52f080e 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex3.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex3.js
@@ -1,5 +1,6 @@
 var NAVTREEINDEX3 =
 {
+"classPxArticulationReducedCoordinate.html#af8b5217bd48e6d1ccc8ba3e495c66df7":[2,2,22,9],
 "classPxArticulationReducedCoordinate.html#afc82b045f2afb76484ea967515c01067":[2,2,22,11],
 "classPxAssertHandler.html":[2,4,5],
 "classPxAssertHandler.html#a65e427030347eb6bd67d859f3e562563":[2,4,5,0],
@@ -248,6 +249,5 @@ var NAVTREEINDEX3 =
 "classPxContactSet.html#a2e374d99ff72fcf410a070ee3bf17abb":[2,2,43,27],
 "classPxContactSet.html#a360344f51cce5d01c69701e3dcc3747b":[2,2,43,16],
 "classPxContactSet.html#a614a3d5bbdfe9fccce114e0ce64913a8":[2,2,43,29],
-"classPxContactSet.html#a649785dca7f48629279cd0dccf73f18d":[2,2,43,0],
-"classPxContactSet.html#a6535e80819a6d2a5bbe38db6aa86684e":[2,2,43,2]
+"classPxContactSet.html#a649785dca7f48629279cd0dccf73f18d":[2,2,43,0]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex4.js b/physx/documentation/PhysXAPI/files/navtreeindex4.js
index edaaa4568..909f83e06 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex4.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex4.js
@@ -1,5 +1,6 @@
 var NAVTREEINDEX4 =
 {
+"classPxContactSet.html#a6535e80819a6d2a5bbe38db6aa86684e":[2,2,43,2],
 "classPxContactSet.html#a6b38910a19843d40816bc7807c482bff":[2,2,43,7],
 "classPxContactSet.html#a6c480bbee40ef09750a07e43dffe53dc":[2,2,43,6],
 "classPxContactSet.html#a7262db2928f9ba341c17909864250754":[2,2,43,12],
@@ -248,6 +249,5 @@ var NAVTREEINDEX4 =
 "classPxDeserializationContext.html":[2,1,17],
 "classPxDeserializationContext.html#a155f00275087cb2faf4819bca9d21989":[2,1,17,8],
 "classPxDeserializationContext.html#a18007433a2cc90ef8ccf4353fb8b123d":[2,1,17,0],
-"classPxDeserializationContext.html#a2299e472c6c4dc486cbd08385efd31d9":[2,1,17,1],
-"classPxDeserializationContext.html#a393b7ca347c967d330791d19b2e37006":[2,1,17,6]
+"classPxDeserializationContext.html#a2299e472c6c4dc486cbd08385efd31d9":[2,1,17,1]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex5.js b/physx/documentation/PhysXAPI/files/navtreeindex5.js
index ae9a2112e..88fa11fd6 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex5.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex5.js
@@ -1,5 +1,6 @@
 var NAVTREEINDEX5 =
 {
+"classPxDeserializationContext.html#a393b7ca347c967d330791d19b2e37006":[2,1,17,6],
 "classPxDeserializationContext.html#a5c9fff58c41cd679b10359b836851263":[2,1,17,5],
 "classPxDeserializationContext.html#a91c4d17f9b7f7acd3cd7a119fe701939":[2,1,17,9],
 "classPxDeserializationContext.html#ab190b79866da182afa6f3f23f48d1d37":[2,1,17,7],
@@ -248,6 +249,5 @@ var NAVTREEINDEX5 =
 "classPxInputStream.html#a74fdc1e5af87577e7cc13cc99894b2b3":[2,1,22,1],
 "classPxInputStream.html#ae9bbfe54d3bd96668f6dd19b30737e26":[2,1,22,0],
 "classPxJoint.html":[2,3,26],
-"classPxJoint.html#a116450043c073548c1c55d032014f7e7":[2,3,26,16],
-"classPxJoint.html#a1cfe86fe5c8131cea1b9b5ff9df7a014":[2,3,26,21]
+"classPxJoint.html#a116450043c073548c1c55d032014f7e7":[2,3,26,16]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex6.js b/physx/documentation/PhysXAPI/files/navtreeindex6.js
index 7674c78c8..2fa13d9ce 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex6.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex6.js
@@ -1,5 +1,6 @@
 var NAVTREEINDEX6 =
 {
+"classPxJoint.html#a1cfe86fe5c8131cea1b9b5ff9df7a014":[2,3,26,21],
 "classPxJoint.html#a1da1a62c8aa565a92ff25c83825d1657":[2,3,26,7],
 "classPxJoint.html#a2c43517c6bc6d77062c4563e45a1dd99":[2,3,26,0],
 "classPxJoint.html#a2f5f6c66b5e6933f8578a880a4a957ad":[2,3,26,27],
@@ -248,6 +249,5 @@ var NAVTREEINDEX6 =
 "classPxObstacleContext.html#a228a7b8cb5900251d7888ee0f4d912cd":[2,0,26,5],
 "classPxObstacleContext.html#a3ea7ddb35145ea2ca1184ae7d8ce7b15":[2,0,26,8],
 "classPxObstacleContext.html#a53b1408cf617fc505dc1bbf19f94f1ff":[2,0,26,0],
-"classPxObstacleContext.html#a73ea47d73adaeda1228fd5030ee8cdd2":[2,0,26,9],
-"classPxObstacleContext.html#a9390f860fb489f283c9f872af2777123":[2,0,26,3]
+"classPxObstacleContext.html#a73ea47d73adaeda1228fd5030ee8cdd2":[2,0,26,9]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex7.js b/physx/documentation/PhysXAPI/files/navtreeindex7.js
index 4986bee2c..dddd59ef7 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex7.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex7.js
@@ -1,5 +1,6 @@
 var NAVTREEINDEX7 =
 {
+"classPxObstacleContext.html#a9390f860fb489f283c9f872af2777123":[2,0,26,3],
 "classPxObstacleContext.html#aaf653d99de6f092896cc1baa2a4e99c2":[2,0,26,7],
 "classPxObstacleContext.html#adf341c13bb11532604ee7012228061f6":[2,0,26,1],
 "classPxObstacleContext.html#ae55da4495a10d1c310068e23d87ce7d5":[2,0,26,4],
@@ -248,6 +249,5 @@ var NAVTREEINDEX7 =
 "classPxRigidBody.html#a755d0c8a8d1dd8b29e59d50a6dfda5fd":[2,2,66,31],
 "classPxRigidBody.html#a8052320915132f7d7f72ede4543b64ad":[2,2,66,17],
 "classPxRigidBody.html#a8a697a7a4b9bdd2c83a68e84b9bc3a35":[2,2,66,30],
-"classPxRigidBody.html#a9b23b890404b1010bf0b67a225bd22e7":[2,2,66,37],
-"classPxRigidBody.html#a9e6f3afd71605e037a5de47955d941e0":[2,2,66,38]
+"classPxRigidBody.html#a9b23b890404b1010bf0b67a225bd22e7":[2,2,66,37]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex8.js b/physx/documentation/PhysXAPI/files/navtreeindex8.js
index 6499043c0..b1b549b48 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex8.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex8.js
@@ -1,5 +1,6 @@
 var NAVTREEINDEX8 =
 {
+"classPxRigidBody.html#a9e6f3afd71605e037a5de47955d941e0":[2,2,66,38],
 "classPxRigidBody.html#aafdbdab1865112b15201aeabb23877b4":[2,2,66,9],
 "classPxRigidBody.html#aaffbfa748d3a75e79a13dbca92aaa7ff":[2,2,66,7],
 "classPxRigidBody.html#ab152773926fe7b222d61e982c3cb6adf":[2,2,66,26],
@@ -35,31 +36,29 @@ var NAVTREEINDEX8 =
 "classPxRigidBodyExt.html#adfc2b24aa3d9207ce6af21094151aede":[2,3,43,11],
 "classPxRigidBodyExt.html#afed73713b7c1032a7ce921c7de25a278":[2,3,43,9],
 "classPxRigidDynamic.html":[2,2,68],
-"classPxRigidDynamic.html#a0f7de8c56c76461555219b2b98d5147a":[2,2,68,23],
-"classPxRigidDynamic.html#a16f9f0dfeae6e7877bcebca80df42c92":[2,2,68,19],
-"classPxRigidDynamic.html#a1a0232d5a392e204e6b133fd2ab0879e":[2,2,68,8],
-"classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1":[2,2,68,12],
-"classPxRigidDynamic.html#a2fcdafcbc0dd5a691d502b2f7b03a7d7":[2,2,68,9],
-"classPxRigidDynamic.html#a3b6ed86370cfd04958ca7e1c637e310b":[2,2,68,6],
-"classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527":[2,2,68,4],
-"classPxRigidDynamic.html#a4464d188e7a1e94582c9cf35da9bbc93":[2,2,68,17],
-"classPxRigidDynamic.html#a4f1b9e9f437d8b8f27a6e9c2d9ea22fd":[2,2,68,24],
-"classPxRigidDynamic.html#a59fb22d9402497cffe1b5e54d9ecd62f":[2,2,68,22],
-"classPxRigidDynamic.html#a5f9d98c9bd3b9ab0be7ce1b3ec4a4574":[2,2,68,10],
-"classPxRigidDynamic.html#a7cf3e84117da3ec5b499262a3a9f5521":[2,2,68,18],
-"classPxRigidDynamic.html#a7cfb0ebc9f4ae6ec44391b381fe9e371":[2,2,68,16],
-"classPxRigidDynamic.html#a88dc31fa470c5f37e3e9683534ad83aa":[2,2,68,11],
+"classPxRigidDynamic.html#a0f7de8c56c76461555219b2b98d5147a":[2,2,68,21],
+"classPxRigidDynamic.html#a16f9f0dfeae6e7877bcebca80df42c92":[2,2,68,17],
+"classPxRigidDynamic.html#a1a0232d5a392e204e6b133fd2ab0879e":[2,2,68,7],
+"classPxRigidDynamic.html#a2e1d23ebdce36455824949be867128d1":[2,2,68,11],
+"classPxRigidDynamic.html#a2fcdafcbc0dd5a691d502b2f7b03a7d7":[2,2,68,8],
+"classPxRigidDynamic.html#a3b6ed86370cfd04958ca7e1c637e310b":[2,2,68,5],
+"classPxRigidDynamic.html#a43150714d9e16cd056149bad325f0527":[2,2,68,3],
+"classPxRigidDynamic.html#a4464d188e7a1e94582c9cf35da9bbc93":[2,2,68,15],
+"classPxRigidDynamic.html#a4f1b9e9f437d8b8f27a6e9c2d9ea22fd":[2,2,68,22],
+"classPxRigidDynamic.html#a59fb22d9402497cffe1b5e54d9ecd62f":[2,2,68,20],
+"classPxRigidDynamic.html#a5f9d98c9bd3b9ab0be7ce1b3ec4a4574":[2,2,68,9],
+"classPxRigidDynamic.html#a7cf3e84117da3ec5b499262a3a9f5521":[2,2,68,16],
+"classPxRigidDynamic.html#a88dc31fa470c5f37e3e9683534ad83aa":[2,2,68,10],
 "classPxRigidDynamic.html#a9380b2f317161026b39c6a296ba9e273":[2,2,68,2],
 "classPxRigidDynamic.html#a9e63c1ec6872048e16f7d6b744ec3eef":[2,2,68,1],
-"classPxRigidDynamic.html#aa88e055440d6c1cc846e32dbed017345":[2,2,68,7],
-"classPxRigidDynamic.html#aae00aa2067a2fe268b999aad04f27c28":[2,2,68,14],
-"classPxRigidDynamic.html#aaf537a84cd9b119ed5f77888c600f2ae":[2,2,68,13],
+"classPxRigidDynamic.html#aa88e055440d6c1cc846e32dbed017345":[2,2,68,6],
+"classPxRigidDynamic.html#aae00aa2067a2fe268b999aad04f27c28":[2,2,68,13],
+"classPxRigidDynamic.html#aaf537a84cd9b119ed5f77888c600f2ae":[2,2,68,12],
 "classPxRigidDynamic.html#ab21ed0e4e1be20fc975fcd3ceeb4e064":[2,2,68,0],
-"classPxRigidDynamic.html#abfd510964f2287e81fe76a1a9e1725b5":[2,2,68,15],
-"classPxRigidDynamic.html#ac087213f7360a202eaccfab26e97e644":[2,2,68,20],
-"classPxRigidDynamic.html#acc91baffa652090e523df5e6627686ed":[2,2,68,5],
-"classPxRigidDynamic.html#ae6df52fe5966a6b0f421735147b7bf6c":[2,2,68,3],
-"classPxRigidDynamic.html#afa68d947962bc2d0f8862caaf3e8b304":[2,2,68,21],
+"classPxRigidDynamic.html#abfd510964f2287e81fe76a1a9e1725b5":[2,2,68,14],
+"classPxRigidDynamic.html#ac087213f7360a202eaccfab26e97e644":[2,2,68,18],
+"classPxRigidDynamic.html#acc91baffa652090e523df5e6627686ed":[2,2,68,4],
+"classPxRigidDynamic.html#afa68d947962bc2d0f8862caaf3e8b304":[2,2,68,19],
 "classPxRigidStatic.html":[2,2,69],
 "classPxRigidStatic.html#a33f405d68b5ec1e8e60d0b361151645c":[2,2,69,0],
 "classPxRigidStatic.html#a7881e50c535f44a6d69f40af3baa7ffc":[2,2,69,1],
@@ -249,5 +248,6 @@ var NAVTREEINDEX8 =
 "classPxSerialization.html":[2,3,45],
 "classPxSerialization.html#a0cf59eda36c56e9e66bac0c2b5168094":[2,3,45,1],
 "classPxSerialization.html#a1d6242bea0c7513bfbfb24670e5cfb03":[2,3,45,9],
-"classPxSerialization.html#a33d7707fb8d82862773a6b9ae1d4782e":[2,3,45,5]
+"classPxSerialization.html#a33d7707fb8d82862773a6b9ae1d4782e":[2,3,45,5],
+"classPxSerialization.html#a6c64dc9a55f7ae9ffc8c1802855134ed":[2,3,45,3]
 };
diff --git a/physx/documentation/PhysXAPI/files/navtreeindex9.js b/physx/documentation/PhysXAPI/files/navtreeindex9.js
index 844b0772a..a4ad00cc5 100644
--- a/physx/documentation/PhysXAPI/files/navtreeindex9.js
+++ b/physx/documentation/PhysXAPI/files/navtreeindex9.js
@@ -1,6 +1,5 @@
 var NAVTREEINDEX9 =
 {
-"classPxSerialization.html#a6c64dc9a55f7ae9ffc8c1802855134ed":[2,3,45,3],
 "classPxSerialization.html#a84d55e6e21727087653a70cc20557765":[2,3,45,6],
 "classPxSerialization.html#a86026f68eb651de87e4a7720bc708893":[2,3,45,7],
 "classPxSerialization.html#a9014b476bfd9e3e9bc7ea755df232bfd":[2,3,45,2],
@@ -249,5 +248,6 @@ var NAVTREEINDEX9 =
 "classPxTriangleMesh.html#a08c203991bc3ad2be59ad6ab048197dc":[2,7,29,12],
 "classPxTriangleMesh.html#a0fc4aed5a5f01a0f1b769d3c6534ba8d":[2,7,29,7],
 "classPxTriangleMesh.html#a168cc0000372c63857d128d04f1d32fa":[2,7,29,16],
-"classPxTriangleMesh.html#a1b5db3f6060849da98024a04ceb991e5":[2,7,29,10]
+"classPxTriangleMesh.html#a1b5db3f6060849da98024a04ceb991e5":[2,7,29,10],
+"classPxTriangleMesh.html#a261ef2fa9709a163408b41d2c34dda71":[2,7,29,13]
 };
diff --git a/physx/documentation/PhysXAPI/files/search/all_10.js b/physx/documentation/PhysXAPI/files/search/all_10.js
index 2efff0dec..9f950d38b 100644
--- a/physx/documentation/PhysXAPI/files/search/all_10.js
+++ b/physx/documentation/PhysXAPI/files/search/all_10.js
@@ -5,7 +5,7 @@ var searchData=
   ['intrinsics',['intrinsics',['../namespacephysx_1_1intrinsics.html',1,'physx']]],
   ['p',['p',['../classPxTransform.html#ab1a41ac5f4df8e07ba590fa0a8a6e786',1,'PxTransform']]],
   ['packjointdata',['packJointData',['../classPxArticulationReducedCoordinate.html#a32973e41c51f9dd6ed0168c401d620d7',1,'PxArticulationReducedCoordinate']]],
-  ['pad',['pad',['../classphysx_1_1Gu_1_1ContactBuffer.html#a4f8fabbb2c9690d8fb30f8b6b1459294',1,'physx::Gu::ContactBuffer::pad()'],['../structPxBatchQueryResult.html#ad0b4a79dc77732dc7bbe5c56f50bd4a6',1,'PxBatchQueryResult::pad()'],['../structPxRigidBodyData.html#a1bb93cacad36fcc17492afe83718f8c7',1,'PxRigidBodyData::pad()'],['../structPxSolverContactDesc.html#a1dd628330696744829667e0c883ed7be',1,'PxSolverContactDesc::pad()'],['../classPxVehicleChassisData.html#a7e37b2a81f834c76e9b00bf9b7358421',1,'PxVehicleChassisData::pad()']]],
+  ['pad',['pad',['../classphysx_1_1Gu_1_1ContactBuffer.html#a4f8fabbb2c9690d8fb30f8b6b1459294',1,'physx::Gu::ContactBuffer::pad()'],['../structPxBatchQueryResult.html#ad0b4a79dc77732dc7bbe5c56f50bd4a6',1,'PxBatchQueryResult::pad()'],['../structPxRigidBodyData.html#a1bb93cacad36fcc17492afe83718f8c7',1,'PxRigidBodyData::pad()'],['../structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7',1,'PxSolverBodyData::pad()'],['../structPxSolverContactDesc.html#a1dd628330696744829667e0c883ed7be',1,'PxSolverContactDesc::pad()'],['../classPxVehicleChassisData.html#a7e37b2a81f834c76e9b00bf9b7358421',1,'PxVehicleChassisData::pad()']]],
   ['paddingfromflags',['paddingFromFlags',['../classPxConvexMeshGeometry.html#ad2bed15287e31918b5ca0051f0604eeb',1,'PxConvexMeshGeometry::paddingFromFlags()'],['../classPxHeightFieldGeometry.html#ac6320ee337895516d651d46be99217d6',1,'PxHeightFieldGeometry::paddingFromFlags()'],['../classPxTriangleMeshGeometry.html#a114bc6e19dc5bc001ef9cedd1b772a03',1,'PxTriangleMeshGeometry::paddingFromFlags()']]],
   ['padto16bytes',['padTo16Bytes',['../structPxRaycastHit.html#a251f56ec3d168a814467301f61a120c7',1,'PxRaycastHit::padTo16Bytes()'],['../structPxOverlapHit.html#a27b1ad19e9c07c2d6463707b6f18b684',1,'PxOverlapHit::padTo16Bytes()'],['../structPxSweepHit.html#a4bf1a7791ef7aa0d5030fdd2756a607f',1,'PxSweepHit::padTo16Bytes()']]],
   ['pairfound',['pairFound',['../classPxSimulationFilterCallback.html#aa434e6ecd4e0cfa7bede6c9ad10d319e',1,'PxSimulationFilterCallback']]],
@@ -14,7 +14,7 @@ var searchData=
   ['patch',['patch',['../structPxContactStreamIterator.html#ad789824d77c0eb542c0990f83d9470ce',1,'PxContactStreamIterator']]],
   ['patchcount',['patchCount',['../structPxContactPair.html#aac2821a1e5d8d8cc969b24f2a861a1a0',1,'PxContactPair']]],
   ['patchstreamsize',['patchStreamSize',['../structPxgDynamicsMemoryConfig.html#ac61756279f4ffcc17a6ae2f1712268f7',1,'PxgDynamicsMemoryConfig']]],
-  ['patchuppointers',['patchUpPointers',['../classPxVehicleWheelsSimData.html#a7b605042d1ef0fbd4dd84491e058c1cf',1,'PxVehicleWheelsSimData::patchUpPointers()'],['../classPxVehicleWheelsDynData.html#ae417ce1c175c365484f341dd537dc886',1,'PxVehicleWheelsDynData::patchUpPointers()'],['../classPxVehicleDrive.html#a8548cd5c20014542d09506ee33cba69e',1,'PxVehicleDrive::patchupPointers()'],['../classPxVehicleWheels.html#a07c073158a6df39214f655f59d130b69',1,'PxVehicleWheels::patchupPointers()']]],
+  ['patchuppointers',['patchupPointers',['../classPxVehicleDrive.html#a8548cd5c20014542d09506ee33cba69e',1,'PxVehicleDrive::patchupPointers()'],['../classPxVehicleWheels.html#a07c073158a6df39214f655f59d130b69',1,'PxVehicleWheels::patchupPointers()'],['../classPxVehicleWheelsSimData.html#a7b605042d1ef0fbd4dd84491e058c1cf',1,'PxVehicleWheelsSimData::patchUpPointers()'],['../classPxVehicleWheelsDynData.html#ae417ce1c175c365484f341dd537dc886',1,'PxVehicleWheelsDynData::patchUpPointers()']]],
   ['peakconstraintmemory',['peakConstraintMemory',['../classPxSimulationStatistics.html#ab56ac5aa68f31b68a0852c87817ff026',1,'PxSimulationStatistics']]],
   ['peek',['peek',['../classPxFileBuf.html#a63e0abac3fa4cf71760142dd7088ef36',1,'PxFileBuf']]],
   ['penbiasclamp',['penBiasClamp',['../structPxSolverBodyData.html#ade2d671c9125af9907938a425f6b9af2',1,'PxSolverBodyData']]],
diff --git a/physx/documentation/PhysXAPI/files/search/all_13.js b/physx/documentation/PhysXAPI/files/search/all_13.js
index 931c9b776..3c78ee510 100644
--- a/physx/documentation/PhysXAPI/files/search/all_13.js
+++ b/physx/documentation/PhysXAPI/files/search/all_13.js
@@ -46,6 +46,7 @@ var searchData=
   ['setangulardamping',['setAngularDamping',['../classPxRigidBody.html#adb8f357ce212cdf05250237073278d0c',1,'PxRigidBody']]],
   ['setangularvelocity',['setAngularVelocity',['../classPxRigidBody.html#ad49850630db14af26e019d2550ecfd27',1,'PxRigidBody']]],
   ['setantirollbardata',['setAntiRollBarData',['../classPxVehicleWheelsSimData.html#ae8706a8f39a630ba469414fb814fe206',1,'PxVehicleWheelsSimData']]],
+  ['setarticulationflag',['setArticulationFlag',['../classPxArticulationReducedCoordinate.html#aea7152a77f9446960d65359e482595c4',1,'PxArticulationReducedCoordinate']]],
   ['setarticulationflags',['setArticulationFlags',['../classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564',1,'PxArticulationReducedCoordinate']]],
   ['setautoboxdata',['setAutoBoxData',['../classPxVehicleDriveSimData.html#ac7128888b46d993ff574d7eb3078e36d',1,'PxVehicleDriveSimData']]],
   ['setautoboxswitchtime',['setAutoBoxSwitchTime',['../classPxVehicleDriveDynData.html#a0de28a83d795f037d4042f0a5601087b',1,'PxVehicleDriveDynData']]],
@@ -145,7 +146,6 @@ var searchData=
   ['setinvmassscale0',['setInvMassScale0',['../classPxJoint.html#adf8d09205b60811980a2202f1265562b',1,'PxJoint::setInvMassScale0()'],['../classPxContactSet.html#a74648acc75bce20cfd555e76298b514f',1,'PxContactSet::setInvMassScale0()']]],
   ['setinvmassscale1',['setInvMassScale1',['../classPxJoint.html#a2f5f6c66b5e6933f8578a880a4a957ad',1,'PxJoint::setInvMassScale1()'],['../classPxContactSet.html#ad77be76292fb196ff129d70c372c5d97',1,'PxContactSet::setInvMassScale1()']]],
   ['setjointtype',['setJointType',['../classPxArticulationJointReducedCoordinate.html#ad6983182f5ea8c1e043c0f7e1ec3835b',1,'PxArticulationJointReducedCoordinate']]],
-  ['setkinematicsurfacevelocity',['setKinematicSurfaceVelocity',['../classPxRigidDynamic.html#a7cfb0ebc9f4ae6ec44391b381fe9e371',1,'PxRigidDynamic']]],
   ['setkinematictarget',['setKinematicTarget',['../classPxRigidDynamic.html#a4464d188e7a1e94582c9cf35da9bbc93',1,'PxRigidDynamic']]],
   ['setlatency',['setLatency',['../classPxVehicleAutoBoxData.html#ab58a308ce4750594e3335d953d70281c',1,'PxVehicleAutoBoxData']]],
   ['setlimit',['setLimit',['../classPxPrismaticJoint.html#a7831c090d462d38eb7ba8e2fd25bc751',1,'PxPrismaticJoint::setLimit()'],['../classPxRevoluteJoint.html#ad28e13ae7149b0a09f8b5a9d8beaae60',1,'PxRevoluteJoint::setLimit()'],['../classPxArticulationJointReducedCoordinate.html#aab5482d5d928af3fa8aa6be716079461',1,'PxArticulationJointReducedCoordinate::setLimit()']]],
diff --git a/physx/documentation/PhysXAPI/files/search/all_14.js b/physx/documentation/PhysXAPI/files/search/all_14.js
index 257012196..5f4269259 100644
--- a/physx/documentation/PhysXAPI/files/search/all_14.js
+++ b/physx/documentation/PhysXAPI/files/search/all_14.js
@@ -44,7 +44,7 @@ var searchData=
   ['tt_5fcpu',['TT_CPU',['../structphysx_1_1PxTaskType.html#ae5545e371cf86793e4a0d395e4ebdb6ba27553f47b12c277e0cb685240cb8b9b3',1,'physx::PxTaskType']]],
   ['tt_5fgpu',['TT_GPU',['../structphysx_1_1PxTaskType.html#ae5545e371cf86793e4a0d395e4ebdb6ba4f4e2d64ede267a5b1afcbe74c35c3f7',1,'physx::PxTaskType']]],
   ['tt_5fnot_5fpresent',['TT_NOT_PRESENT',['../structphysx_1_1PxTaskType.html#ae5545e371cf86793e4a0d395e4ebdb6bac5b78680f329287a850b9f852c80acd2',1,'physx::PxTaskType']]],
-  ['type',['Type',['../structPx1DConstraintFlag.html#a0d9f59379f0d8eba3bace01f8bb88b76',1,'Px1DConstraintFlag::Type()'],['../structPxMetaDataEntry.html#af798a0e7beeb8caf4ddbd713fc0994c6',1,'PxMetaDataEntry::type()'],['../structPxContactPairExtraDataItem.html#a18e83c33b02a0a57f0daf818379e06d6',1,'PxContactPairExtraDataItem::type()'],['../structPxConstraintInfo.html#a324fadc19904fff635aa9d16da7a21f9',1,'PxConstraintInfo::type()']]],
+  ['type',['type',['../structPxMetaDataEntry.html#af798a0e7beeb8caf4ddbd713fc0994c6',1,'PxMetaDataEntry::type()'],['../structPxContactPairExtraDataItem.html#a18e83c33b02a0a57f0daf818379e06d6',1,'PxContactPairExtraDataItem::type()'],['../structPxConstraintInfo.html#a324fadc19904fff635aa9d16da7a21f9',1,'PxConstraintInfo::type()'],['../structPx1DConstraintFlag.html#a0d9f59379f0d8eba3bace01f8bb88b76',1,'Px1DConstraintFlag::Type()']]],
   ['typematch',['typeMatch',['../classPxBase.html#a3782ca64c8f12c41443f604e300fc207',1,'PxBase']]],
   ['typename',['typeName',['../structPxRepXObject.html#a0c78e70aff2c5ede506a1279562e494a',1,'PxRepXObject']]]
 ];
diff --git a/physx/documentation/PhysXAPI/files/search/all_17.js b/physx/documentation/PhysXAPI/files/search/all_17.js
index c6cdc4fb5..e47288b67 100644
--- a/physx/documentation/PhysXAPI/files/search/all_17.js
+++ b/physx/documentation/PhysXAPI/files/search/all_17.js
@@ -2,7 +2,7 @@ var searchData=
 [
   ['w',['w',['../classPxQuat.html#a84fb33d995ed4e6a26ac3f22523f0b54',1,'PxQuat::w()'],['../classPxVec4.html#a013d7a6e489c7361cbd602fcab6c9e29',1,'PxVec4::w()']]],
   ['wakecounterresetvalue',['wakeCounterResetValue',['../classPxSceneDesc.html#a79e2c9c06f711272a48d7f07451117b7',1,'PxSceneDesc']]],
-  ['wakeup',['wakeup',['../structPxVehicleConcurrentUpdateData.html#a279c8653c02f6ff3131082f4500d0bad',1,'PxVehicleConcurrentUpdateData::wakeup()'],['../classPxArticulationBase.html#a236592b36cced8478b3e4385c54007af',1,'PxArticulationBase::wakeUp()'],['../classPxRigidDynamic.html#a4f1b9e9f437d8b8f27a6e9c2d9ea22fd',1,'PxRigidDynamic::wakeUp()']]],
+  ['wakeup',['wakeUp',['../classPxArticulationBase.html#a236592b36cced8478b3e4385c54007af',1,'PxArticulationBase::wakeUp()'],['../classPxRigidDynamic.html#a4f1b9e9f437d8b8f27a6e9c2d9ea22fd',1,'PxRigidDynamic::wakeUp()'],['../structPxVehicleConcurrentUpdateData.html#a279c8653c02f6ff3131082f4500d0bad',1,'PxVehicleConcurrentUpdateData::wakeup()']]],
   ['wheelqueryresults',['wheelQueryResults',['../structPxVehicleWheelQueryResult.html#afa382a6fb104e8539622189b5d137366',1,'PxVehicleWheelQueryResult']]],
   ['word0',['word0',['../structPxFilterData.html#a326ddcb63323ef46e0ce68d0c17ece8f',1,'PxFilterData']]],
   ['word1',['word1',['../structPxFilterData.html#a11b78c3942ccb889a7e196dc5a640cdc',1,'PxFilterData']]],
diff --git a/physx/documentation/PhysXAPI/files/search/all_3.js b/physx/documentation/PhysXAPI/files/search/all_3.js
index e15270793..8c3c4b458 100644
--- a/physx/documentation/PhysXAPI/files/search/all_3.js
+++ b/physx/documentation/PhysXAPI/files/search/all_3.js
@@ -13,7 +13,6 @@ var searchData=
   ['clearbit',['clearBit',['../classPxBitAndDataT.html#a8b8421d063a772322282ec752fc07eee',1,'PxBitAndDataT']]],
   ['cleardataptrs',['clearDataPtrs',['../structPxContactPairExtraDataIterator.html#a1d2a22a64b21c547ad7e586307ca3afb',1,'PxContactPairExtraDataIterator']]],
   ['clearforce',['clearForce',['../classPxRigidBody.html#aefe5174aa160a07d488de2a18ba61f94',1,'PxRigidBody']]],
-  ['clearkinematicsurfacevelocity',['clearKinematicSurfaceVelocity',['../classPxRigidDynamic.html#ae6df52fe5966a6b0f421735147b7bf6c',1,'PxRigidDynamic']]],
   ['cleartessflag',['clearTessFlag',['../structPxHeightFieldSample.html#aa28e24b4763400eb5bd84378a23e086f',1,'PxHeightFieldSample']]],
   ['cleartorque',['clearTorque',['../classPxRigidBody.html#ae251132385cc0b082b612d709aa4375d',1,'PxRigidBody']]],
   ['climbingmode',['climbingMode',['../classPxCapsuleControllerDesc.html#a89888e84bf3091c6c3e7818732864674',1,'PxCapsuleControllerDesc']]],
diff --git a/physx/documentation/PhysXAPI/files/search/all_5.js b/physx/documentation/PhysXAPI/files/search/all_5.js
index 169cddd40..2d44f92a6 100644
--- a/physx/documentation/PhysXAPI/files/search/all_5.js
+++ b/physx/documentation/PhysXAPI/files/search/all_5.js
@@ -306,7 +306,7 @@ var searchData=
   ['eno_5fboundary_5fedges',['eNO_BOUNDARY_EDGES',['../structPxHeightFieldFlag.html#ad94e98dce64e8c6f9806a20548548668ab602d69a9ee1ab863ab17f503592ff2e',1,'PxHeightFieldFlag']]],
   ['eno_5ferror',['eNO_ERROR',['../structPxErrorCode.html#a01157d5d2eca2cede61ecbbc32c7b2a3a3f3e15f9946df5f301d1fe4b656df267',1,'PxErrorCode']]],
   ['enodrive',['eNODRIVE',['../structPxVehicleTypes.html#a8e3dab2de8d47cbd9d22c8e27af4bacba55b0f71d4462431b819f71266e0044e6',1,'PxVehicleTypes']]],
-  ['enone',['eNONE',['../structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9cab6869486b64cea747bc34cec4403c261',1,'PxControllerDebugRenderFlag::eNONE()'],['../structPxConverterReportMode.html#aabe051209d39df379f39bbe45bf4f1a3a5ebeeface34f3fdc582dffcfb718ec74',1,'PxConverterReportMode::eNONE()'],['../structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6',1,'PxArticulationJointDriveType::eNONE()'],['../structPxConstraintSolveHint.html#afa69114b7731ed70ff8db90bc171f97eaf766284eef4df1c7f5998017fbe767ef',1,'PxConstraintSolveHint::eNONE()'],['../structPxQueryHitType.html#acab1ef408184995d169330ed689b79e3a7e26b51ca324ac45d44d4b70203a2996',1,'PxQueryHitType::eNONE()'],['../structPxPruningStructureType.html#a058454782f6ed1cc953b8d6561b636d9a667e19208a1fd5b2215e452ac9754b6e',1,'PxPruningStructureType::eNONE()']]],
+  ['enone',['eNONE',['../structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9cab6869486b64cea747bc34cec4403c261',1,'PxControllerDebugRenderFlag::eNONE()'],['../structPxConverterReportMode.html#aabe051209d39df379f39bbe45bf4f1a3a5ebeeface34f3fdc582dffcfb718ec74',1,'PxConverterReportMode::eNONE()'],['../structPxConstraintSolveHint.html#afa69114b7731ed70ff8db90bc171f97eaf766284eef4df1c7f5998017fbe767ef',1,'PxConstraintSolveHint::eNONE()'],['../structPxQueryHitType.html#acab1ef408184995d169330ed689b79e3a7e26b51ca324ac45d44d4b70203a2996',1,'PxQueryHitType::eNONE()'],['../structPxPruningStructureType.html#a058454782f6ed1cc953b8d6561b636d9a667e19208a1fd5b2215e452ac9754b6e',1,'PxPruningStructureType::eNONE()']]],
   ['enormal',['eNORMAL',['../structPxConverterReportMode.html#aabe051209d39df379f39bbe45bf4f1a3a74cb273dc39f72e0ae096cc059917946',1,'PxConverterReportMode::eNORMAL()'],['../structPxHitFlag.html#a44c173f6ddf0522ffbd8fa3c585e6b64a5972f664c2fbe85c992fb9aaba46a5cf',1,'PxHitFlag::eNORMAL()']]],
   ['enotify',['eNOTIFY',['../structPxFilterFlag.html#a8de424e04d86b5772436423b0dc58083afbb0680c3ea5447f44fc90560bd7dd8f',1,'PxFilterFlag']]],
   ['enotify_5fcontact_5fpoints',['eNOTIFY_CONTACT_POINTS',['../structPxPairFlag.html#a60e71a2948b030140f840766a3f7ac2fadb30cb0c2d2859560b34c684ac076137',1,'PxPairFlag']]],
@@ -408,7 +408,6 @@ var searchData=
   ['eswing1',['eSWING1',['../structPxD6Axis.html#a0a15eb4ffcf285b3bd8289ddca128880ae9f4bbfb50eaf463e7b1dd406ed1957b',1,'PxD6Axis::eSWING1()'],['../structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a2e1a274c036f8b7846476dbfe2015fec',1,'PxArticulationAxis::eSWING1()']]],
   ['eswing2',['eSWING2',['../structPxD6Axis.html#a0a15eb4ffcf285b3bd8289ddca128880ac6058852beed87da5e9e10f153358f60',1,'PxD6Axis::eSWING2()'],['../structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a33f154c8d454ab3f08f03f512cc03972',1,'PxArticulationAxis::eSWING2()']]],
   ['etarget',['eTARGET',['../structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e',1,'PxArticulationJointDriveType']]],
-  ['etargetvelocity',['eTARGETVELOCITY',['../structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e',1,'PxArticulationJointDriveType']]],
   ['etemporal_5fbv',['eTEMPORAL_BV',['../structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9caebb20da3030ba4d86cc768d774fa21ae',1,'PxControllerDebugRenderFlag']]],
   ['etenth',['eTENTH',['../classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a476cfd6e181dbe556b5f84b65f3a94b6',1,'PxVehicleGearsData']]],
   ['etgs',['eTGS',['../structPxSolverType.html#a96adc2f8785c6b18805a18b03289dbe6ac62cf152c5d6a5e1a12eebc81b0b6e6d',1,'PxSolverType']]],
diff --git a/physx/documentation/PhysXAPI/files/search/all_8.js b/physx/documentation/PhysXAPI/files/search/all_8.js
index 6072d780c..bd849e08b 100644
--- a/physx/documentation/PhysXAPI/files/search/all_8.js
+++ b/physx/documentation/PhysXAPI/files/search/all_8.js
@@ -13,7 +13,6 @@ var searchData=
   ['hasnegativedeterminant',['hasNegativeDeterminant',['../classPxMeshScale.html#a4eb75b946d25a2f78719cf1a752b9625',1,'PxMeshScale']]],
   ['hasnextcontact',['hasNextContact',['../structPxContactStreamIterator.html#afdf9c03916d76f61dec80c93c488f87a',1,'PxContactStreamIterator']]],
   ['hasnextpatch',['hasNextPatch',['../structPxContactStreamIterator.html#ac4823a352cbda39e7e434df43f5838f6',1,'PxContactStreamIterator']]],
-  ['hassurfacevelocity',['hasSurfaceVelocity',['../structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f',1,'PxSolverBodyData']]],
   ['heapcapacity',['heapCapacity',['../structPxgDynamicsMemoryConfig.html#a548423b552a06d77a3c0066d768a4dd8',1,'PxgDynamicsMemoryConfig']]],
   ['height',['height',['../classPxCapsuleControllerDesc.html#a00dcfa7bd1c529bf745fdf8849746a97',1,'PxCapsuleControllerDesc::height()'],['../structPxHeightFieldSample.html#acfdb9fa8a4f56b5c3354cd552768a026',1,'PxHeightFieldSample::height()']]],
   ['heightfield',['heightfield',['../classPxGeometryHolder.html#a13086e315b9865701c5d4c03031723ca',1,'PxGeometryHolder::heightfield()'],['../classPxHeightFieldGeometry.html#ade90caef2f95d56481d625ffba6ad918',1,'PxHeightFieldGeometry::heightField()'],['../classPxGeometryHolder.html#aa93f73423d7bd5d0a528df4ba8c51ada',1,'PxGeometryHolder::heightField()'],['../classPxGeometryHolder.html#a94cd64bea37591f56a230b22a8272042',1,'PxGeometryHolder::heightField() const'],['../PxGeometryHelpers_8h.html#af65253b111613855925e410eec62c62f',1,'heightField():&#160;PxGeometryHelpers.h'],['../PxGeometryHelpers_8h.html#a6e3ac546318539c770d1b4894563d0ae',1,'heightfield():&#160;PxGeometryHelpers.h']]],
diff --git a/physx/documentation/PhysXAPI/files/search/enumvalues_2.js b/physx/documentation/PhysXAPI/files/search/enumvalues_2.js
index bf73b08c3..0c7abe309 100644
--- a/physx/documentation/PhysXAPI/files/search/enumvalues_2.js
+++ b/physx/documentation/PhysXAPI/files/search/enumvalues_2.js
@@ -302,7 +302,7 @@ var searchData=
   ['eno_5fboundary_5fedges',['eNO_BOUNDARY_EDGES',['../structPxHeightFieldFlag.html#ad94e98dce64e8c6f9806a20548548668ab602d69a9ee1ab863ab17f503592ff2e',1,'PxHeightFieldFlag']]],
   ['eno_5ferror',['eNO_ERROR',['../structPxErrorCode.html#a01157d5d2eca2cede61ecbbc32c7b2a3a3f3e15f9946df5f301d1fe4b656df267',1,'PxErrorCode']]],
   ['enodrive',['eNODRIVE',['../structPxVehicleTypes.html#a8e3dab2de8d47cbd9d22c8e27af4bacba55b0f71d4462431b819f71266e0044e6',1,'PxVehicleTypes']]],
-  ['enone',['eNONE',['../structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9cab6869486b64cea747bc34cec4403c261',1,'PxControllerDebugRenderFlag::eNONE()'],['../structPxConverterReportMode.html#aabe051209d39df379f39bbe45bf4f1a3a5ebeeface34f3fdc582dffcfb718ec74',1,'PxConverterReportMode::eNONE()'],['../structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6',1,'PxArticulationJointDriveType::eNONE()'],['../structPxConstraintSolveHint.html#afa69114b7731ed70ff8db90bc171f97eaf766284eef4df1c7f5998017fbe767ef',1,'PxConstraintSolveHint::eNONE()'],['../structPxQueryHitType.html#acab1ef408184995d169330ed689b79e3a7e26b51ca324ac45d44d4b70203a2996',1,'PxQueryHitType::eNONE()'],['../structPxPruningStructureType.html#a058454782f6ed1cc953b8d6561b636d9a667e19208a1fd5b2215e452ac9754b6e',1,'PxPruningStructureType::eNONE()']]],
+  ['enone',['eNONE',['../structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9cab6869486b64cea747bc34cec4403c261',1,'PxControllerDebugRenderFlag::eNONE()'],['../structPxConverterReportMode.html#aabe051209d39df379f39bbe45bf4f1a3a5ebeeface34f3fdc582dffcfb718ec74',1,'PxConverterReportMode::eNONE()'],['../structPxConstraintSolveHint.html#afa69114b7731ed70ff8db90bc171f97eaf766284eef4df1c7f5998017fbe767ef',1,'PxConstraintSolveHint::eNONE()'],['../structPxQueryHitType.html#acab1ef408184995d169330ed689b79e3a7e26b51ca324ac45d44d4b70203a2996',1,'PxQueryHitType::eNONE()'],['../structPxPruningStructureType.html#a058454782f6ed1cc953b8d6561b636d9a667e19208a1fd5b2215e452ac9754b6e',1,'PxPruningStructureType::eNONE()']]],
   ['enormal',['eNORMAL',['../structPxConverterReportMode.html#aabe051209d39df379f39bbe45bf4f1a3a74cb273dc39f72e0ae096cc059917946',1,'PxConverterReportMode::eNORMAL()'],['../structPxHitFlag.html#a44c173f6ddf0522ffbd8fa3c585e6b64a5972f664c2fbe85c992fb9aaba46a5cf',1,'PxHitFlag::eNORMAL()']]],
   ['enotify',['eNOTIFY',['../structPxFilterFlag.html#a8de424e04d86b5772436423b0dc58083afbb0680c3ea5447f44fc90560bd7dd8f',1,'PxFilterFlag']]],
   ['enotify_5fcontact_5fpoints',['eNOTIFY_CONTACT_POINTS',['../structPxPairFlag.html#a60e71a2948b030140f840766a3f7ac2fadb30cb0c2d2859560b34c684ac076137',1,'PxPairFlag']]],
@@ -403,7 +403,6 @@ var searchData=
   ['eswing1',['eSWING1',['../structPxD6Axis.html#a0a15eb4ffcf285b3bd8289ddca128880ae9f4bbfb50eaf463e7b1dd406ed1957b',1,'PxD6Axis::eSWING1()'],['../structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a2e1a274c036f8b7846476dbfe2015fec',1,'PxArticulationAxis::eSWING1()']]],
   ['eswing2',['eSWING2',['../structPxD6Axis.html#a0a15eb4ffcf285b3bd8289ddca128880ac6058852beed87da5e9e10f153358f60',1,'PxD6Axis::eSWING2()'],['../structPxArticulationAxis.html#a5c835f1d727bfc738d1fc7a7595d7395a33f154c8d454ab3f08f03f512cc03972',1,'PxArticulationAxis::eSWING2()']]],
   ['etarget',['eTARGET',['../structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e',1,'PxArticulationJointDriveType']]],
-  ['etargetvelocity',['eTARGETVELOCITY',['../structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e',1,'PxArticulationJointDriveType']]],
   ['etemporal_5fbv',['eTEMPORAL_BV',['../structPxControllerDebugRenderFlag.html#a2ca2d1c4bfca5877efb137362de83b9caebb20da3030ba4d86cc768d774fa21ae',1,'PxControllerDebugRenderFlag']]],
   ['etenth',['eTENTH',['../classPxVehicleGearsData.html#a31359110050aad08510b99208a304cb7a476cfd6e181dbe556b5f84b65f3a94b6',1,'PxVehicleGearsData']]],
   ['etgs',['eTGS',['../structPxSolverType.html#a96adc2f8785c6b18805a18b03289dbe6ac62cf152c5d6a5e1a12eebc81b0b6e6d',1,'PxSolverType']]],
diff --git a/physx/documentation/PhysXAPI/files/search/functions_2.js b/physx/documentation/PhysXAPI/files/search/functions_2.js
index 46cd2f34e..c33a989a9 100644
--- a/physx/documentation/PhysXAPI/files/search/functions_2.js
+++ b/physx/documentation/PhysXAPI/files/search/functions_2.js
@@ -8,7 +8,6 @@ var searchData=
   ['clearbit',['clearBit',['../classPxBitAndDataT.html#a8b8421d063a772322282ec752fc07eee',1,'PxBitAndDataT']]],
   ['cleardataptrs',['clearDataPtrs',['../structPxContactPairExtraDataIterator.html#a1d2a22a64b21c547ad7e586307ca3afb',1,'PxContactPairExtraDataIterator']]],
   ['clearforce',['clearForce',['../classPxRigidBody.html#aefe5174aa160a07d488de2a18ba61f94',1,'PxRigidBody']]],
-  ['clearkinematicsurfacevelocity',['clearKinematicSurfaceVelocity',['../classPxRigidDynamic.html#ae6df52fe5966a6b0f421735147b7bf6c',1,'PxRigidDynamic']]],
   ['cleartessflag',['clearTessFlag',['../structPxHeightFieldSample.html#aa28e24b4763400eb5bd84378a23e086f',1,'PxHeightFieldSample']]],
   ['cleartorque',['clearTorque',['../classPxRigidBody.html#ae251132385cc0b082b612d709aa4375d',1,'PxRigidBody']]],
   ['close',['close',['../classPxFileBuf.html#a19a8aa14b29743084e9d8f8e983151ce',1,'PxFileBuf']]],
diff --git a/physx/documentation/PhysXAPI/files/search/functions_d.js b/physx/documentation/PhysXAPI/files/search/functions_d.js
index 67d9748ae..ff1c6e0d6 100644
--- a/physx/documentation/PhysXAPI/files/search/functions_d.js
+++ b/physx/documentation/PhysXAPI/files/search/functions_d.js
@@ -3,7 +3,7 @@ var searchData=
   ['packjointdata',['packJointData',['../classPxArticulationReducedCoordinate.html#a32973e41c51f9dd6ed0168c401d620d7',1,'PxArticulationReducedCoordinate']]],
   ['pairfound',['pairFound',['../classPxSimulationFilterCallback.html#aa434e6ecd4e0cfa7bede6c9ad10d319e',1,'PxSimulationFilterCallback']]],
   ['pairlost',['pairLost',['../classPxSimulationFilterCallback.html#ada6fcf31c0678d3852f926f545884b3e',1,'PxSimulationFilterCallback']]],
-  ['patchuppointers',['patchUpPointers',['../classPxVehicleWheelsSimData.html#a7b605042d1ef0fbd4dd84491e058c1cf',1,'PxVehicleWheelsSimData::patchUpPointers()'],['../classPxVehicleWheelsDynData.html#ae417ce1c175c365484f341dd537dc886',1,'PxVehicleWheelsDynData::patchUpPointers()'],['../classPxVehicleDrive.html#a8548cd5c20014542d09506ee33cba69e',1,'PxVehicleDrive::patchupPointers()'],['../classPxVehicleWheels.html#a07c073158a6df39214f655f59d130b69',1,'PxVehicleWheels::patchupPointers()']]],
+  ['patchuppointers',['patchupPointers',['../classPxVehicleDrive.html#a8548cd5c20014542d09506ee33cba69e',1,'PxVehicleDrive::patchupPointers()'],['../classPxVehicleWheels.html#a07c073158a6df39214f655f59d130b69',1,'PxVehicleWheels::patchupPointers()'],['../classPxVehicleWheelsSimData.html#a7b605042d1ef0fbd4dd84491e058c1cf',1,'PxVehicleWheelsSimData::patchUpPointers()'],['../classPxVehicleWheelsDynData.html#ae417ce1c175c365484f341dd537dc886',1,'PxVehicleWheelsDynData::patchUpPointers()']]],
   ['peek',['peek',['../classPxFileBuf.html#a63e0abac3fa4cf71760142dd7088ef36',1,'PxFileBuf']]],
   ['plane',['plane',['../classPxGeometryHolder.html#ada7be9a86ef1c21903f8a6999449f8c9',1,'PxGeometryHolder::plane()'],['../classPxGeometryHolder.html#a07df9b1058f2d233c8b5ca3de3914040',1,'PxGeometryHolder::plane() const'],['../PxGeometryHelpers_8h.html#a978f4c1b4f2780d4cfb3d4a31ba30f64',1,'plane():&#160;PxGeometryHelpers.h']]],
   ['platformalignedalloc',['platformAlignedAlloc',['../group__extensions.html#gad874c3dcee0d1f3ec91e1880a1f09582',1,'PxDefaultAllocator.h']]],
diff --git a/physx/documentation/PhysXAPI/files/search/functions_f.js b/physx/documentation/PhysXAPI/files/search/functions_f.js
index 848892db2..6571e6ba5 100644
--- a/physx/documentation/PhysXAPI/files/search/functions_f.js
+++ b/physx/documentation/PhysXAPI/files/search/functions_f.js
@@ -30,6 +30,7 @@ var searchData=
   ['setangulardamping',['setAngularDamping',['../classPxRigidBody.html#adb8f357ce212cdf05250237073278d0c',1,'PxRigidBody']]],
   ['setangularvelocity',['setAngularVelocity',['../classPxRigidBody.html#ad49850630db14af26e019d2550ecfd27',1,'PxRigidBody']]],
   ['setantirollbardata',['setAntiRollBarData',['../classPxVehicleWheelsSimData.html#ae8706a8f39a630ba469414fb814fe206',1,'PxVehicleWheelsSimData']]],
+  ['setarticulationflag',['setArticulationFlag',['../classPxArticulationReducedCoordinate.html#aea7152a77f9446960d65359e482595c4',1,'PxArticulationReducedCoordinate']]],
   ['setarticulationflags',['setArticulationFlags',['../classPxArticulationReducedCoordinate.html#ab155a21325dd588b7e354d3a5b1ef564',1,'PxArticulationReducedCoordinate']]],
   ['setautoboxdata',['setAutoBoxData',['../classPxVehicleDriveSimData.html#ac7128888b46d993ff574d7eb3078e36d',1,'PxVehicleDriveSimData']]],
   ['setautoboxswitchtime',['setAutoBoxSwitchTime',['../classPxVehicleDriveDynData.html#a0de28a83d795f037d4042f0a5601087b',1,'PxVehicleDriveDynData']]],
@@ -129,7 +130,6 @@ var searchData=
   ['setinvmassscale0',['setInvMassScale0',['../classPxJoint.html#adf8d09205b60811980a2202f1265562b',1,'PxJoint::setInvMassScale0()'],['../classPxContactSet.html#a74648acc75bce20cfd555e76298b514f',1,'PxContactSet::setInvMassScale0()']]],
   ['setinvmassscale1',['setInvMassScale1',['../classPxJoint.html#a2f5f6c66b5e6933f8578a880a4a957ad',1,'PxJoint::setInvMassScale1()'],['../classPxContactSet.html#ad77be76292fb196ff129d70c372c5d97',1,'PxContactSet::setInvMassScale1()']]],
   ['setjointtype',['setJointType',['../classPxArticulationJointReducedCoordinate.html#ad6983182f5ea8c1e043c0f7e1ec3835b',1,'PxArticulationJointReducedCoordinate']]],
-  ['setkinematicsurfacevelocity',['setKinematicSurfaceVelocity',['../classPxRigidDynamic.html#a7cfb0ebc9f4ae6ec44391b381fe9e371',1,'PxRigidDynamic']]],
   ['setkinematictarget',['setKinematicTarget',['../classPxRigidDynamic.html#a4464d188e7a1e94582c9cf35da9bbc93',1,'PxRigidDynamic']]],
   ['setlatency',['setLatency',['../classPxVehicleAutoBoxData.html#ab58a308ce4750594e3335d953d70281c',1,'PxVehicleAutoBoxData']]],
   ['setlimit',['setLimit',['../classPxPrismaticJoint.html#a7831c090d462d38eb7ba8e2fd25bc751',1,'PxPrismaticJoint::setLimit()'],['../classPxRevoluteJoint.html#ad28e13ae7149b0a09f8b5a9d8beaae60',1,'PxRevoluteJoint::setLimit()'],['../classPxArticulationJointReducedCoordinate.html#aab5482d5d928af3fa8aa6be716079461',1,'PxArticulationJointReducedCoordinate::setLimit()']]],
diff --git a/physx/documentation/PhysXAPI/files/search/variables_10.js b/physx/documentation/PhysXAPI/files/search/variables_10.js
index 45ab36523..7cb244dcf 100644
--- a/physx/documentation/PhysXAPI/files/search/variables_10.js
+++ b/physx/documentation/PhysXAPI/files/search/variables_10.js
@@ -1,7 +1,7 @@
 var searchData=
 [
   ['p',['p',['../classPxTransform.html#ab1a41ac5f4df8e07ba590fa0a8a6e786',1,'PxTransform']]],
-  ['pad',['pad',['../classphysx_1_1Gu_1_1ContactBuffer.html#a4f8fabbb2c9690d8fb30f8b6b1459294',1,'physx::Gu::ContactBuffer::pad()'],['../structPxBatchQueryResult.html#ad0b4a79dc77732dc7bbe5c56f50bd4a6',1,'PxBatchQueryResult::pad()'],['../structPxRigidBodyData.html#a1bb93cacad36fcc17492afe83718f8c7',1,'PxRigidBodyData::pad()'],['../structPxSolverContactDesc.html#a1dd628330696744829667e0c883ed7be',1,'PxSolverContactDesc::pad()'],['../classPxVehicleChassisData.html#a7e37b2a81f834c76e9b00bf9b7358421',1,'PxVehicleChassisData::pad()']]],
+  ['pad',['pad',['../classphysx_1_1Gu_1_1ContactBuffer.html#a4f8fabbb2c9690d8fb30f8b6b1459294',1,'physx::Gu::ContactBuffer::pad()'],['../structPxBatchQueryResult.html#ad0b4a79dc77732dc7bbe5c56f50bd4a6',1,'PxBatchQueryResult::pad()'],['../structPxRigidBodyData.html#a1bb93cacad36fcc17492afe83718f8c7',1,'PxRigidBodyData::pad()'],['../structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7',1,'PxSolverBodyData::pad()'],['../structPxSolverContactDesc.html#a1dd628330696744829667e0c883ed7be',1,'PxSolverContactDesc::pad()'],['../classPxVehicleChassisData.html#a7e37b2a81f834c76e9b00bf9b7358421',1,'PxVehicleChassisData::pad()']]],
   ['paddingfromflags',['paddingFromFlags',['../classPxConvexMeshGeometry.html#ad2bed15287e31918b5ca0051f0604eeb',1,'PxConvexMeshGeometry::paddingFromFlags()'],['../classPxHeightFieldGeometry.html#ac6320ee337895516d651d46be99217d6',1,'PxHeightFieldGeometry::paddingFromFlags()'],['../classPxTriangleMeshGeometry.html#a114bc6e19dc5bc001ef9cedd1b772a03',1,'PxTriangleMeshGeometry::paddingFromFlags()']]],
   ['padto16bytes',['padTo16Bytes',['../structPxRaycastHit.html#a251f56ec3d168a814467301f61a120c7',1,'PxRaycastHit::padTo16Bytes()'],['../structPxOverlapHit.html#a27b1ad19e9c07c2d6463707b6f18b684',1,'PxOverlapHit::padTo16Bytes()'],['../structPxSweepHit.html#a4bf1a7791ef7aa0d5030fdd2756a607f',1,'PxSweepHit::padTo16Bytes()']]],
   ['pairs',['pairs',['../structPxContactPairHeader.html#a747c8a693f1a7100b0da1e8b431fbb36',1,'PxContactPairHeader']]],
diff --git a/physx/documentation/PhysXAPI/files/search/variables_8.js b/physx/documentation/PhysXAPI/files/search/variables_8.js
index 89a08d918..463146a7d 100644
--- a/physx/documentation/PhysXAPI/files/search/variables_8.js
+++ b/physx/documentation/PhysXAPI/files/search/variables_8.js
@@ -8,7 +8,6 @@ var searchData=
   ['hasfaceindices',['hasFaceIndices',['../structPxContactStreamIterator.html#a0579a7c96e99c4ea9e25ea5c472f87bc',1,'PxContactStreamIterator']]],
   ['hasforcethresholds',['hasForceThresholds',['../structPxSolverContactDesc.html#ae2e4ab03c648da3de0754b5bb20470af',1,'PxSolverContactDesc']]],
   ['hasmaximpulse',['hasMaxImpulse',['../structPxSolverContactDesc.html#ac19bc948958e2f2de3ca214afad28b11',1,'PxSolverContactDesc']]],
-  ['hassurfacevelocity',['hasSurfaceVelocity',['../structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f',1,'PxSolverBodyData']]],
   ['heapcapacity',['heapCapacity',['../structPxgDynamicsMemoryConfig.html#a548423b552a06d77a3c0066d768a4dd8',1,'PxgDynamicsMemoryConfig']]],
   ['height',['height',['../classPxCapsuleControllerDesc.html#a00dcfa7bd1c529bf745fdf8849746a97',1,'PxCapsuleControllerDesc::height()'],['../structPxHeightFieldSample.html#acfdb9fa8a4f56b5c3354cd552768a026',1,'PxHeightFieldSample::height()']]],
   ['heightfield',['heightfield',['../classPxGeometryHolder.html#a13086e315b9865701c5d4c03031723ca',1,'PxGeometryHolder::heightfield()'],['../classPxHeightFieldGeometry.html#ade90caef2f95d56481d625ffba6ad918',1,'PxHeightFieldGeometry::heightField()'],['../PxGeometryHelpers_8h.html#a6e3ac546318539c770d1b4894563d0ae',1,'heightfield():&#160;PxGeometryHelpers.h']]],
diff --git a/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType-members.html b/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType-members.html
index e2224be20..8118d2874 100644
--- a/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType-members.html
+++ b/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType-members.html
@@ -90,10 +90,8 @@
 <p>This is the complete list of members for <a class="el" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a>, including all inherited members.</p>
 <table class="directory">
   <tr class="even"><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372">eERROR</a> enum value</td><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></td><td class="entry"></td></tr>
-  <tr><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6">eNONE</a> enum value</td><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">Enum</a> enum name</td><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></td><td class="entry"></td></tr>
-  <tr><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e">eTARGET</a> enum value</td><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e">eTARGETVELOCITY</a> enum value</td><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></td><td class="entry"></td></tr>
+  <tr><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">Enum</a> enum name</td><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e">eTARGET</a> enum value</td><td class="entry"><a class="el" href="structPxArticulationJointDriveType.html">PxArticulationJointDriveType</a></td><td class="entry"></td></tr>
 </table></div><!-- contents -->
 </div><!-- doc-content -->
 <!-- HTML footer for doxygen 1.8.14-->
diff --git a/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType.html b/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType.html
index 42256b534..015f9d031 100644
--- a/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType.html
+++ b/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType.html
@@ -97,10 +97,8 @@
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="pub-types"></a>
 Public Types</h2></td></tr>
-<tr class="memitem:ac6be2c7afe87dd7755dae4aa7a0fdfe2"><td class="memItemLeft" align="right" valign="top">enum &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">Enum</a> { <a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6">eNONE</a> = 0, 
-<a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e">eTARGET</a> = 1, 
-<a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372">eERROR</a> = 2, 
-<a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e">eTARGETVELOCITY</a> = 3
+<tr class="memitem:ac6be2c7afe87dd7755dae4aa7a0fdfe2"><td class="memItemLeft" align="right" valign="top">enum &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2">Enum</a> { <a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e">eTARGET</a> = 0, 
+<a class="el" href="structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372">eERROR</a> = 1
  }</td></tr>
 <tr class="separator:ac6be2c7afe87dd7755dae4aa7a0fdfe2"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
@@ -122,10 +120,8 @@ <h2 class="memtitle"><span class="permalink"><a href="#ac6be2c7afe87dd7755dae4aa
       </table>
 </div><div class="memdoc">
 <table class="fieldtable">
-<tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><a id="ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6"></a>eNONE&#160;</td><td class="fielddoc"></td></tr>
-<tr><td class="fieldname"><a id="ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e"></a>eTARGET&#160;</td><td class="fielddoc"></td></tr>
+<tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><a id="ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e"></a>eTARGET&#160;</td><td class="fielddoc"></td></tr>
 <tr><td class="fieldname"><a id="ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372"></a>eERROR&#160;</td><td class="fielddoc"></td></tr>
-<tr><td class="fieldname"><a id="ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e"></a>eTARGETVELOCITY&#160;</td><td class="fielddoc"></td></tr>
 </table>
 
 </div>
diff --git a/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType.js b/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType.js
index c3a7eeb37..aed3b5365 100644
--- a/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType.js
+++ b/physx/documentation/PhysXAPI/files/structPxArticulationJointDriveType.js
@@ -1,9 +1,7 @@
 var structPxArticulationJointDriveType =
 [
     [ "Enum", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2", [
-      [ "eNONE", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a900c888f769cd66405c89fe53b61b9d6", null ],
       [ "eTARGET", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a6efbffee5445ed12550a3cb4f1e1cd9e", null ],
-      [ "eERROR", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372", null ],
-      [ "eTARGETVELOCITY", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a32036e88ffe4203f24bcd5a06a58514e", null ]
+      [ "eERROR", "structPxArticulationJointDriveType.html#ac6be2c7afe87dd7755dae4aa7a0fdfe2a178e00c7f3fa1771a6d607235ebb3372", null ]
     ] ]
 ];
\ No newline at end of file
diff --git a/physx/documentation/PhysXAPI/files/structPxSolverBodyData-members.html b/physx/documentation/PhysXAPI/files/structPxSolverBodyData-members.html
index f5579643d..88c9d610b 100644
--- a/physx/documentation/PhysXAPI/files/structPxSolverBodyData-members.html
+++ b/physx/documentation/PhysXAPI/files/structPxSolverBodyData-members.html
@@ -91,11 +91,11 @@
 <table class="directory">
   <tr class="even"><td class="entry"><a class="el" href="structPxSolverBodyData.html#a2ab7bc060f7946ae4f3bb1da17018326">angularVelocity</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
   <tr><td class="entry"><a class="el" href="structPxSolverBodyData.html#a1d32e78a191473936a6693b40185ad91">body2World</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f">hasSurfaceVelocity</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
-  <tr><td class="entry"><a class="el" href="structPxSolverBodyData.html#a668762cdcd8004c19e033952be2206b6">invMass</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415">lockFlags</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
-  <tr><td class="entry"><a class="el" href="structPxSolverBodyData.html#ac974d6181623739bdc087b02c1dd311b">maxContactImpulse</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structPxSolverBodyData.html#ac01024b5307ea57e79c7891d89906296">nodeIndex</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structPxSolverBodyData.html#a668762cdcd8004c19e033952be2206b6">invMass</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
+  <tr><td class="entry"><a class="el" href="structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415">lockFlags</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structPxSolverBodyData.html#ac974d6181623739bdc087b02c1dd311b">maxContactImpulse</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
+  <tr><td class="entry"><a class="el" href="structPxSolverBodyData.html#ac01024b5307ea57e79c7891d89906296">nodeIndex</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7">pad</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
   <tr><td class="entry"><a class="el" href="structPxSolverBodyData.html#ade2d671c9125af9907938a425f6b9af2">penBiasClamp</a></td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structPxSolverBodyData.html#aa443999488f683690658da520f1726c9">projectVelocity</a>(const PxVec3 &amp;lin, const PxVec3 &amp;ang) const</td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr><td class="entry"><a class="el" href="structPxSolverBodyData.html#a5bee162de56be8899aa26c179e753919">PX_ALIGN</a>(16, PxVec3 linearVelocity)</td><td class="entry"><a class="el" href="structPxSolverBodyData.html">PxSolverBodyData</a></td><td class="entry"></td></tr>
diff --git a/physx/documentation/PhysXAPI/files/structPxSolverBodyData.html b/physx/documentation/PhysXAPI/files/structPxSolverBodyData.html
index 5dc988858..0b7581acb 100644
--- a/physx/documentation/PhysXAPI/files/structPxSolverBodyData.html
+++ b/physx/documentation/PhysXAPI/files/structPxSolverBodyData.html
@@ -137,9 +137,9 @@
 <tr class="memitem:a1a9709cbcf35df7551b23ff63b40a415"><td class="memItemLeft" align="right" valign="top">PxU16&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415">lockFlags</a></td></tr>
 <tr class="memdesc:a1a9709cbcf35df7551b23ff63b40a415"><td class="mdescLeft">&#160;</td><td class="mdescRight">110 lock flags  <a href="#a1a9709cbcf35df7551b23ff63b40a415">More...</a><br /></td></tr>
 <tr class="separator:a1a9709cbcf35df7551b23ff63b40a415"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a92113241e1bc8ad1ebffafcd0873410f"><td class="memItemLeft" align="right" valign="top">PxU16&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f">hasSurfaceVelocity</a></td></tr>
-<tr class="memdesc:a92113241e1bc8ad1ebffafcd0873410f"><td class="mdescLeft">&#160;</td><td class="mdescRight">112 surfaceVelocity  <a href="#a92113241e1bc8ad1ebffafcd0873410f">More...</a><br /></td></tr>
-<tr class="separator:a92113241e1bc8ad1ebffafcd0873410f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa406db65fc9a3a26764ffb91dbbad1b7"><td class="memItemLeft" align="right" valign="top">PxU16&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7">pad</a></td></tr>
+<tr class="memdesc:aa406db65fc9a3a26764ffb91dbbad1b7"><td class="mdescLeft">&#160;</td><td class="mdescRight">112 pad  <a href="#aa406db65fc9a3a26764ffb91dbbad1b7">More...</a><br /></td></tr>
+<tr class="separator:aa406db65fc9a3a26764ffb91dbbad1b7"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="aa443999488f683690658da520f1726c9"></a>
@@ -241,22 +241,6 @@ <h2 class="memtitle"><span class="permalink"><a href="#a1d32e78a191473936a6693b4
 
 <p>108 the body's transform </p>
 
-</div>
-</div>
-<a id="a92113241e1bc8ad1ebffafcd0873410f"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a92113241e1bc8ad1ebffafcd0873410f">&#9670;&nbsp;</a></span>hasSurfaceVelocity</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">PxU16 PxSolverBodyData::hasSurfaceVelocity</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-<p>112 surfaceVelocity </p>
-
 </div>
 </div>
 <a id="a668762cdcd8004c19e033952be2206b6"></a>
@@ -321,6 +305,22 @@ <h2 class="memtitle"><span class="permalink"><a href="#ac01024b5307ea57e79c7891d
 
 <p>76 the node idx of this solverBodyData. Used by solver to reference between solver bodies and island bodies. Not required by immediate mode </p>
 
+</div>
+</div>
+<a id="aa406db65fc9a3a26764ffb91dbbad1b7"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa406db65fc9a3a26764ffb91dbbad1b7">&#9670;&nbsp;</a></span>pad</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">PxU16 PxSolverBodyData::pad</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>112 pad </p>
+
 </div>
 </div>
 <a id="ade2d671c9125af9907938a425f6b9af2"></a>
diff --git a/physx/documentation/PhysXAPI/files/structPxSolverBodyData.js b/physx/documentation/PhysXAPI/files/structPxSolverBodyData.js
index c2055ddb1..93cf2cfd1 100644
--- a/physx/documentation/PhysXAPI/files/structPxSolverBodyData.js
+++ b/physx/documentation/PhysXAPI/files/structPxSolverBodyData.js
@@ -4,11 +4,11 @@ var structPxSolverBodyData =
     [ "PX_ALIGN", "structPxSolverBodyData.html#a5bee162de56be8899aa26c179e753919", null ],
     [ "angularVelocity", "structPxSolverBodyData.html#a2ab7bc060f7946ae4f3bb1da17018326", null ],
     [ "body2World", "structPxSolverBodyData.html#a1d32e78a191473936a6693b40185ad91", null ],
-    [ "hasSurfaceVelocity", "structPxSolverBodyData.html#a92113241e1bc8ad1ebffafcd0873410f", null ],
     [ "invMass", "structPxSolverBodyData.html#a668762cdcd8004c19e033952be2206b6", null ],
     [ "lockFlags", "structPxSolverBodyData.html#a1a9709cbcf35df7551b23ff63b40a415", null ],
     [ "maxContactImpulse", "structPxSolverBodyData.html#ac974d6181623739bdc087b02c1dd311b", null ],
     [ "nodeIndex", "structPxSolverBodyData.html#ac01024b5307ea57e79c7891d89906296", null ],
+    [ "pad", "structPxSolverBodyData.html#aa406db65fc9a3a26764ffb91dbbad1b7", null ],
     [ "penBiasClamp", "structPxSolverBodyData.html#ade2d671c9125af9907938a425f6b9af2", null ],
     [ "reportThreshold", "structPxSolverBodyData.html#a74e634af20acbf0a37246a5ec44f2248", null ],
     [ "sqrtInvInertia", "structPxSolverBodyData.html#aeaab06a3dfac94c9ccde5e40f966f8af", null ]
diff --git a/physx/documentation/PhysXGuide/Index.html b/physx/documentation/PhysXGuide/Index.html
index c17d71409..8997dbb5d 100644
--- a/physx/documentation/PhysXGuide/Index.html
+++ b/physx/documentation/PhysXGuide/Index.html
@@ -150,7 +150,7 @@
   <div class="section" id="nvidia-physx-sdk-release-documentation">
 <h1>NVIDIA PhysX SDK 4.0 Documentation<a class="headerlink" href="#nvidia-physx-sdk-release-documentation" title="Permalink to this headline">¶</a></h1>
 <img alt="_images/PhysXLogoBlack.png" src="_images/PhysXLogoBlack.png" />
-<p>Built December 18, 2018</p>
+<p>Built December 19, 2018</p>
 <p>Contents:</p>
 <div class="toctree-wrapper compound">
 <ul>
diff --git a/physx/documentation/PhysXGuide/Manual/Articulations.html b/physx/documentation/PhysXGuide/Manual/Articulations.html
index 9a0ecaa14..59689af75 100644
--- a/physx/documentation/PhysXGuide/Manual/Articulations.html
+++ b/physx/documentation/PhysXGuide/Manual/Articulations.html
@@ -362,6 +362,10 @@ <h2>Articulation Joints<a class="headerlink" href="#id2" title="Permalink to thi
 </div>
 <p>Articulation joints are not breakable, and it is not possible to retrieve the constraint force applied at the joint.</p>
 <p>It should be noted that spherical joints are special-case handled by the reduced coordinate articulations. When there is just 1 degree of freedom, behavior is equivalent to a revolute joint type, but performance will be worse so it is recommended to use revolute joints instead of spherical joints wherever possible. When there are 2 or more degrees of freedom unlocked, rotational motion is integrated by rotating by decomposed quaternions rather than by euler angles to avoid gimbal lock. However, this technique can lead to rotational axis drift, which is corrected by additional constraints in the simulation, which could lead to slight movement on the remaining locked rotational axis.</p>
+<p>The root link in a reduced coordinate articulation can be made to operate like a static body. This is advantageous over constraining the root link using a joint because the immoveable property of the root link is solved perfectly. The following code makes the root link of an articulation be fixed:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">articulation</span><span class="o">-&gt;</span><span class="n">setArticulationFlag</span><span class="p">(</span><span class="n">PxArticulationFlag</span><span class="o">::</span><span class="n">eFIX_BASE</span><span class="p">,</span> <span class="nb">true</span><span class="p">);</span>
+</pre></div>
+</div>
 </div>
 <div class="section" id="joint-positions-and-velocities">
 <h2>Joint positions and velocities<a class="headerlink" href="#joint-positions-and-velocities" title="Permalink to this headline">¶</a></h2>
@@ -369,7 +373,7 @@ <h2>Joint positions and velocities<a class="headerlink" href="#joint-positions-a
 <p>Joint position values are used to integrate the links' poses. The root link's world space pose provides a reference frame from which the poses for all other links in the articulation are calculated. This ensures that there will never be any joint separation or rotations around locked axes.</p>
 <p>The caveat with this approach is that it is not possible to simply update the pose of links in an articulation because this new transform could violate locked axes and would differ from the joint position. Instead, it is necessary to update the joint positions, which triggers new link poses to be computed. This ensures that the internal data from which poses are computed is internally consistent at all times.</p>
 <p>Link velocities and joint velocities share a similar relationship. A link's world-space velocities are derived from the world-space velocity of their parent link and the current joint velocity. This means that the root link stores a world-space velocity and all other links' velocities are computed by propagating this velocity and the respective joint velocities through the articulation. As such, it is not possible to directly set the velocity of links in an articulation. Instead, it is necessary to modify the joint velocities from which the links' velocities are calculated. It is legal to apply world-space impulses on links. These will be propagated through the articulation as part of forward dynamics.</p>
-<p>In addition to applying impulses and setting joint positions and velocities, it is possible to interact with the articulation links through the application of joint forces/torques, which behave like a actuator.</p>
+<p>In addition to applying impulses and setting joint positions and velocities, it is possible to interact with the articulation links through the application of joint forces/torques, which behave like an actuator.</p>
 </div>
 <div class="section" id="pxarticulationcache">
 <h2>PxArticulationCache<a class="headerlink" href="#pxarticulationcache" title="Permalink to this headline">¶</a></h2>
@@ -431,6 +435,82 @@ <h2>PxArticulationCache<a class="headerlink" href="#pxarticulationcache" title="
 <div class="section" id="inverse-dynamics">
 <h2>Inverse Dynamics<a class="headerlink" href="#inverse-dynamics" title="Permalink to this headline">¶</a></h2>
 <p>In addition to forward dynamics, reduced coordinate articulations offer inverse dynamics functionality. This is a suite of utility functions to compute the joint forces required to counteract gravity, coriolis/centrifugal force, external forces and contacts/constraints. In addition, there are utility functions to compute kinematic jacobians, the mass matrix, coefficient matrix and lambda values.</p>
+<p>The following description assumes knowledge of inverse dynamics concepts.</p>
+<p>Prior to performing any inverse dynamics calculations, it is necessary to ensure that constant joint data has been computed. In order to do this, call:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">articulation</span><span class="o">-&gt;</span><span class="n">commonInit</span><span class="p">();</span>
+</pre></div>
+</div>
+<p>The inverse dynamics functions operate on an articulation and a PxArticulationCache. The majority of properties in PxArticulationCache are stored in a reduced/generalized coordinate space, where one entry corresponds to a degree of freedom. To simplify working in this space, PhysX provides the following methods to pack and unpack data to convert between reduced/generalized and maximal coordinates:</p>
+<p>To unpack data from reduced/generalized coordinates to maximal coordinates:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">PxReal</span> <span class="n">maximalJointForces</span><span class="p">[</span><span class="n">NbLinks</span><span class="o">*</span><span class="mi">6</span><span class="p">];</span> <span class="c1">//There are 6 dofs (x,y,z,twist,swing1,swing2) per link</span>
+
+<span class="n">articulation</span><span class="o">-&gt;</span><span class="n">unpackJointData</span><span class="p">(</span><span class="n">cache</span><span class="o">-&gt;</span><span class="n">jointForces</span><span class="p">,</span> <span class="n">maximalJointForces</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>To pack data from maximal coordinates to reduced/generalized coordinates:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">articulation</span><span class="o">-&gt;</span><span class="n">packJointData</span><span class="p">(</span><span class="n">maximalJointForces</span><span class="p">,</span> <span class="n">cache</span><span class="o">-&gt;</span><span class="n">jointForces</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>The following method computes the joint force required to counteract gravity and deposits the set of joint forces in the force buffer in a PxArticulationCache:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">articulation</span><span class="o">-&gt;</span><span class="n">computeGeneralizedGravityForce</span><span class="p">(</span><span class="o">*</span><span class="n">cache</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>To compute the joint force required to counteract coriolis and centrifugal force, we must first provide the joint velocities of the articulation because coriolis/centrifugal forces are dependent on those values. In this example, we extract the velocities from the articulation before computing coriolis/centrifugal force:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">articulation</span><span class="o">-&gt;</span><span class="n">copyInternalStateToCache</span><span class="p">(</span><span class="o">*</span><span class="n">cache</span><span class="p">,</span> <span class="n">PxArticulationCache</span><span class="o">::</span><span class="n">eVELOCTY</span><span class="p">);</span>
+<span class="n">articulation</span><span class="o">-&gt;</span><span class="n">computeCoriolisAndCentrifugalForce</span><span class="p">(</span><span class="o">*</span><span class="n">cache</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>To compute the joint acceleration for the articulation, call:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">articulation</span><span class="o">-&gt;</span><span class="n">computeJointAcceleration</span><span class="p">(</span><span class="o">*</span><span class="n">cache</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>Joint acceleration is dependent on gravity and current link velocity. The resulting acceleration can be used to compute a set of joint forces capable of maintaining the current joint velocities.</p>
+<p>To convert a joint acceleration into a joint velocity. Input is cache.jointAcceleration, output is cache.jointForce:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">articulation</span><span class="o">-&gt;</span><span class="n">computeJointForce</span><span class="p">(</span><span class="o">*</span><span class="n">cache</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>The generalized mass matrix can be computed using the following method. Output is placed in cache.massMatrix. This matrix represents the joint space inertia of the articulation and can be used to convert joint accelerations into joint forces/torques:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">articulation</span><span class="o">-&gt;</span><span class="n">computeGeneralizedMassMatrix</span><span class="p">(</span><span class="o">*</span><span class="n">cache</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>Articulations are tree structures and do not support closed loops natively. PhysX can simulate closed loop systems by the use of joints. Additionally, contacts between links and other rigid bodies (e.g. the ground) can form loops if more than one link has contacts. These joints and contacts are solved by the rigid body solver during PhysX simulation, but it is often desirable to factor these constraints into inverse dynamics.</p>
+<p>To facilitate this, PhysX provides a mechanism to register loop constraints with the articulation:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">PxJoint</span><span class="o">*</span> <span class="n">joint</span> <span class="o">=</span> <span class="p">...;</span>
+
+<span class="n">articulation</span><span class="o">-&gt;</span><span class="n">addLoopJoint</span><span class="p">(</span><span class="n">joint</span><span class="p">);</span>
+
+<span class="n">articulation</span><span class="o">-&gt;</span><span class="n">removeLoopJoint</span><span class="p">(</span><span class="n">joint</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>In order to represent contacts, use PxContactJoint : this is a new addition specifically for inverse dynamics that represents a contact as an extension to the joint system. Currently, the following features are restricted to the use of PxContactJoint. Support for additional joint types will be added in the near future.</p>
+<p>Inverse dynamics provides functionality to calculate the coefficient matrix. This matrix is an NxM matrix, where N is the number of degrees of freedom in the articulation and M is the number constraint rows. This matrix represents the joint force caused by a unit impulse applied to each constraint:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">PxU32</span> <span class="n">coefficientMatrixSize</span> <span class="o">=</span> <span class="n">articulation</span><span class="o">-&gt;</span><span class="n">getCoefficentMatrixSize</span><span class="p">();</span>
+<span class="n">PxReal</span><span class="o">*</span> <span class="n">coefficientMatrix</span> <span class="o">=</span> <span class="k">new</span> <span class="n">PxReal</span><span class="p">[</span><span class="n">coefficientMatrixSize</span><span class="p">];</span>
+<span class="n">cache</span><span class="o">-&gt;</span><span class="n">coefficientMatrix</span> <span class="o">=</span> <span class="n">coefficientMatrix</span><span class="p">;</span>
+<span class="n">articulation</span><span class="o">-&gt;</span><span class="n">computeCoefficientMatrix</span><span class="p">(</span><span class="o">*</span><span class="n">cache</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>The coefficient matrix can be used to convert a set of lambda values (impulses applied by the respective constraints) into a set of joint forces to counteract these impulses. However, the lambda values are only known after a frame's simulation occurs, so it may be necessary to know these lambda values before they are applied in the solver. However, even if these lambda values are known ahead-of-time, applying a counteracting force may not yield the desired results because the act of applying additional force on the joints may influence the lambda values, resulting in a different set of joint forces.</p>
+<p>In order to overcome this feedback-loop, inverse dynamics provides a function to compute the lambda values for the loop constraints, assuming that the joint forces required to counteract these lambdas are also applied. This technique is an iterative process that converges on a stable set of joint forces.</p>
+<p>First, we must allocate sufficient space for the lambda values</p>
+<blockquote>
+<div>PxReal* lambda = new PxReal[articulation-&gt;getNbLoopJoints()];
+cache-&gt;lambda = lambda;</div></blockquote>
+<p>In order to compute the lambda values, we need to have 2 caches: one to store the initial state, and one to calculate the final state:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">PxArticulationCache</span><span class="o">*</span> <span class="n">initialState</span> <span class="o">=</span> <span class="n">articulation</span><span class="o">-&gt;</span><span class="n">createCache</span><span class="p">()</span>
+<span class="n">articulation</span><span class="o">-&gt;</span><span class="n">copyInternalStateToCache</span><span class="p">(</span><span class="o">*</span><span class="n">initialState</span><span class="p">,</span> <span class="n">PxArticulationCache</span><span class="o">::</span><span class="n">eALL</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>Now we can compute the joint forces values. In addition, we must compute the internal and external joint forces caused by gravity and coriolis/centrifugal force to ensure that we converge on a result that will match forward dynamics. The joint force values for internal/external accelerations can be calculated using the methods outlined earlier. The method is iterative and terminates either after reaching convergence or when a maximum number of iterations is run (in this case 200). Resulting joint force is output in cache.jointForce:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="k">const</span> <span class="n">PxU32</span> <span class="n">maxIterations</span> <span class="o">=</span> <span class="mi">200</span><span class="p">;</span>
+
+<span class="n">articulation</span><span class="o">-&gt;</span><span class="n">computeLambda</span><span class="p">(</span><span class="o">*</span><span class="n">cache</span><span class="p">,</span> <span class="o">*</span><span class="n">initialState</span><span class="p">,</span> <span class="n">jointForces</span><span class="p">,</span> <span class="n">maxIterations</span><span class="p">);</span>
+</pre></div>
+</div>
+<p>The kinematic jacobian for an end effector can be computed using the following method. This provides the jacobian matrix for each joint from the end effector to the root link in world space:</p>
+<div class="highlight-c++"><div class="highlight"><pre><span class="n">articulation</span><span class="o">-&gt;</span><span class="n">computeKinematicJacobian</span><span class="p">(</span><span class="n">linkID</span><span class="p">,</span> <span class="o">*</span><span class="n">cache</span><span class="p">);</span>
+</pre></div>
+</div>
 </div>
 </div>
 
diff --git a/physx/documentation/PhysXGuide/Manual/GeometryQueries.html b/physx/documentation/PhysXGuide/Manual/GeometryQueries.html
index 9622cb024..01515bbe5 100644
--- a/physx/documentation/PhysXGuide/Manual/GeometryQueries.html
+++ b/physx/documentation/PhysXGuide/Manual/GeometryQueries.html
@@ -251,7 +251,6 @@ <h2>Raycasts<a class="headerlink" href="#raycasts" title="Permalink to this head
 <li><em>hitFlags</em> specifies the values that should be returned by the query, and options for processing the query.</li>
 <li><em>maxHits</em> is the maximum number of hits to return.</li>
 <li><em>hitInfo</em> specifies the <em>PxRaycastHit</em> structure(s) into which the raycast results will be stored.</li>
-<li>The <em>anyHit</em> parameter is <strong>deprecated</strong>. It is equivalent to <em>PxHitFlag::eMESH_ANY</em>, which should be used instead.</li>
 </ul>
 <p>The returned result is the number of intersections found. For each intersection, a <em>PxRaycastHit</em> is populated. The fields of this structure are as follows:</p>
 <div class="highlight-c++"><div class="highlight"><pre><span class="n">PxRigidActor</span><span class="o">*</span>   <span class="n">actor</span><span class="p">;</span>
diff --git a/physx/documentation/PhysXGuide/Manual/Index.html b/physx/documentation/PhysXGuide/Manual/Index.html
index 8feed8011..2d11127bd 100644
--- a/physx/documentation/PhysXGuide/Manual/Index.html
+++ b/physx/documentation/PhysXGuide/Manual/Index.html
@@ -187,7 +187,7 @@
   <div class="section" id="user-s-guide">
 <h1>User's Guide<a class="headerlink" href="#user-s-guide" title="Permalink to this headline">¶</a></h1>
 <img alt="../_images/PhysXLogoBlack1.png" src="../_images/PhysXLogoBlack1.png" />
-<p>Built December 18, 2018</p>
+<p>Built December 19, 2018</p>
 <p>Contents:</p>
 <div class="toctree-wrapper compound">
 <ul>
diff --git a/physx/documentation/PhysXGuide/Manual/MigrationTo40.html b/physx/documentation/PhysXGuide/Manual/MigrationTo40.html
index 8b0378581..f9b4300d3 100644
--- a/physx/documentation/PhysXGuide/Manual/MigrationTo40.html
+++ b/physx/documentation/PhysXGuide/Manual/MigrationTo40.html
@@ -268,6 +268,9 @@ <h2>Core PhysX<a class="headerlink" href="#core-physx" title="Permalink to this
 <li>PxConvexMeshCookingType::eINFLATION_INCREMENTAL_HULL and PxCookingParams::skinWidth have been removed. Please use the regular PxShape::setRestOffset.</li>
 <li>PxBVH34MidphaseDesc::numTrisPerLeaf has been renamed to PxBVH34MidphaseDesc::numPrimsPerLeaf</li>
 <li>PxFoundationDelayLoadHook has been merged into PxDelayLoadHook. The per build configuration callbacks for PxDelayLoadHook and PxGpuLoadHook have been removed. The application provided implementations of PxDelayLoadHook::getPhysXFoundationDllName, PxDelayLoadHook::getPhysXCommonDllName and PxGpuLoadHook::getPhysXGpuDllName now need to provide the correct DLL paths/names based on the application build configurations.</li>
+<li>A common base class for PxArticulation and PxArticulationReducedCoordinate was introduced: PxArticulationBase. This class encapsulates common functionality between both articulation implementations. API interfaces previously taking/returning a PxArticulation will now take/return PxArticulationBase. The user must statically up-cast to the correct type if non-common functionality is required.</li>
+<li>A common base class for PxArticulationJoint and PxArticulationJointReducedCoordinate was introduced: PxArticulationJointBase. This class encapsulates common functionality between both reduced and legacy articulations. API interfaces previously taking/return a PxArticulationJoint will now take/return PxArticulationJointBase. The user must statically up-cast to the correct type if implementation-specific functionality is required.</li>
+<li>Getting/setting linear damping, angular damping and max angular velocity has been migrated from PxRigidDynamic to PxRigidBody to permit usage of these methods with the new reduced coordinate articulations.</li>
 </ul>
 </div>
 <div class="section" id="physx-extensions">
diff --git a/physx/documentation/PhysXGuide/Manual/RigidBodyCollision.html b/physx/documentation/PhysXGuide/Manual/RigidBodyCollision.html
index 695dca838..b567c3370 100644
--- a/physx/documentation/PhysXGuide/Manual/RigidBodyCollision.html
+++ b/physx/documentation/PhysXGuide/Manual/RigidBodyCollision.html
@@ -242,10 +242,6 @@ <h2>Introduction<a class="headerlink" href="#introduction" title="Permalink to t
 <div class="highlight-c++"><div class="highlight"><pre><span class="n">myActor</span><span class="p">.</span><span class="n">detachShape</span><span class="p">(</span><span class="o">*</span><span class="n">shape</span><span class="p">);</span>
 </pre></div>
 </div>
-<div class="admonition note">
-<p class="first admonition-title">Note</p>
-<p class="last">in previous versions of PhysX, release() was used to detach a shape from its actor and destroy it. This use of release() is deprecated in PhysX 3.3 and will not be supported in futures version of PhysX.</p>
-</div>
 </div>
 <div class="section" id="simulation-shapes-and-scene-query-shapes">
 <h2>Simulation Shapes and Scene Query Shapes<a class="headerlink" href="#simulation-shapes-and-scene-query-shapes" title="Permalink to this headline">¶</a></h2>
diff --git a/physx/documentation/PhysXGuide/_sources/Manual/Articulations.txt b/physx/documentation/PhysXGuide/_sources/Manual/Articulations.txt
index f8ba2edbe..6f8e0797c 100644
--- a/physx/documentation/PhysXGuide/_sources/Manual/Articulations.txt
+++ b/physx/documentation/PhysXGuide/_sources/Manual/Articulations.txt
@@ -192,6 +192,10 @@ Articulation joints are not breakable, and it is not possible to retrieve the co
 
 It should be noted that spherical joints are special-case handled by the reduced coordinate articulations. When there is just 1 degree of freedom, behavior is equivalent to a revolute joint type, but performance will be worse so it is recommended to use revolute joints instead of spherical joints wherever possible. When there are 2 or more degrees of freedom unlocked, rotational motion is integrated by rotating by decomposed quaternions rather than by euler angles to avoid gimbal lock. However, this technique can lead to rotational axis drift, which is corrected by additional constraints in the simulation, which could lead to slight movement on the remaining locked rotational axis.
 
+The root link in a reduced coordinate articulation can be made to operate like a static body. This is advantageous over constraining the root link using a joint because the immoveable property of the root link is solved perfectly. The following code makes the root link of an articulation be fixed::
+
+	articulation->setArticulationFlag(PxArticulationFlag::eFIX_BASE, true);
+
 Joint positions and velocities
 ==============================
 
@@ -203,7 +207,7 @@ The caveat with this approach is that it is not possible to simply update the po
 
 Link velocities and joint velocities share a similar relationship. A link's world-space velocities are derived from the world-space velocity of their parent link and the current joint velocity. This means that the root link stores a world-space velocity and all other links' velocities are computed by propagating this velocity and the respective joint velocities through the articulation. As such, it is not possible to directly set the velocity of links in an articulation. Instead, it is necessary to modify the joint velocities from which the links' velocities are calculated. It is legal to apply world-space impulses on links. These will be propagated through the articulation as part of forward dynamics.
 
-In addition to applying impulses and setting joint positions and velocities, it is possible to interact with the articulation links through the application of joint forces/torques, which behave like a actuator.
+In addition to applying impulses and setting joint positions and velocities, it is possible to interact with the articulation links through the application of joint forces/torques, which behave like an actuator.
 
 
 PxArticulationCache
@@ -276,3 +280,86 @@ Inverse Dynamics
 
 In addition to forward dynamics, reduced coordinate articulations offer inverse dynamics functionality. This is a suite of utility functions to compute the joint forces required to counteract gravity, coriolis/centrifugal force, external forces and contacts/constraints. In addition, there are utility functions to compute kinematic jacobians, the mass matrix, coefficient matrix and lambda values.
 
+The following description assumes knowledge of inverse dynamics concepts.
+
+Prior to performing any inverse dynamics calculations, it is necessary to ensure that constant joint data has been computed. In order to do this, call::
+
+	articulation->commonInit();
+
+The inverse dynamics functions operate on an articulation and a PxArticulationCache. The majority of properties in PxArticulationCache are stored in a reduced/generalized coordinate space, where one entry corresponds to a degree of freedom. To simplify working in this space, PhysX provides the following methods to pack and unpack data to convert between reduced/generalized and maximal coordinates::
+
+To unpack data from reduced/generalized coordinates to maximal coordinates::
+
+	PxReal maximalJointForces[NbLinks*6]; //There are 6 dofs (x,y,z,twist,swing1,swing2) per link
+
+	articulation->unpackJointData(cache->jointForces, maximalJointForces);
+
+To pack data from maximal coordinates to reduced/generalized coordinates::
+
+	articulation->packJointData(maximalJointForces, cache->jointForces);
+
+The following method computes the joint force required to counteract gravity and deposits the set of joint forces in the force buffer in a PxArticulationCache::
+	
+	articulation->computeGeneralizedGravityForce(*cache);
+
+To compute the joint force required to counteract coriolis and centrifugal force, we must first provide the joint velocities of the articulation because coriolis/centrifugal forces are dependent on those values. In this example, we extract the velocities from the articulation before computing coriolis/centrifugal force::
+
+	articulation->copyInternalStateToCache(*cache, PxArticulationCache::eVELOCTY);
+	articulation->computeCoriolisAndCentrifugalForce(*cache);
+
+To compute the joint acceleration for the articulation, call::
+
+	articulation->computeJointAcceleration(*cache);
+
+Joint acceleration is dependent on gravity and current link velocity. The resulting acceleration can be used to compute a set of joint forces capable of maintaining the current joint velocities.
+
+To convert a joint acceleration into a joint velocity. Input is cache.jointAcceleration, output is cache.jointForce::
+
+	articulation->computeJointForce(*cache);
+
+The generalized mass matrix can be computed using the following method. Output is placed in cache.massMatrix. This matrix represents the joint space inertia of the articulation and can be used to convert joint accelerations into joint forces/torques::
+	
+	articulation->computeGeneralizedMassMatrix(*cache);
+
+Articulations are tree structures and do not support closed loops natively. PhysX can simulate closed loop systems by the use of joints. Additionally, contacts between links and other rigid bodies (e.g. the ground) can form loops if more than one link has contacts. These joints and contacts are solved by the rigid body solver during PhysX simulation, but it is often desirable to factor these constraints into inverse dynamics.
+
+To facilitate this, PhysX provides a mechanism to register loop constraints with the articulation::
+
+	PxJoint* joint = ...;
+
+	articulation->addLoopJoint(joint);
+
+	articulation->removeLoopJoint(joint);
+
+In order to represent contacts, use PxContactJoint : this is a new addition specifically for inverse dynamics that represents a contact as an extension to the joint system. Currently, the following features are restricted to the use of PxContactJoint. Support for additional joint types will be added in the near future.
+
+Inverse dynamics provides functionality to calculate the coefficient matrix. This matrix is an NxM matrix, where N is the number of degrees of freedom in the articulation and M is the number constraint rows. This matrix represents the joint force caused by a unit impulse applied to each constraint::
+
+	PxU32 coefficientMatrixSize = articulation->getCoefficentMatrixSize();
+	PxReal* coefficientMatrix = new PxReal[coefficientMatrixSize];
+	cache->coefficientMatrix = coefficientMatrix;
+	articulation->computeCoefficientMatrix(*cache);
+
+The coefficient matrix can be used to convert a set of lambda values (impulses applied by the respective constraints) into a set of joint forces to counteract these impulses. However, the lambda values are only known after a frame's simulation occurs, so it may be necessary to know these lambda values before they are applied in the solver. However, even if these lambda values are known ahead-of-time, applying a counteracting force may not yield the desired results because the act of applying additional force on the joints may influence the lambda values, resulting in a different set of joint forces.
+
+In order to overcome this feedback-loop, inverse dynamics provides a function to compute the lambda values for the loop constraints, assuming that the joint forces required to counteract these lambdas are also applied. This technique is an iterative process that converges on a stable set of joint forces.
+
+First, we must allocate sufficient space for the lambda values 
+
+	PxReal* lambda = new PxReal[articulation->getNbLoopJoints()];
+	cache->lambda = lambda;
+
+In order to compute the lambda values, we need to have 2 caches: one to store the initial state, and one to calculate the final state::
+
+	PxArticulationCache* initialState = articulation->createCache()
+	articulation->copyInternalStateToCache(*initialState, PxArticulationCache::eALL);
+
+Now we can compute the joint forces values. In addition, we must compute the internal and external joint forces caused by gravity and coriolis/centrifugal force to ensure that we converge on a result that will match forward dynamics. The joint force values for internal/external accelerations can be calculated using the methods outlined earlier. The method is iterative and terminates either after reaching convergence or when a maximum number of iterations is run (in this case 200). Resulting joint force is output in cache.jointForce::
+
+	const PxU32 maxIterations = 200;
+		
+	articulation->computeLambda(*cache, *initialState, jointForces, maxIterations);
+
+The kinematic jacobian for an end effector can be computed using the following method. This provides the jacobian matrix for each joint from the end effector to the root link in world space::
+
+	articulation->computeKinematicJacobian(linkID, *cache);
diff --git a/physx/documentation/PhysXGuide/_sources/Manual/GeometryQueries.txt b/physx/documentation/PhysXGuide/_sources/Manual/GeometryQueries.txt
index e09f9d5c9..3f5e5d966 100644
--- a/physx/documentation/PhysXGuide/_sources/Manual/GeometryQueries.txt
+++ b/physx/documentation/PhysXGuide/_sources/Manual/GeometryQueries.txt
@@ -50,7 +50,6 @@ The arguments are interpreted as follows:
 * *hitFlags* specifies the values that should be returned by the query, and options for processing the query.
 * *maxHits* is the maximum number of hits to return.
 * *hitInfo* specifies the *PxRaycastHit* structure(s) into which the raycast results will be stored.
-* The *anyHit* parameter is **deprecated**. It is equivalent to *PxHitFlag::eMESH_ANY*, which should be used instead.
 
 The returned result is the number of intersections found. For each intersection, a *PxRaycastHit* is populated. The fields of this structure are as follows::
 
diff --git a/physx/documentation/PhysXGuide/_sources/Manual/MigrationTo40.txt b/physx/documentation/PhysXGuide/_sources/Manual/MigrationTo40.txt
index 092b52ab8..64a29639a 100644
--- a/physx/documentation/PhysXGuide/_sources/Manual/MigrationTo40.txt
+++ b/physx/documentation/PhysXGuide/_sources/Manual/MigrationTo40.txt
@@ -90,6 +90,9 @@ Core PhysX
 * PxConvexMeshCookingType::eINFLATION_INCREMENTAL_HULL and PxCookingParams::skinWidth have been removed. Please use the regular PxShape::setRestOffset.
 * PxBVH34MidphaseDesc::numTrisPerLeaf has been renamed to PxBVH34MidphaseDesc::numPrimsPerLeaf
 * PxFoundationDelayLoadHook has been merged into PxDelayLoadHook. The per build configuration callbacks for PxDelayLoadHook and PxGpuLoadHook have been removed. The application provided implementations of PxDelayLoadHook::getPhysXFoundationDllName, PxDelayLoadHook::getPhysXCommonDllName and PxGpuLoadHook::getPhysXGpuDllName now need to provide the correct DLL paths/names based on the application build configurations.
+* A common base class for PxArticulation and PxArticulationReducedCoordinate was introduced: PxArticulationBase. This class encapsulates common functionality between both articulation implementations. API interfaces previously taking/returning a PxArticulation will now take/return PxArticulationBase. The user must statically up-cast to the correct type if non-common functionality is required.
+* A common base class for PxArticulationJoint and PxArticulationJointReducedCoordinate was introduced: PxArticulationJointBase. This class encapsulates common functionality between both reduced and legacy articulations. API interfaces previously taking/return a PxArticulationJoint will now take/return PxArticulationJointBase. The user must statically up-cast to the correct type if implementation-specific functionality is required.
+* Getting/setting linear damping, angular damping and max angular velocity has been migrated from PxRigidDynamic to PxRigidBody to permit usage of these methods with the new reduced coordinate articulations.
 
 =====================================================
 PhysX Extensions
diff --git a/physx/documentation/PhysXGuide/_sources/Manual/RigidBodyCollision.txt b/physx/documentation/PhysXGuide/_sources/Manual/RigidBodyCollision.txt
index 919c03da0..103ffca9b 100644
--- a/physx/documentation/PhysXGuide/_sources/Manual/RigidBodyCollision.txt
+++ b/physx/documentation/PhysXGuide/_sources/Manual/RigidBodyCollision.txt
@@ -50,9 +50,6 @@ Detach the shape from the actor as follows::
 
     myActor.detachShape(*shape);
 
-.. note:: in previous versions of PhysX, release() was used to detach a shape from its actor and destroy it. This use of release() is deprecated in PhysX 3.3 and will not be supported in futures version of PhysX.
-
-
 ========================================
 Simulation Shapes and Scene Query Shapes
 ========================================
diff --git a/physx/documentation/PhysXGuide/objects.inv b/physx/documentation/PhysXGuide/objects.inv
index 775776c16..c01d1949b 100644
Binary files a/physx/documentation/PhysXGuide/objects.inv and b/physx/documentation/PhysXGuide/objects.inv differ
diff --git a/physx/documentation/PhysXGuide/searchindex.js b/physx/documentation/PhysXGuide/searchindex.js
index c04b2b036..240386aee 100644
--- a/physx/documentation/PhysXGuide/searchindex.js
+++ b/physx/documentation/PhysXGuide/searchindex.js
@@ -1 +1 @@
-Search.setIndex({titleterms:{other:[19,25],than:18,pxvehicleantirollbar:30,process:11,pruner:12,snippet:[19,5,30],split:25,registr:26,origin:31,object:[17,18,19,21,8,11,12,28],connect:[32,17],sim:25,synchron:25,sphere:[1,26],region:29,hard:0,jitteri:30,serializ:19,anisotrop:23,skin:23,brief:8,step:[18,23],differ:17,narrow:12,feel:30,realist:12,convert:19,cast:[23,17],platform:19,actor:[18,23,16],index:23,debug:[12,2,23],specul:7,nxutillib:23,header:[23,3],bar:30,swap:30,surfac:[16,30],refer:[19,17,30,6],extern:32,count:[19,17,6],how:11,offset:18,wall:18,basic:[0,5,32,11,23],modif:[26,7],consid:12,"function":[0,1,11],too:30,best:[19,0,21,12],corner:30,"class":[14,23,27,17],api:[14,17,19,6,23,24,27],data:[32,0,21,7,25,26,19,12],inform:7,sluggish:30,collect:19,pxvehicletireloadfilterdata:30,deform:[23,26],properti:16,delai:4,unbound:11,first:[19,30],tessel:18,callback:[23,18,12,29,25,11],remov:[23,21],higher:18,extract:7,"16k":25,rigid:[29,28,12,16,9],task:[25,21],check:12,triangl:[29,1,26],cook:[6,4,19,23,26,12],sampl:12,option:4,immedi:16,precis:1,heighfield:29,prealloc:12,math:[14,17],spheric:0,pxvehicledifferential4wdata:30,behavior:[18,12,7,0],particl:[24,6],edeprecated_trigger_trigger_report:6,refus:30,against:1,anti:30,gpu:[9,12],point:26,bottleneck:12,rate:12,consider:[9,21],detect:[23,7],secur:4,distanc:0,physxfound:4,ani:11,overview:[8,28,27],troubleshoot:[18,19,30],common:19,estat:11,distribut:24,interact:[18,29],vector:18,box:[1,26],runtim:[18,12],mass:[0,16],extra:7,revolut:0,scale:[0,26],stack:12,polygon:26,shutdown:[4,23],shut:4,veri:12,ccd:[14,7],rebuild:12,updat:[18,30],height:[1,26],unit:[30,17],under:30,engin:30,mid:12,seidel:16,insert:[29,12],setup:32,complet:[19,25],provid:26,batch:[14,11],access:21,creation:[23,30],algorithm:[29,30,26,12],heightfield:[12,1,26,23],advanc:[30,7,32],bewar:12,pxvehicledrive4w:30,compon:4,redistribut:[23,3],introduct:[1,2,30,9,4,19,17,21,29,11,18,27,28,26,12],specif:19,repx:[14,19,27],determin:[12,16,17],snippetvehiclenodr:30,limit:[12,25,26,7,0],iter:16,beyond:30,featur:23,intern:18,place:25,quickli:30,guidelin:21,earli:12,order:30,debugg:[14,5,32,6],tempor:16,slowli:30,scene:[14,6,12,21,29,25,11,23,31],hidden:18,taskmanag:[14,21],telemetri:30,cpudispatch:21,custom:[0,32,27,3],profil:[12,32],correct:30,amort:29,preprocess:0,multithread:[23,21],tweak:12,well:12,tabl:0,serial:[14,19,25,27,23],getactor:12,manifold:7,buffer:[12,25,21],startup:[4,23],convex:[12,1,26],pxmeshqueri:1,perform:[18,9,7,12],merg:11,version:[19,30],event:[16,21],friction:[23,16,30],acceler:[9,30],bodi:[29,28,12,16,9],physx:[14,32,6,19,5,17,15,8,23,24,10,27,3],core:[14,24,6],state:[16,30],error:[12,23,32,17],migrat:[14,23,24,6],roll:30,slow:[12,30],detail:30,deprec:[24,6],dynam:[18,12,16,20],ground:12,over:[18,30],physic:[4,12],involv:21,integr:25,pxshare:24,tree:12,applic:[19,17],stream:[12,32],torqu:[16,30],phase:[29,12],name:19,valu:18,forward:30,user:[8,11,13,17],implic:[12,7],upgrad:19,pcm:7,pxscenequeryupdatemod:11,collis:[29,7,23],invers:20,snippetvehicle4w:30,level:[19,30],lightcputask:21,maxim:20,move:[18,30],geometri:[18,19,1,26],goe:[18,30],edynamic_aabb_tre:11,tank:30,curv:30,chang:[23,16],memori:[17,19,21,23,25,12],reduc:[12,20],prepar:0,actuat:0,singl:11,statist:22,jitter:12,appli:16,much:30,hierarchi:23,pxgeometryhold:26,pitfal:1,contact:[30,26,7,6],obstacl:18,render:[8,23],ackermann:30,incomplet:19,pile:12,shader:[0,30],implement:21,eno_block:11,basetask:21,articul:20,snippethelloworld:5,paramet:[12,23],indic:26,report:[0,23,7,17],"final":12,requir:19,solver:[0,16],invalid:18,tool:19,multipl:[30,11,21],continu:[23,7],pxextens:6,snippetloadcollect:19,extrapol:30,unnatur:30,pxsceneflag:6,everyth:19,type:[23,30,26,17],usag:[12,2,22],materi:23,pxscene:12,fix:[0,11],car:30,gauss:16,coordin:[12,20],setupvehicleactor:30,pxarticulationcach:20,popul:29,recoveri:18,capsul:[18,1,26],graviti:16,alloc:12,part:18,doe:30,visual:[14,32,2,6,12,5,8,0],side:32,never:30,shift:31,doubl:21,practic:[19,21,12],pxvehicleackermanngeometrydata:30,interfac:22,filter:[29,30,11,23],differenti:30,document:10,kinemat:[18,29,16,12],binari:[14,19,27],overlap:[18,12,1,11,23],clutch:30,pxvehiclewheeldata:30,vertex:26,unlock:0,more:30,assert:17,system:[12,27],pxfilterdata:11,axi:16,compart:23,notif:7,quickhul:26,snippetconvert:19,subordin:19,foundat:4,joint:[12,23,20,0],releas:[29,12],welcom:8,loop:28,activ:16,dll:4,posit:20,proper:12,pxvehiclesuspensiondata:30,time:[18,23],descript:23,network:32,snippetvehiclescal:30,mesh:[29,1,26,12],pose:[23,30],down:[4,30],build:[12,24,3],forc:[0,16],cct:18,thread:[12,25,21,30],hit:[18,11],issu:[18,12],tight:12,semant:21,pxvehicleenginedata:30,pxpruningstructur:11,charact:[14,12,18],discuss:19,depth:[19,1,12],sdk:[14,32,6,4,23,24,10,30],comput:12,penetr:[12,1],load:4,closest:11,enon:11,pair:12,gener:[26,30,11,3,6],rotat:30,edynam:11,estim:30,design:23,eany_hit:11,project:[0,20,3],pvd:[19,25,32],interest:29,physxcommon:4,pxqueryflag:11,unstabl:12,domin:16,pxvehicleclutchdata:30,epostfilt:11,thi:8,world:8,scenequeri:32,asynchron:21,enhanc:16,pxvehiclenodr:30,snippetvehicletank:30,estatic_aabb_tre:11,convent:23,quantiz:12,identifi:12,pxvehicledrivenw:30,wheel:30,flag:[12,32],substep:25,about:8,pxrigiddynam:30,aggreg:29,block:[25,11],auto:18,pxvehiclewheelssimdata:30,cmake:[3,24],setupdrivesimdata:30,veloc:[16,20],"case":[18,19],queri:[14,6,12,1,29,11,23,30],tire:30,persist:7,licens:15,mode:[18,16,30,11],pvdclient:32,code:[19,30],test:[12,23],snippetseri:19,pxvehiclegearsdata:30,disabl:[12,30],onli:26,vehicl:[14,30],constraint:0,field:[1,26],despair:12,pxrigidactor:6,breakag:0,pxgeometryqueri:1,cach:[18,19,11],restitut:16,soft:0,nvidia:10,fetchresult:25,cloth:[14,24],file:[3,32],flush:12,lock:[0,16,21],pxpruningstructuretyp:11,metadata:27,scratch:[12,25],trigger:[29,7],bound:12,graphic:18,environ:12,manag:[18,19,23,0,17],drivabl:30,climb:[18,30],preset:3,kill:12,snippetvehiclemultithread:30,spin:[12,30],sequenc:25,prismat:0,turn:30,eprefilt:11,pxvehicledrivetank:30,pxvehicleautoboxdata:30,modul:18,configur:[0,3],tune:[9,30,7],what:[9,5],pxbvhstructur:11,retarget:19,creat:[18,29,20,11],extens:[4,14,24,23],rare:18,extend:[32,27],sweep:[23,1,11],steer:30,steep:30,setupwheelssimulationdata:30,sleep:[12,16,20],through:18,broad:[29,12],helloworld:5,explod:12,deseri:19,multi:[12,30],model:[28,30],when:[12,30],face:26,defin:[0,11],control:[14,12,30,18],guid:[8,12,30,13],volum:18,touch:11,simul:[23,12,21,22,29,25,28],width:23,reconnect:19,shape:[29,23,7,12],concept:30,pxdelayloadhook:4,result:11,pxvehicletiredata:30,pxgpuloadhook:4,from:[14,23,24,21,6],addit:1,slope:30,invis:18,spu:14,initi:[30,1],share:12,createshap:6,drive:[0,20],raycast:[23,1,11,7,30],walkabl:18,plane:[29,1,26],gear:30},objects:{},terms:{locost7:30,consequ:[14,16,6,18,19,21,25,30],body1vel:0,pxvolumecach:14,linecolorhigh:30,hitbuff:[14,11],pxserialobjectid:[14,19],origin:[1,13,17,18,0,20,7,23,10,26],connect:10,convexmesh:26,alignextradata:27,erequire_rw_lock:21,hard:[8,12,20],jitteri:[28,13],balanc:[14,30,21,7],aabb:[18,12,1,29,11,26],eout_of_memori:17,pxmath:18,pxsweepbuffern:14,pxconvexmeshcookingtyp:[24,6],gettiredrivablesurfacetyp:14,x86:24,allocateobject:27,pxbroadphasecallback:[29,23],seen:[12,25,30],getid:19,usual:[0,16,1,4,19,20,21,7,18,32,11,12,30,22],fclose:19,unnecessarili:[18,7],endian:19,terraindata:12,none:30,express:[18,19,15,30],pxcomputeheightfieldpenetr:[24,1,6],happen:[16,32,18,12,1,7,29,25,11,19,26,30],discard:[9,23,11,0],bat:27,whatev:[20,17],bar:13,minimum:[16,12,1,8,23,24,25,11,0,30],shapegroup1:23,shapegroup0:23,pxextendedvec3:18,frequent:[12,11,7],degener:11,count:10,diff:30,ifdef:26,speed:[16,32,17,18,23,7,29,11,3,26,30],shown:[14,18,30],most:[14,0,16,1,3,17,18,19,5,20,7,8,29,32,11,12,26,30],notori:18,pxident:[14,16,17,19,26,12,30],updateobstacl:18,getnbcontrol:18,enclos:[26,1,11],hasimpuls:7,impress:7,solverbodydata:16,"class":10,decemb:[10,13],pxgetphys:19,pose:[1,13,18,0,20,7,9,10,26],setupsimdata:30,frustrat:7,choic:[17,18,29,26,11,30],gdispatch:9,within:[14,17,16,25,9,18,19,20,21,7,8,29,0,11,12,26,30],sweephitbuffers:14,addid:19,patholog:11,reus:[18,12,25,30],eheightfield:29,onobstaclehit:18,enhanc:[10,13],trick:30,srcasset:19,bodyatoworld:0,sphere:[18,9,12],eenable_pcm:[9,12,7,6],forc:[13,18,20,7,8,9,10],torqu:[0,20,13,10],closestpoint:1,addcollisionspher:14,exactli:[18,30,16,20,7],anchor:[14,0],pxrigidbodi:[14,16,28,7,11,12],pxcachealloc:16,stair:18,eabp:[29,24,12],precis:[18,12,26,3],angularmotionveloc:16,hitactor:14,heighfield:[9,10,13],snippetimmediatemod:16,sight:[1,11],attachshap:[29,17,19],grow:[9,30],step:[12,10,7],atrianglemesh:26,strongli:[17,0,20,26,3,30],street:[18,30],eface_index:[1,6],least:[16,30,6,0,7,26,28,12],mbatchreadi:14,relativepos:26,consider:[20,7,10],tip:18,strcmp:27,stress:0,form:[16,32,17,19,20,7,9,15,26,0,30],eshap:19,extradatastreams:7,inact:25,ecompute_convex:26,onoriginshift:0,emax_num_vehicle_typ:14,notabl:1,overview:10,advanc:10,common:[14,16,13,17,18,12,20,24,10,26,27,0],permiss:15,unregisterseri:27,manual:[16,32,6,4,19,21,29,11,18,0,30],esteer_left_control:14,createcollect:[14,19],box:[18,9,2,12],myobserv:14,feel:[18,16,13],mass:[13,17,12,20,7,10,26],duplic:[30,26],scale:[12,10,17,2,7],lesser:26,resetfilt:[14,16,26],polygon:18,asynchron:[12,10,13,0],principl:17,warranti:15,e2nd_from_front_right:14,client:[28,24],mask:[18,29],describ:[1,23,4,19,20,7,8,29,25,11,12,26,30],escene_query_shap:[18,29],pointer:[14,17,0,6,7,29,24,11,27,23,26,30],head:16,dstasset:19,warn:[14,16,12,1,21,9,25,11,27,26,30],wakeup:[14,16,25,24],explain:[30,26,21],evehicle_type_drivetank:14,eweld_vertic:26,variou:[14,0,16,32,18,19,8,23,25,12,30],transpar:[14,1],lateralslip:14,shrink:[12,25],heightfield:10,thumb:[30,7],maxnbcontactdatablock:[12,25],routin:[23,16,26],island:[12,16,7],dissip:[30,7],reject:[29,30,26],prior:[14,17,18,12,23,15,30],pin:9,halfforwardext:18,mfoundat:4,px_new:14,interv:[28,31],pxcontactpairextradataiter:7,emax_nb_sampl:30,metric:7,featur:[13,18,12,7,8,9,10,26,0],rigiddynam:30,intern:[12,20,10],mgearuppress:30,pxcontactpoint:[14,7],ceas:16,correlationdist:16,etank_wheel_front_right:14,nbcontactdatablock:[12,25],pitfal:12,rectangular:[12,26],getscen:[14,29],resiz:[18,12,11],storag:[30,26,17],style:[0,16,7,23],setsteerangl:30,egear_ratio:14,weld:26,collision_flag_wheel:30,bitwis:11,tirecontactshap:14,situat:[0,16,18,19,21,7,23,25,12,30],discov:30,well:[20,7],presenc:[14,12,21],tabl:[26,13],acapsuleshap:[23,26],strut:30,maxheight:12,pxshare:[13,6],inerti:23,facilit:[19,20,11],getsiz:26,exot:30,edouble_sid:1,satisfi:[19,16,30,0],pend:[14,16],patchcount:7,registerseri:27,flushsimul:[12,16,25],pxcontrollernonwalkablemod:[14,18],event:[14,13,6,18,12,23,7,29,24,0,15],getsteer:[14,30],coher:[30,11,21],pink:30,undesir:18,px_serial_file_align:19,dimens:[12,30],pxclothcollisiondata:14,edynam:[14,13],inward:30,conflict:[18,0,30,3],pxd6joint:[14,0,24,20],pxvehicledrivetankcontrol:14,pairflag:[14,29,30,7,23],detachshap:[29,16],intro:30,deprec:[14,10,13,1],pxpvdsceneflag:32,submarin:[29,7],furthermor:[8,14,25,2,21],snippetpxbvhstructur:11,mtoeangl:30,until:[14,0,16,30,17,18,19,1,21,7,29,25,11,28,26,12],permit:[16,6,12,7,21,15,25,0,30],pxvehiclesdk:3,statist:[10,13],breakag:13,govern:[30,25,26],processtouch:[23,11],mcamberstiff:14,clamp:[1,12,20,7,11,30],easi:[17,4,19,5,7,8,18,30],recent:[12,16,26],recordmemoryalloc:4,e9th_from_front_left:14,interest:[13,20,2,10,7],footprint:[3,32,11],filtershaderdatas:23,investig:22,snippetsplitfetchresult:25,inertiascal:7,hing:[0,20],ewheel:14,pxspatialindex:23,mimic:30,optimum:30,column:[12,26,17],posesweptagainst:1,solv:[14,16,18,0,9,24,30],columnscal:[12,26],e4th_from_front_left:14,both:[14,0,16,30,6,18,19,1,7,21,25,29,24,23,11,27,28,26,12,32],pxdeletionlisten:14,pxmeshpreprocessingflag:[12,26,6],pxcapsulecontrollerdesc:18,intrus:3,sole:[14,16,30],collision_flag_ground:30,evehicle_type_drive4w:14,prepar:27,concret:[19,11,0,17],echannel_steer_left_control:14,mfrontleftrightsplit:30,linvel:16,partit:[0,11],timestep:[14,16,18,0,23,30],assign:[16,23,19,21,29,11,30],slight:[18,12,20,30],appli:[14,17,1,2,9,18,12,20,7,8,29,10,26,0,13],aboxshap:26,lockparticledata:14,mytrimeshshap:26,three:[0,16,18,19,21,7,8,29,25,11,27,12,26,30],fluiditi:7,millikenresearch:30,action:[18,29,30],packag:[18,0,30,21],pxheightfieldgeometri:[12,26],forth:[18,19,16,30],anyhit:[1,6],demonstr:[14,16,18,19,7,9,25,26,0,30],search:[26,1,11,30],shader:[9,7,13],pxfoundationdelayloadhook:24,begin:[12,25,26,7,30],factori:27,getinboundjointdof:20,behind:[30,1],edisable_clean_mesh:[26,6],eaccel_control:14,crash:[29,12,21,3,19],scenequeri:13,report:10,expos:[23,20,1,30,17],dure:[0,16,30,9,18,19,1,21,7,29,23,25,11,27,28,26,12,32],pxvehicletyp:14,pxfilterobjectattribut:[14,29,30,7,23],"public":[14,32,17,4,3,21,7,25,27],menu:[19,32],scenedesc:[9,21,7],disjoint:12,nx_default_sleep_lin_vel_squar:23,helper:[16,23,18,19,1,21,7,29,11,12,26,30],ship:23,pull:[0,30],everyth:[18,1,26,13],repxutil:14,foot:18,shapeb:[19,23],pxvehicledrivetankwheelord:14,materi:[13,12,20,7,15,10,26],etank_wheel_2nd_from_front_right:14,suspjounc:14,arriv:21,downward:[18,16,30],invers:[10,7],invert:18,far:[6,0,1,29,25,26,30,31],edisable_grav:16,"1st":18,popul:[0,10,13,1],rope:12,parallel:[14,0,16,23,12,21,29,25,28,30],thecook:26,transmit:32,cross:[18,16],gameplai:[18,30],prune:[29,11],middlewar:21,becom:[14,16,23,19,21,7,29,12,30],costli:29,pxvehiclemetadataobject:27,swept:[14,30,1,11],machin:21,infrastructur:0,getlatslip:30,side:[16,13,17,25,19,5,1,9,10,26,23,30],lot:[16,18,12,7,23,26],hitinfo:1,shift:[0,10,26,7,13],nxuseroutputstream:23,shapetransform:12,nbvelocityiter:16,neither:[18,0,7,23,15,30],signifi:0,setupfilt:29,computation:30,nxusertriggerreport:23,sethalfsideext:18,lod:[30,21],log:[20,11,21,17],deep:7,pxvehiclewheeldata:13,dstbinfil:19,getqueryresultbuffers:30,mymateri:[29,26,23],edetect_discrete_contact:[14,7],secondari:30,issubordin:27,setmassandupdateinertia:12,edrive_limits_are_forc:0,relat:[14,1,23,18,19,20,7,29,11,30,31],global:[14,16,2,9,4,12,23,29,11,18,0,26,30],compart:13,gcudacontextmanag:9,contactcount:7,wrong:[12,30],shortest:[0,20],foundat:[13,17,8,23,24,10],fold:14,deem:25,dual:27,emodify_contact:7,commonli:30,imperfect:[18,30],forcelimit:[0,20],deploy:3,convexedgethreshold:26,tirelateraldir:14,pxps3configparam:14,px_def_bin_metadata_base_class:27,obviou:30,progress:11,efast:14,develop:[32,3,29,26,12,30],esimulation_shap:29,just:[0,16,17,4,19,20,7,29,25,11,18,12,26,30],complic:30,e7th_from_front_right:14,enotify_threshold_force_persist:7,down:[13,17,18,6,7,10],egpu_compat:[9,12],childattach:20,entireti:19,springi:0,blow:29,getcontactoffset:18,isrigidbodi:6,familiar:[0,20],samplesubmarin:[29,21,7],cookclothfabr:14,addobstacl:18,nxu32:23,minresponsethreshold:0,mgeardownpress:30,group3:11,cach:[20,10],setrestoffset:[23,24,26,7],impact:[14,16,32,2,18,12,1,7,29,11,0],imagin:[12,30],mspringdamperr:30,detach:[29,32,17],did:[16,23,18,7,29,11],pxtrianglemesh:[19,17,26,12],qualiti:[0,8,23,26,11,30],print:17,qryfilterdata:30,pxobstacl:18,overhead:[16,32,12,1,21,7,9,11,30],pxparticlesystem:23,doublesid:1,nxcooktrianglemesh:23,etire_long_slip:14,vc140:[3,24],pvd:[14,2,6,12,5,17,8,10,3,13],nbpair:[16,7],fetchresultsfinish:25,protect:[21,17],nx_max_angular_veloc:23,setstretchconfig:14,univers:0,flatten:7,breakpoint:17,repxupgrad:14,eccd_respons:14,deriv:[32,19,20,7,15,0],properti:[2,9,0,20,7,29,10,27,23,13],selfcollis:29,edrive4w:14,identifi:17,nbhit:[14,11],setrevolutejointlimit:0,appar:7,request:[16,17,12,21,7,9,25,0],mminnormalisedload:30,pxserialframework:27,block:[14,0,13,18,12,29,10,26,28],pxbatchconstraint:16,skill:30,watch:[30,32],addlocalforceatpo:16,getfaceindex1:7,makefil:[3,24],setlat:30,veloc:[10,7],world:[2,17,18,12,5,20,7,10,26,13],getnbcontactblocksus:12,emesh_both_sid:[1,6],ezero_area_test_fail:26,reciproc:30,altogeth:23,peak:30,codepath:18,benefit:[8,9,25,21,0],pxscenflag:23,unitdir:[1,11],mrearbia:30,refactor:14,better:[14,0,23,18,19,20,21,7,29,25,11,12,26,30],from:[1,13,17,18,12,5,20,7,8,9,15,10,26,27,0],stateless:0,procur:15,pxregisterlegacyheightfield:6,pxgeometryqueri:[10,26],pxdumpmetadata:14,restitut:[13,18,0,20,7,10],realli:[29,30],newsprungmass:30,mutual:0,opengl:17,size_t:[19,25,27,17],submiss:21,centimet:[30,7,17],startsimul:21,amd:17,tricki:25,requisit:27,setstabilizationthreshold:12,"16k":[12,13],scenario:[12,21,7,23,11,0,30,31],blade:16,mani:[14,0,30,17,4,19,23,21,7,22,29,25,11,18,28,12],pxoverlapbuff:[14,11],henc:21,iskindof:[27,17],signific:[14,2,6,12,20,21,8,23,24,25,11,26,30],suddenli:[18,12,22],computeinteract:18,edriv:14,debugg:[12,10,2,17],pxhitbuff:[23,11],linearimpuls:7,get:[14,16,32,6,4,19,7,21,29,24,25,11,18,27,23,26,31],getposit:18,great:[18,29,26,7,12],wheelconvexmesh:30,limitpair:0,altern:[14,0,16,32,6,4,19,17,7,29,24,25,26,18,12,30],restdistacnc:7,what:[18,12,10],sweepshap:11,retarget:27,pxphysicsapi:[23,3],tessel:10,einternal_has_face_indic:6,getphysxfoundationdllnam:[4,24],indices32:26,eanalog_input_thrust_left:14,faceindic:26,teleport:[18,29,16,31],hazard:21,inaccuraci:0,some:[20,7,29,25,11,14,17,0,1,23,26,30,31,2,4,8,9,16,18,19,21,22,3,27,12],eenable_ccd_frict:7,physxextens:[14,6,18,19,27,0,30],refit:11,decid:[18,21,8,29,27,30],suppos:16,compens:30,unaccept:[0,23],soft:7,rais:[14,16,9,12,6,7,22,29,25],queryflag:30,written:[23,15,20,11,27],overlapobbshap:23,decis:[12,30],eprofil:[12,32],raii:21,screen:30,give:[16,23,0,1,7,22,29,19],separationdist:7,isserializ:[14,19],pxcapsulegeometri:[23,26],frame:[14,0,16,30,9,18,19,20,7,22,8,29,11,28,26,12],nbmateri:[12,26],emul:[6,18,1,23,24,11],drove:30,contactpatch:[7,6],mbp:[29,9],close:[1,20,7,26,11,30],pxshapeflag:[14,29,18,2,7],slopelimit:18,onc:[16,6,18,19,20,7,21,25,11,12,30],minim:[16,3,18,12,1,21,29,24,25,11,0],occupi:9,coincid:0,total:[14,9,18,12,7,29,11,30,31],bin:[3,24],pxdelayloadhook:[24,13],zoneend:32,anticip:30,displai:[19,25,9],msize:27,pxprofilercallback:32,sign:[4,12],acquirerefer:[17,6],px_delete_serializer_adapt:27,filterdata:[29,30,11,23],pxvehicledrivetankrawinputdata:30,wheelmateri:30,pxvehicleupdatecmasslocalpos:30,particip:[29,30],nbpolygon:26,want:[16,17,12,23,21,7,29,11,0,26],pxserializationcontext:27,setminccdadvancecoeffici:7,spu:[23,10,13],eall:[4,20,32],createregionsfromworldbound:29,contact:10,setlinearlimit:[0,24],math:10,drive:[10,7],triangles:1,safe:[17,18,12,21,7,29,25,0,30],acapsuleactor:[23,26],srcwheel:30,com:[30,32],enginegraphposx:30,conveyor:[16,7],terrain:[12,30,26],than:[12,20,7,3,17],separ:[14,0,16,1,6,18,19,20,7,21,23,32,11,12,30],equival:[14,1,23,0,20,29,26,30],choos:[14,12,30,7,0],technolog:[8,23,30],horsepow:30,hfactor:26,sticki:7,ackermann:13,strong:[18,29,20,30],threadbatch:14,angular0:0,toler:[16,17,4,0,26,30],subtyp:17,eswept_integration_linear:14,articulationlink:20,uniformli:[16,11,7],heightfieldpos:1,pxgeometri:[17,16,29,1,23,11,26],vehiclewheelqueryresult:[14,30],pxhullpolygon:26,file:[12,10,17],jointconnectionhandl:32,afterward:12,pxgeneratecontact:16,write:[14,4,19,21,18,25,27,12,28,30],unbreak:0,compris:[8,5,20,21],getnbobject:19,pxrigidbodyflag:[14,16,7],callbackfinishtask:25,pxtoolkit:[4,14,26],weak:30,inter:[21,7],pxmeshcookinghint:26,finit:7,contactstreamcapac:9,areatestepsilon:26,efirst_user_extens:27,buildtool:3,eprevent_climbing_and_force_slid:18,friendli:16,edrive_en:0,asleep:[16,30],setlimit:[25,12,20,0],refer:10,hitcount:1,richer:[24,6],cloud:26,estiff:14,blend:30,llindex:20,pxtoler:17,settransform:16,therebi:[30,25],basic:[2,17,18,12,10,27],outermost:12,e6th_from_front_right:14,afford:30,pxraycastbuff:[14,11],isrigidstat:6,simplifi:[14,19,12,11,28],pxconstraintsolvehint:0,corioli:20,adjust:[14,17,0,7,9,12,30,31],getcontactpoint:7,exceed:[0,20,1],pxvehiclecopydynamicsdata:30,meshcookinghint:[12,24,26,6],gpudispatch:[9,21],slower:[18,16,30,17],direct:[1,18,12,20,21,7,23,15,11,0,26,30],src:[30,27],calcul:[14,16,1,12,20,21,7,0,30],pxheightfieldflag:[12,26],pxsolverbodi:16,pxvehicletireloadfilterdata:13,deform:10,snowmen:16,erevolut:20,gettolerancesscal:[9,21],unbound:[14,0,10,13],pxplane:17,callback:[10,7,17],edisable_simul:[29,16],inclin:30,steal:21,pxclient:24,tiresurfacemateri:14,velocitythreshold:0,click:5,definit:[19,30,27],etrigger_shap:[29,7],userraycastresultbuff:[14,30],digit:30,pxlightcputask:[14,21],precomput:[12,26,11],option:[14,0,9,16,2,1,6,18,19,5,20,7,8,29,10,11,27,12,26,13],immedi:[14,13,18,7,10,26],settessel:18,mspringstrength:30,handbrak:30,pxscenequeryfiltercallback:[14,6],requir:[14,17,16,1,3,29,18,6,5,20,7,9,23,0,11,27,12,26],recipwheelradiu:30,dai:3,util:[17,4,19,20,23,25,26,30],epre_solver_veloc:7,updatest:21,settodefault:26,fulli:[18,12,7,8,11,0],linux:[19,5,24,27,9],contactimpuls:[7,6],aligndata:27,resist:[0,16,20,30],ftell:19,createcach:20,fread:19,fire:[5,21],gavoidedphysxtyp:27,wors:[20,26,21],pxvehicleextensionapi:27,thick:[26,6],getnam:21,partner:25,setcontactreportthreshold:7,pxdefaulterrorcallback:[4,17],"int":[14,25,17],pxtransformfromseg:26,setupdrivablesurfac:30,word:[18,23,11,7,17],advancecoeffici:7,rigidli:0,chassismateri:30,resttireload:30,x86_64:24,inf:[1,11],work:[14,0,16,1,3,6,4,28,17,7,21,25,29,24,23,11,18,12,30,22],accessor:14,road:30,runtim:10,pxcontactpairheaderflag:[21,6],files:19,pxrevolutejointflag:0,diagnos:12,coordin:10,maxqueri:14,"0xcd":27,although:[0,16,19,5,20,7,29,11,12,30],restart:1,onto:[9,30,0],inflat:[12,23,1,7,6],nx_timestep_fix:23,pxregisterarticul:4,"2nd":[18,30],dll:[13,3,8,9,24,10],pxarticulationjointreducedcoordin:20,mcamberatrest:30,contactbuffercapac:9,engin:[0,13,23,18,12,5,8,29,28],pxgeometryhold:10,confus:[14,18],setrigiddynamicflag:29,complianc:20,via:[14,2,17,0,1,21,23,25,11,30],inelast:16,access:[14,16,13,17,18,19,20,7,22,29,23,10,11,0,26],pxbroadphasetyp:[29,24,12,9],suppli:[19,20,26,30,0],input:[17,18,12,5,1,25,11,0,26,30,31],disappear:12,testforoverlap:26,fluid:23,eengine_rev:14,otherwis:[14,18,0,7,29,15,11,3,26,30],slip:30,reson:16,getexternalrefer:0,upon:[29,25,32,30,0],mention:[9,16,12,30,6],dostuff:14,newli:[14,6,19,7,21,29,24,11,12],cct:[12,10],tangentialspr:14,iter:[1,13,6,12,20,7,29,10,26,0],minheight:12,pxdefaultcpudispatchercr:[9,21],simpler:[23,30,21,7],wheelomega:30,item:25,constraint:[13,18,12,20,7,8,9,10],mismatch:[4,30],pxvehicledifferentialnwdata:30,temp:9,surpris:25,dstmetadata:19,pxtype:27,oncomshift:0,put:[14,18,3,20,21,7,29,11,26,30],alter:[30,20],numrow:26,inconsist:[12,21,0],pxf32:[14,12,1,18,30],minehead:29,unvalid:26,resultbuff:14,registri:[19,27],preced:[30,20,21],createconnect:[14,25],pxvehicleisinair:14,revers:[30,1,26,21],removeactor:16,pxvehicledrive4wwheelord:[14,30],row2:26,row0:26,row1:26,longitudinalslip:[14,30],architectur:[12,21],manifold:10,download:32,reset:[14,16,20,25,11,30],boxgeom:26,awkward:26,care:[16,17,0,21,29,28,30],player:[18,12,30,7],numid:19,startup:[5,10,26,13],declar:[14,29,0],littl:[18,12,16,30,21],getwheelrotationangl:30,maxsweepsperexecut:14,pxu32:[14,0,16,1,2,9,4,19,20,7,21,29,23,25,11,18,27,12,26,30],enableshapeincontacttest:29,crouch:18,implicit:[14,28,0],etank_wheel_9th_from_front_left:14,pxgaussmaplimit:6,etwist:[0,20],toolkit:26,crab:[29,21],planetoid:18,obei:[30,25],pximmediatemod:16,list:[14,16,32,23,19,1,29,15,25,30],numtrisperleaf:[12,24,26],remedi:30,registerrepxseri:27,pxprofilezon:24,nx_default_sleep_ang_vel_squar:23,robust:[14,16,18,12,20,7,8,26,30],deal:[18,19,12,7,28],samplecustomgrav:[18,16],bouncethreshold:16,getsuspjounc:14,core:[13,18,12,8,10,27,3],pxsetphysxgpuloadhook:4,underneath:30,physxfoundation_x64_renam:4,caveat:[16,17,19,20,9,30],etank_wheel_3rd_from_front_left:14,pxvec:26,seamin:7,bugfix:19,effici:[16,2,18,19,7,21,29,11,26,30],pxvehicledrivetanksmoothdigitalrawinputsandsetanaloginput:30,whether:[14,16,1,30,3,9,4,19,6,7,21,29,15,11,27,0,26,22],aris:[29,16,30,15,31],dedic:[14,18,1],phase:[20,10],arbitrarili:[0,1],plain:[14,19],valu:[12,20,7,17],unevenli:30,cascad:27,onjointbreak:0,illeg:[25,21],upgrad:[8,23,24,6],incur:[9,16,20,1,12],setdigitalhandbrak:30,pxconstraintproject:0,elimin:30,"catch":12,ccdmaxpass:7,mswitchtim:30,etank_wheel_1st_from_front_right:14,behaviour:6,regular:[14,16,12,29,24,26],summar:18,"abstract":[14,21,17],chanc:[31,11,21,7],gsteervsforwardspeeddata:30,intend:[18,29,31,19],jth:26,chang:[13,17,18,12,20,7,8,10,0],lack:[18,30,20],elimit:[0,20],respect:[14,17,16,3,6,12,20,7,21,8,9,11,0,26],omit:[4,3,1],pxrigidbodyext:[14,16,12,20,23,26],pxtrianglemeshdesc:26,disp:18,maxraycasthit:14,enabl:[14,17,9,16,2,1,6,4,19,5,20,7,8,29,23,0,26,18,12,30,32],hash:[23,16],"50m":12,proxim:[8,7],createvehicleactor12w:30,recoveri:[20,10],baseid:19,commensur:30,postsolverveloc:7,overridden:17,verifi:[26,27],stepper:28,flushqueryupd:11,hood:18,icon:5,aheightfield:26,user_releas:6,pxscenelock:21,pxrepxseri:[14,27],youtub:30,snippetraycastccd:7,pxcontrollermanag:[14,18],e3rd_from_front_left:14,linear:[18,0,16,30,7],began:14,ramif:14,sort:[18,30,7],evelocity_chang:[16,30],idbuff:19,setgroupcollisionflag:23,gaussmaplimit:[12,24,6],plugin:0,maxvert:26,setsceneparamint:14,theori:[0,15,30],setminlongslipdenomin:30,contactdesc:16,solvehint:0,degre:[18,0,16,20,30],height:[18,12],environment:23,sometim:[16,2,18,12,1,11,0,26],pxbatchqueryprefiltershad:11,physxsampl:26,equickhul:6,restrict:[16,1,23,18,20,21,29,25,30],maxconstraint:0,gravit:[14,16,30],homogen:17,regener:18,savecel:26,convex:7,pxrigiddynamicactor:30,acceler:[12,20,7,10],previous:[14,16,6,18,12,1,29,24,11,19],getnbrow:26,pxcontrollerfiltercallback:18,convexdesc:26,gettriangl:[1,26],window:[3,17,19,1,9,24,25,11,27,23],emax_nb_drivetank_analog_input:14,filtermask:29,erear_left_wheel:14,customclass:27,eccd_linear:14,pxallocatorcallback:[23,25,27,17],trepxid:14,wheeltorqu:30,face:[9,10,12],sqrt:[30,26],hook:[4,23],caus:[14,0,16,30,3,19,21,7,29,15,25,11,12,22,31],lockwrit:21,pxquat:[14,17,12,20,23,26,30],thank:18,crack:7,unpack:[8,26],rim:30,amateri:26,nx_ignore_pair:23,nx_skin_width:23,variabl:[14,32,18,12,7,8,29,30],nxsphereshap:23,surprisingli:12,disabl:[7,17],might:[16,30,2,3,6,18,19,7,21,29,24,25,11,12,26,22],filter:[1,13,18,0,20,7,9,10],freeli:[18,0,30],attributes0:[14,29,30,7,23],attributes1:[14,29,30,7,23],associ:[14,16,17,0,21,9,26,30],ecallback:29,pxarticulationjoint:20,foundlostpairscapac:9,game:[18,19,5,20,21,7,8,29,25,12,30,31],mrearwidth:30,infinit:[16,18,28,1,7,0],memory128:19,more:[20,7,29,25,11,12,14,17,28,1,23,26,13,4,8,9,10,16,18,19,6,21,22,27,0],repres:[0,16,32,3,17,18,12,20,8,29,25,11,19,26,30],camberangleatrest:30,suspens:30,px_assert:[26,27],diff4w:30,extrem:[9,1,7,30,12],runprofil:21,pxvehicledrive4w:14,snippetconvert:13,suspend:[29,30],behav:[0,16,20,7,30],usag:10,snippetvehiclewheelcontactmod:30,titl:30,pertain:4,attempt:[9,18,12,1,21,7,29,0,30],pxi32:14,recycl:7,snippetvehiclescal:13,rcvd:30,emax_nb_vehicle_typ:14,removerefer:[25,21],spring:[14,0,20,30,23],troubl:[18,12,7,31],"null":[14,1,4,19,20,21,18,25,11,27,0,26,30],organ:14,smooth:[28,30,26,7,0],hit:[12,10],hfsampl:12,faster:[14,0,32,18,12,1,29,11,19,26,30],centimetr:30,"_debug":26,nasti:0,explanatori:25,pxvehicleenginedata:13,pxcreatebatchqueri:14,linecolorlow:30,px_new_serializer_adapt:27,physx3checked_x64:24,him:18,rearrang:30,centr:30,unbal:11,mwheel1:30,edetect_ccd_contact:7,malloc:[25,19,23,17],setkinematicsurfaceveloc:16,wish:[4,16,30],unsign:[14,19,23],planetoler:26,clearli:[30,25],accept:[16,12,7,24,26,3,30],lie:[0,26],lib:[19,24,3],nxgetcookinglib:23,px_foundation_vers:[24,6],travel:[18,30],now:[14,16,6,4,28,17,7,21,18,24,25,23,12,26,30],glmatrixmod:17,nor:[18,0,15,23],setmass:[12,16,30],stride:[12,26,11],eenable_activetransform:24,pxsolverconstraintdesc:16,getlinearlimit:[0,24],pxclothfabriccook:14,sample_alloc:29,non:[14,16,9,4,12,6,7,29,24,23,11,18,27,0,26,30],frictionoffsetthreshold:16,pxvehicledrivesimdatatank:30,aggreg:[12,10,13],thu:[16,2,18,12,1,7,29,11,26],learn:[8,5],wakecountervalu:16,dash:12,addarticul:20,px_profile_start_crossthread:24,elsewher:23,"case":[12,20,7,10],pxraycastcallback:[14,11],straight:[18,30],pseudo:[29,30,12],mode:[12,10,7,3],pvdclient:13,spherepos:1,mark:[6,4,12,21,8,18,25,27,30],meshweldtoler:26,cast:10,pvd_host:[4,32],tradition:18,e5th_from_front_left:14,broadphas:[29,24,22,26,12],edefault:[29,1,11,7,23],targetveloc:0,pxconvexmeshgeometryflag:[12,26],obvious:[18,30],convinc:[18,7],geterrorhandl:32,buildgrbdata:[9,12],assist:[12,0],gtreasurefound:29,cosf:18,sethalfheight:18,setenginedata:30,pxvehiclesetbasisvector:30,angularimpuls:7,silent:12,getsuspraycast:14,gettwist:24,environ:7,clip:[12,26,11,7],groups1:23,inbound:20,snippetvehiclemultithread:13,px_max_f32:[12,20,30,0],aheightfieldshap:26,explos:11,eenable_enhanced_determin:[12,16],pxheightfieldformat:[12,26],mactor:27,pxerrorcallback:[23,17],somewher:[18,30],mixtur:[0,7],pendulum:30,filtershaderdata:[23,11],vertic:[18,12,1,9,26,30],broadphasetyp:[9,12],larg:[14,17,16,1,2,30,9,4,19,20,7,22,8,29,23,0,11,18,28,26,12],pxbvhstructuredesc:11,topolog:[26,11],overflow:[14,11],egearsratio_count:30,getindexbuff:26,pxserializ:14,level:[14,16,1,13,3,17,18,12,5,20,7,23,26,0],mdrivesimdata:30,localfram:0,autogener:27,broad:10,accordingli:[30,31,6],epost_solver_veloc:7,"3rd":18,cambertireforc:30,model:[16,1,13,17,18,12,20,10,0],quick:[19,30,22],pxpi:[0,30,26],pxmeshmidphas:[12,26],wrapper:26,symmetr:23,earlier:[14,12,1,26,30],nv_use_debug_wincrt:3,volum:10,certainli:30,half:[1,26],pxi16:12,present:[16,19,5,0,30,12],"4x4":23,tire_type_norm:30,aka:21,insight:22,drift:[0,20],suppresstrianglemeshremapt:[12,26],halv:26,evisu:[0,2],denot:[11,21],setwheelrotationspe:30,pxjointangularlimitpair:[14,0],invalidatecach:18,held:[12,25,0],simultan:[14,19,21,7,29,0,30],realiti:21,setdrivetarget:20,econtact_event_pos:7,pxscenequeryreport:14,renderbaseactor:26,px_c_export:[16,30],serializecollectiontoxml:[14,19],element:[12,30,17],wheelcenteractoroffset:30,knowledg:30,quantifi:30,vehdrive4w:30,impulsiveforc:16,help:[14,32,6,12,21,22,8,25,26,27,0,30],startgearchang:30,pxmetadata:27,samplenorthpol:[18,26],handler:[32,17],bouncethresholdveloc:23,line:[14,0,16,2,3,17,18,19,1,21,29,24,25,11,12,26,30],setlinearveloc:[14,16,25],cuda:[8,9,21],vertexindic:1,spot:30,unregistercustomclassrepxseri:27,everythingcollect:19,cooktrianglemesh:[23,26],pheightfield:26,starv:21,link:[4,28,20,7,29,24,26,19,30],start:[14,0,16,1,30,6,4,19,5,20,7,21,29,23,25,11,18,27,28,26,12],carefulli:[14,18,23],suspsprungmass:30,pxinputstream:19,myphys:30,fdani:[14,11],nvcloth:24,back:[14,16,1,9,18,12,20,21,7,29,24,25,23,0,30,32],elbow:0,longslip:30,adjac:[28,26],equal:[12,16,30,21,0],though:[16,17,18,23,29,25,12,30,31],mstepsiz:[28,23],processhit:14,fatalerror:[4,16,21],subtl:7,lightcputask:[14,13],maccumul:28,equat:[16,17,0,26,11,30],broken:[0,21],snippet:[12,20,7,9,10,26,3],specul:10,ecrab:29,numvehicl:14,air:[14,30],hierarch:[28,11],mcentrebia:30,receiv:[14,16,6,19,7,21,23,25,11,0,30],aheightfieldactor:26,guidanc:6,snippetdelayloadhook:4,getnbshap:29,jitter:7,badli:[30,22],lin1:0,lin0:0,contactoffset:[18,23,7],econtact_default:[29,30,23],complex:[0,16,32,18,12,5,7,8,9,25,28,30],aim:[12,30],consid:7,pictur:18,unless:[16,6,19,7,29,25,0,30],mcpudispatch:21,store:[14,17,16,1,3,9,25,19,20,21,23,29,24,0,11,12,26,30,32],disableshapeincontacttest:29,transforminv:20,enableshapeinscenequerytest:29,smallest:30,erigid_stat:[29,19],getwakecount:16,analog:[0,30],getupdirect:18,pxactor:[14,16,6,18,19,17,7,29,23,28],contactmodifycallback:7,numbound:11,instead:[14,0,16,1,3,6,4,19,20,7,29,24,23,11,18,12,26,30],altitud:18,next:[14,0,16,6,18,19,17,7,21,23,25,11,12,30],reproduc:[19,15],uint64_t:32,postfilt:[14,11],imposs:30,glpopmatrix:17,detriment:12,nbtriangl:1,francisco:18,hadhit:14,kinekinefilteringmod:[16,7],togeth:[16,1,19,20,7,8,29,11,0,30],autobox:30,exist:[14,32,17,18,12,1,7,8,29,23,11,0,30],address:[18,23,31,27],px_max_num_wheel:14,cone:[0,20],alwai:[14,0,16,3,6,4,19,1,7,21,29,24,23,11,18,12,30],pxpvdsdk:24,inde:[18,30],check:[7,17],mfrontrearsplit:30,triangl:[2,18,12,7,9,10],encount:[14,18,12,22,11,0,30],lend:[23,7],incollect:27,upward:[18,30],clearforc:[14,25],pass:[14,17,16,6,4,19,20,7,21,25,29,23,0,11,18,12,26,30],serializ:[14,13],past:[18,12,11],snippetnodr:30,execut:[14,23,12,5,21,29,25,11,0,30],repxextens:14,motion:[0,16,18,12,20,7,29,26,28,30],nearer:30,createconvexmesh:26,cheaper:[0,30,7],concern:[18,19,1,21,6],signatur:[4,0,24,1,6],createrigidstat:[12,16],nutshel:7,collid:[16,23,18,7,29,25,26,30],lane:16,land:18,anti:13,einvalid_paramet:17,encod:[29,30,0,12],theme:30,fourth:16,space:[14,16,1,17,18,12,20,8,29,23,11,0,26,30],midphasedesc:[12,26],observ:[14,30,25,26,17],impli:[19,15],pxfoundat:[17,4,6,24,25,30],"switch":[14,3,18,12,21,29,24,26,19,30],syntax:23,constraintbuffercapac:9,pxfilterflag:[14,29,30,7,23],physxfound:13,grid:[29,26,11,12],linearveloc:7,outlin:12,easier:[4,12,30,0,6],visibl:[18,12,21,8,25,11,30],scalabl:8,interact:[12,20,7,10],particular:[14,16,1,18,12,20,21,7,29,24,0,30,15],cleartorqu:[14,25],sln:5,sweepresultss:30,revolut:[20,13,10],addcollect:[14,19],helloworld:10,setscenequeryfilterdata:[14,30],ebuild_enabled_commit_dis:24,anisotrop:[0,13],shutdown:[5,10,26,13],pxqueryfiltercallback:[14,23,1,11,6],segment:[1,26],pxvehicledrivegraphchannel:14,updat:[12,20,7,10],remain:[16,1,17,18,0,20,7,29,11,26,30],miscellan:9,retest:30,unit:10,createbatchqueri:[14,30,11],echannel_suspforc:[14,30],createcontrol:18,consequenti:15,setup:[14,13,21,29,24,10,11,30],removed_shape_oth:6,egraph_type_engin:14,elapsedtim:18,batch:[12,10,13],parent:[29,20,0],setmetadata:19,createvehicleactor:30,addion:26,have:[20,7,29,25,11,12,14,17,0,1,23,26,30,2,4,8,9,16,18,19,6,21,22,24,28],complementari:30,refilt:23,exce:[16,0,7,9,25,12,30],anatom:[0,20],setwakecount:[14,16,25],pxrigidstat:[28,16,11,0,12],subgraph:0,legaci:[6,18,7,9,24,30],computelinearangularimpuls:7,abbrevi:1,lowlimit:20,drivetyp:23,unimport:12,reclaim:[12,25],echannel_normalised_tireload:30,pxd6drive:0,obj:27,revert:30,nthread:14,tunabl:30,readextradata:27,obb:26,guidelin:13,concurr:[19,21,7,29,25,30],mainli:[12,22],gone:[23,24],typedef:[0,30],tempor:[8,18,10,13],pxpairflag:[14,23,21,22,29,24,7,30],neglect:16,misus:16,lastli:30,setdampingcoeffici:14,matter:[18,30],pxbvh34midphasedesc:[12,24,26],decad:18,abbot:30,px_binary_serial_vers:19,movabl:0,telemetri:13,espu_raycast:14,discret:[14,18,28,7,8,9],correct:[0,16,13,17,18,19,20,7,24,25,26,12],increment:[16,6,19,17,21,8,23,11,12,30],fight:0,centrifug:20,shrunk:12,linker:4,prefilt:[14,23],serial:[10,17],cannot:[14,16,6,18,19,7,21,29,25,11,23,26,30],hull:[9,12,26,6],advantag:[14,16,18,28,21,7,8,9,25,12,30],bvh33:[24,6],trianglemesh:[4,29,26],pxcreatecollect:[14,19],diagram:[28,1,30,12],pxcapsulecontrol:18,manner:[18,30,26],domain:8,enginegraphposi:30,metal:23,still:[14,16,6,4,12,7,21,9,24,25,26,18,3,30,31],pxcollet:19,pxmeshscal:26,pxoutputstream:[14,19,27],friction:[18,10,7,13],sure:[14,2,18,12,29,26,27,30],emtd:[12,1],top:[17,4,12,5,18,26,30],pxrigidactorext:[17,16,6,19,20,29,24,23,11,12,26,30],orphan:19,writebuff:26,setmaxprojectioniter:20,collectionscen:19,setinvinertiascale0:16,nativ:9,elong:30,slow:7,slot:21,ptr:[25,17],outsid:[6,18,19,7,21,8,29,30],pxboxobstacl:18,plastic:23,materialcollect:19,barycentr:1,rout:32,reflect:[16,18,28,21,7,25,26,30,31],triggershap:29,getnbcontactdatablocksus:25,simplest:[28,5,29,25,11,30],underli:[18,0],includ:[20,5,7,29,11,12,14,17,3,1,23,15,26,30,32,8,16,19,6,21,24,27,0],physic:[20,10],perceiv:[28,12],pxcontrollerbehaviorflag:18,sampleallocat:21,speak:[18,29,11,12],stream:7,elimit_en:0,constrain:[16,4,19,7,18,0,30],midphas:[26,11],group:[14,16,23,1,29,11],etc:[14,9,16,3,6,18,12,5,20,7,22,8,29,25,26,0,30],getenginerotationspe:30,px_min_heightfield_y_scal:12,gettiredrivablesurfacecontactpoint:14,show:[14,0,16,28,1,21,19,29,26,27,12,30],pxscenequeryupdatemod:[24,10,13],cosin:18,einitial_overlap_keep:14,bodyaworldoffset:0,pxcloth:[14,23],modifysampl:26,apart:[25,26,7],ineffici:30,locat:[16,17,18,19,20,7,25,12,30,31],maximpuls:[0,7],jacobian:20,local:[14,16,2,6,12,1,7,29,24,23,11,0,26,30],echannel_engine_drive_torqu:[14,30],intersect:[23,1,29,11,26,30],enforc:[14,0,20,30],setdistancejointflag:0,walk:18,bridg:[18,12],wall:[10,7],coulomb:16,impract:30,mlatstiffx:30,seek_set:19,fifo:21,pxraycasthitbuff:14,disconnect:[30,32],integr:[14,17,16,2,6,0,20,8,23,24,26,13],unlockread:21,doll:[19,20,21,0],your:[17,16,32,2,3,9,4,12,6,21,29,24,23,11,18,0,26],themselv:[18,21],incred:30,question:[8,23,16,30,32],mlatstiffi:30,startindex:1,getrigiddynamicactor:30,pxvehicledrivetankcontrolmodel:14,much:[0,16,13,23,18,28,7,19,29,25,26,12],rev:30,"_lignv3ptiq":30,stopsimul:21,toe:30,getinboundjoint:20,boundari:[19,1],flex:[18,24,6],sai:[23,18,12,29,19,30],whenev:[29,28,11,7,19],glanc:30,tempt:[18,28],san:18,pxcontactpairhead:[25,7],categori:30,sat:[7,6],px_debug:17,customcreatephys:4,termin:[4,14,7],substanti:[14,0],sweepdist:11,iscomplet:25,wai:[20,5,7,29,25,11,12,14,17,3,1,23,15,26,30,32,28,16,18,19,6,21,0],pxcreatephysicssdk:23,cylindr:[0,30],"import":[16,32,6,4,19,17,7,21,8,18,23,25,26,27,12,30],pxextens:[10,13,1],cachedindex:1,unfortun:[18,29,16,30,23],unnatur:[18,13],pxconstraintflag:[14,0,6],instruct:[12,17],innov:8,run:[14,0,16,22,2,3,9,19,5,20,7,21,8,29,24,25,11,27,28,30,12],pxclothfabrictyp:14,userraycasttouchbuff:[14,30],getinternalfaceindic:7,"\u03c9":24,mstrength:30,pxoverlapbuffern:14,setshapepairflag:23,win32:5,processblockinghit:14,mdownratio:30,quat:26,belt:[16,7],isn:30,massscal:7,quak:18,pxcontactset:7,mratio:30,pxcomputemeshpenetr:[24,6],kepler:9,presiz:[12,25],energi:[16,0,20,7,8,23,30],pxsimulationeventcallback:[16,6,0,7,21,29,25,23],tireforceappcmoffset:30,alloc:17,reloc:18,totalmass:30,terraindesc:12,overrul:7,mmaxhandbraketorqu:30,nbcol:26,preserv:[18,19,26,0,30,31],ragdol:[0,29,20,7,12],setbraketorqu:30,contributor:15,hello:5,doubl:[13,18,5,1,7,9],convexinputdata:19,liabl:15,allow:[5,7,29,25,11,12,14,17,0,20,23,26,30,32,16,18,19,6,21,22,27,28],ecapsul:26,basi:[9,17,12,20,29,30],gvehicle4w:30,mmaxsteer:30,gener:[12,20,7,10,17],slerp:0,echannel_clutch_slip:[14,30],purgecontrol:18,map:[14,19,1,26,30],boilerpl:0,tangenti:[20,7],max:[29,20,11,30,19],setangulardamp:16,getactivescen:14,cloth:[23,10,13],mac:19,studio:[3,5,24,17],forget:[4,30],fpu:[18,17],pxheightfield:[26,17],mai:[20,7,29,25,11,12,14,17,0,1,23,15,26,30,31,4,8,9,16,18,19,6,21,3,27,28],pxscenequeryimpacthit:14,xmlreader:27,instanti:[14,19,30,26],pxtask:[14,24],artifici:[18,29,12],activ:[14,13,23,18,12,7,9,24,10,0],futur:[14,16,6,18,29,30],bqthread:14,seriou:[29,17],histor:12,max_num_tire_typ:30,getnbvertic:26,steer:[0,13],number:[14,17,16,1,30,9,18,19,5,20,25,21,7,8,29,23,0,11,12,26,22],tether:14,built:[32,13,18,19,5,7,8,3,24,10,11,12],pxcontrollerflag:14,reconsid:16,sse:17,network:[4,19,13],setconstraintflag:0,batchhead:16,suffici:[16,1,17,19,20,7,9,23,11,12,30],avail:[14,16,1,3,9,4,28,6,7,21,29,24,25,23,18,12,26,30],eanalog_input_thrust_right:14,build:[20,10,17],underw:14,eblock:[14,30,11],data:[10,17],eenable_speculative_ccd:7,except:[0,18,19,1,21,3,30],macro:[4,24,17],third:16,triplet:26,getsuspensionforc:[14,30],etight_bound:[12,26],conserv:[0,7],pxhitflag:[14,6,12,1,24,11,30],portabl:8,perfect:[18,30,7],pxccthit:14,"long":[14,17,18,19,21,11,30],travers:[0,11],captur:[29,11],pxspatialoverlapcallback:23,deleg:17,after:[14,0,16,1,2,22,4,19,20,7,21,29,23,25,11,18,28,30,12,32],extrud:[18,6],reduct:0,mmaxbraketorqu:30,clarifi:30,correctli:[18,16,30,21,32],enorm_tire_lat_forc:14,getbehaviorflag:18,lockclothreaddata:14,child:[0,20],good:[1,18,12,5,20,8,29,15,32,11,0,30],pattern:[17,0,21,27,30,31],echannel_norm_tire_long_forc:[14,30],green:[18,21],pxcontactpairflag:[7,21,6],ecct_user_defined_rid:18,pose0:[16,1],pose1:[16,1],backgroundcolor:30,solverbodi:16,setwheeldata:30,later:[14,16,9,18,20,29,11,26,30],trystandup:18,instrument:[3,25,17],objecttofileimpl:27,insuffici:[9,7,12],wheel:[14,0,13],markserializedmem:27,erear_right:[14,30],depend:[14,0,16,32,3,17,4,19,1,21,7,29,24,25,11,18,27,12,26,30],nxjointdrivedesc:23,mypvdclient:32,hitflag:1,vehiclefiltershad:30,pxcook:[14,19,17,11,6],tiresurfacetyp:14,aconvexactor:26,pxraycastqueryresult:[14,30],createbinaryconvert:[14,19],fabric:[14,6],tire:[14,13],persist:10,touchdist:11,memori:10,nqueri:14,shell:[16,1],"18m":12,overrid:[23,16,32,11],downcast:23,meshshap:29,radiu:[14,18,0,23,26,30],thousand:9,stand:[18,12,8,29,26,19,30],pxscenequeryfilterflag:14,setackermanngeometrydata:30,merchant:15,significantli:[14,0,16,32,12,29,11,19,26,30],setdiffdata:30,elock_angular_i:16,fetchresultsstart:25,accomplish:30,privat:27,sanitybound:12,inarg:27,testccdfiltershad:7,msvc:17,actormass:30,temporarili:[23,30],primarili:17,removeaggreg:29,pxactortypeselectionflag:[14,6],benefici:[3,25],"char":[14,32,17,4,21,25,27,30],text:[19,2],trigger:[10,17],getshap:[29,26],graphic:[10,7,17],pxdeletioneventflag:14,alongsid:5,expect:[16,23,4,12,7,22,29,11,19,26,30],custompath:4,deltalinearveloc:[16,7],sqhitbuff:30,"25f":[30,26],sacrific:[16,20],modifi:[14,17,16,1,6,18,12,20,7,21,9,11,0,26,30],errorstream:32,isparticlefluid:6,pxvehiclewheelsdyndata:[14,30],api:10,eabort:17,setdominancegrouppair:16,modul:[12,10],cost:[9,12,20,7,29,11,30],extent:[18,29,30,26,0],middl:18,earli:7,creat:[10,7,17],affect:[14,16,6,12,20,7,21,9,24,30],mandatori:[11,21,7],extens:[13,17,18,0,1,7,8,9,10,27,3],getsuspensiondata:30,product:[2,1,21,8,15,30],eerror:20,extend:[18,10],mysimulationcallback:7,leg:[18,0],highend:27,getangularveloc:[0,16],dist:1,constantblock:[14,23,0,7,29,11,30],translateptr:27,setdrivenwheel:30,fact:[8,29,30,7,6],pxcontactpairpoint:7,buildtriangleadjac:[12,26],e1st_from_front_left:14,deseri:[14,29,13,27,17],nv_use_static_wincrt:3,fetchqueri:11,intermedi:17,setusermemori:14,disc:0,wheelgraphposx:30,skid:30,heurist:27,sciencedirect:30,setswinglimiten:20,disk:19,skinwidth:[23,24],pxclothstretchconfig:14,famili:30,pxvehiclegraph:[14,30],figur:[19,30,21],conjunct:[9,1,7,30,12],targetwheelid:30,pxvisualdebuggerconnectionflag:25,signalquit:14,quaternion:[0,23,20,26,17],understand:[16,18,0,8,29,12,30],task:[12,10,13,23],inertia:[16,30,28,7,26,0,12],impuls:[18,0,16,20,7],decrement:[30,16,17,21,6],cover:[16,17,23,29,27,0,12],eoutput_forc:0,pxrigidactor:[14,13,17,0,1,26],succinctli:23,setreportallocationnam:[25,17],maxraycastsperexecut:14,slope:[18,13],equilibrium:[16,7],"3333f":30,effort:[18,30,7],createident:14,inferior:29,heavili:30,pxpairfilteringmod:12,alpha:30,cooker:[14,26],createshap:[14,23,13,17],tini:[18,7],setrigidbodyflag:[14,16,7],directori:[6,3,5,8,23,24,27],pxconstraintdomin:14,inputdata:19,flexibl:[14,29,16,19],walkabl:10,doesn:[19,30,7,27],plane:[9,10,7,17],pxactorflag:[29,16,25,2],spread:30,eremove_duplicated_triangl:6,physxvisualdebuggersdk:3,acut:12,col:12,getrigidactorshapelocalboundslist:11,outer:[12,30],corpor:15,animateleav:11,debri:[29,26,12],drivetargetveloc:20,mmass:30,normal:[16,12,1,7,11,26,30],enorm_tire_aligning_mo:14,synchron:[19,10,13,12],those:[16,9,4,0,17,21,7,29,25,26,18,27,30,31],querycli:14,would:[14,0,16,1,6,18,19,20,7,29,23,25,12,30],actorcollect:19,subtract:[12,31],brief:[9,10,13],pxclothphasesolverconfig:14,naturalfrequ:30,html:[8,30],gettirelongitudinaldir:[14,30],cmutil:27,spend:30,dumpbinarymetadata:[14,19],pxvehiclesuspensionsweep:30,"__file__":[25,17],interpret:[14,16,0,1,11,26],getnblin:2,hint:[0,11],round:[18,12,1,30],uniform:[23,16,7],spent:[18,30],pxconvexmesh:[19,17,26,30,12],truck:30,euler:20,contain:[0,30,1,3,17,4,19,5,20,21,22,8,29,24,23,11,18,27,28,26,12],aconvexmesh:26,descriptor:[14,18,26,27,23],scanforobstacl:21,nextitemset:7,surfac:[1,13,0,20,7,23,24,26,12],plu:[27,17],nx_dyn_frict_sc:23,extern:[14,16,1,13,18,19,20],submitbatch:14,sound:[18,30],made:[14,17,18,12,20,21,7,8,29,25,11,0,26,30],getphysicsinsertioncallback:[4,12,26,11,6],pxvehiclecomputetireforc:30,licens:[10,13],samplebridgesset:18,tirelongitudinaldir:14,arrai:[14,0,16,1,19,20,7,11,12,26,30],tilt:16,enough:[18,29,30],base:[14,17,16,1,9,18,0,6,7,21,8,29,24,11,27,23,26,30],metr:30,exhaust:[8,30],ebrake_control:14,pxbinaryconvert:[14,19,27],mradiu:30,qanda:30,upper:[14,0,20,30],me485:30,createbvhstructur:11,geomsweptagainst:1,meta:19,inform:[10,17],blocking_sweep:30,experiment:30,mminfilterednormalisedload:30,explod:7,eactor_ax:2,pxarticulationlink:[28,20,11,7,0],"3x3":[23,16],setsimulationeventcallback:25,px_check:[25,27],init:23,first:[16,1,2,17,18,12,20,7,29,23,10,11,27,0,26,13],pxconvexmeshdesc:26,pxvehiclewheelgraphchannel:[14,30],sensibl:30,nxshapedesc:23,puttosleep:[14,16],tirelatforcemag:30,string:[19,17],pxwheelqueryresult:[14,30],delta:[18,16],instal:[19,24,23],length:[14,17,18,1,23,11,26,30],unregist:27,wiki:30,generatemetadata:27,subsystem:[4,21],pxdefaultallocatorcallback:17,newbound:26,materialindic:26,mmateri:16,pxvehicledifferential4wdata:13,vehactor:30,solver:[13,18,12,20,7,8,9,10],edeleted_shape_1:6,outputstream:14,setuptanksimdata:30,groups2:23,below:[14,16,1,6,4,12,5,20,7,21,9,23,25,11,18,0,26,30],endif:[26,21],pxusercontrollerhitreport:[18,23],linearli:30,consum:7,rule:[16,17,18,19,1,21,7,29,25,11,27,30],pxvehiclewheel:[14,30],ebest_poss:30,pxcontactmodifycallback:[23,7],getdof:20,trace:1,track:[16,4,12,20,21,8,18,25,26,30,31],see:[20,7,29,25,11,12,14,17,0,1,23,26,30,2,4,8,16,18,19,6,21,27,28],pxdefaultfileinputdata:[14,19],binarymetadata:19,destructor:[19,0],createtrianglemesh:26,pxbounds3:[26,1,11],bodya:0,pxvehicleconcurrentupdatedata:30,gtreasureactor:29,preclud:30,setparam:26,elock_linear_z:16,mirror:6,getwheelrotationspe:30,distanc:[1,13,18,12,20,7,9,10,26],explor:[5,22],flt_max:0,vert:[12,26],heap:9,center:[16,18,12,1,7,26,0,30],getgrav:14,jump:18,humanoid:20,special:[14,16,1,17,18,19,20,7,8,29,23,32,11,27,0,26,30,15],setmaxangularveloc:12,dominance0:16,dominance1:16,cheap:29,retain:[16,15],thei:[14,17,16,1,6,18,19,20,7,21,25,29,23,0,11,28,30,12],inalloc:27,eignore_initial_overlap:1,them:[14,0,16,32,3,17,18,19,1,21,7,8,29,23,25,11,12,26,30],clear:[14,30,6],namespac:[14,32,27],creation:[13,17,18,12,26,0],getnbanyhit:14,reiniti:17,bounc:[18,0,7,25,26,30],subdivid:[29,23],setrigiddynamiclockflag:16,commerci:18,raycastsingl:14,px_def_bin_metadata_vclass:27,redistribut:10,necessari:[14,17,16,9,18,19,20,21,7,25,29,0,27,12,30,31],arc:0,delet:[14,23,18,12,21,29,25,11,0,30],ispenetr:1,tunnel:[18,24,26,7],epsilon:7,workaround:[12,7,23],getdefaultmateri:[14,26],domin:[14,0,10,13],due:[0,16,18,19,1,7,8,11,12,30,31],pxreal:[14,0,16,1,28,20,7,11,12,26,30],tirefrict:[14,30],euse_swept_bound:14,getgpudispatch:[9,21],cctfiltercb:18,pxcontactstreamiter:[14,7,6],contextid:32,forbidden:25,modular:30,finalizequeri:11,thi:[1,2,17,18,12,20,7,9,10,26,27,3,13],getscenequeryfilterdata:14,arm:0,type:10,nbsolverbodi:16,aforement:18,efront_right_wheel:14,useroverlaphitbuff:14,"0x7fff":12,pxmemzero:20,senderrormessag:14,pxhalfpi:26,pad:30,page:8,amort:[12,10,13],recip:27,split:[9,10,26,13,0],minor:[14,18,17],suspforceappcmoffset:30,pxcontrollerhit:14,getvertic:26,listen:[16,25,7,32],regardless:[14,16,17,18,29,11,26],ehas_drive_force_limit:0,candid:[29,11],trianglecount:1,createcollectionfrombinari:[14,19],unconstrain:0,baseflag:27,wheelcoord:30,findappobjectfromid:19,pxrigiddynamicflag:29,getmaterialindex:23,rise:[29,30],lambda:20,pxmeshqueri:[10,26],gfoundat:[4,9,32],gvehiclescenequerydata:30,wikihow:30,pxconstraintvisu:0,mnbconvex:19,snrepxserializerimpl:27,customtypepropertyinfonam:27,thought:[12,30],disadvantag:30,workload:21,independ:[14,16,9,18,12,21,29,19,30],eno_boundary_edg:26,pxarticualtionlink:28,escal:2,computevelocitydeltafromimpuls:[16,7],experi:[12,30],snowman:16,group4:11,ithread:14,group2:[23,16,11],group1:[23,16,11],shut:[10,13],usergethfshap:26,excess:[18,30],retriev:[16,1,17,18,12,20,22,29,24,26,0],drivable_surfac:30,old:[14,19,24,26,6],mmaxcompress:30,ehandbrake_control:14,bear:[0,30],sweephit:1,desc10row:26,veri:[20,7],releasebatch:14,gauss:[8,10,13],nbwheelqueryresult:14,dereferenc:29,runclang_window:27,bool:[14,16,32,23,4,28,1,29,11,18,27,0,26,30],pullei:0,book:30,interfac:[14,17,25,2,6,4,19,20,0,8,23,24,10,11,27,28,13],yet:[19,23,0],let:[16,23,18,12,7,8,29,26,0,30],setinvmassscale1:16,setinvmassscale0:[14,16],pxsimulationfiltercallback:[29,23],ejoint_limit:0,dyn:[14,16],maxim:10,inflex:29,call:[20,7,29,25,11,12,14,17,0,1,23,26,30,32,4,8,16,18,19,6,21,22,3,24,27,28],addactor:[14,16,23,29,26,11,30],overst:8,latenc:[12,30,21],maxit:1,susplinelength:14,must:[20,7,29,25,11,14,17,0,1,23,15,26,30,32,4,9,16,18,19,6,21,24,27,12],chosen:[0,30,11,7],registerrefer:27,immers:7,setlimitcon:0,usersweephitbuff:14,overcom:[12,16,30],mindist:18,pxcomputetrianglemeshpenetr:[24,1],mnbactor:19,pxjoint:[14,12,16,20,0],pxserializationregistri:[14,19,30,27],setcmasslocalpos:[14,0,30],exportextradata:27,viewer:2,pxboxgeometri:[14,23,16,26],setradiu:18,unbatch:11,rectangl:26,biggest:32,unwant:19,info:[30,1,27],imbu:16,bundl:3,pxparticlebas:14,hierarchi:[26,13,17],camberangleatmaxdroop:30,pxmax:12,pxclothfabr:14,unrealist:[18,12,30],rel:[0,16,19,20,7,9,26,12,30],reporterror:[32,17],set:[20,7,29,25,11,14,17,0,1,23,26,30,32,2,4,9,16,18,19,6,21,3,24,27,12],getnbwheel:14,gettimestamp:26,red:12,exhibit:[9,20],springmodifi:0,fseek:19,mindexbas:26,setsphericaljointflag:0,absent:9,slightli:[14,23,18,12,20,7,29,26,0,30],ang0:0,ang1:0,translatepxbas:27,filterflag:14,union:[0,26],pxstringtabl:19,pxvehiclecomputetireforcedefault:14,gui:32,snowbal:[18,16],took:12,tool:[8,24,2,27],quantiti:[0,1],bare:12,stack:7,skip:[18,19,1,11,6],aconvexshap:26,euser_releas:14,stop:[18,0,7,23,26,30],pxcreatejointconstraintswithshad:16,troublesom:18,last:[14,16,6,18,28,1,7,24,11,30],pxsceneflag:[14,9,7,13,12],tune:10,said:[29,16,30,26,19],degtorad:18,embp:29,pedal:30,conform:[30,21],hasnextpatch:7,greater:[18,28,1,7,26,30],setvisualizationcullingbox:2,invmassscal:[0,16,7],tempbuffercapac:9,known:[17,18,1,7,23,30],loss:[15,30],pxqueryfilterdata:[14,11,6],tank:13,nx_adaptive_forc:23,preset:10,solid:[16,1],egraph_type_wheel:14,"32bit":26,becaus:[17,16,25,3,9,18,19,20,21,7,8,29,24,0,23,12,26,30],mwheelssimdata:[14,30],removebroadphaseregion:29,pxdiagon:[23,16],fricpair:14,eprevent_climb:18,prematur:19,prop:27,meant:22,getbroadphaseregion:29,regard:[32,21],updirect:[18,30],plan:32,plai:[18,29,16,30,0],heapcapac:9,taken:[16,18,0,21,9,30],overlap:[10,7],treasureshap:29,scene:[20,7,10,17],mycontactmodif:7,pointrejectangl:30,directx:17,suitabl:[8,19,20],backfac:1,linearcapsulesweep:23,albeit:30,"function":[17,18,12,20,7,10,26,27],setupdirect:18,outstream:19,normalisedtireload:30,shape:[10,17],either:[20,7,29,25,11,12,14,6,0,1,23,26,30,32,4,8,16,18,19,21,24,3],getslopelimit:18,snippetgpuloadhook:4,low:[16,9,12,20,29,26,30],eshift_vertic:26,e6th_from_front_left:14,loop:[12,20,13,10],suffer:[18,29,31,0,12],has16bitindic:26,loos:[12,26],embed:27,driver:30,isvehicleinair:14,eresolve_contact:14,vc14win64:[3,5,24],createobject:27,copyinternalstatetocach:20,quickli:[29,11,13,12],frequenc:[12,16,25,30],usegeodesicteth:14,swing:[0,20,30],"return":[14,0,16,1,6,4,28,17,7,21,29,23,25,11,18,27,12,26,30,32],obtain:[17,16,1,6,4,19,20,21,0,26,12],make:[20,7,29,25,11,12,14,17,0,1,23,26,30,32,2,4,8,9,16,18,19,6,21,27,28],physxfoundation_:[3,24],driven:[18,0,20,30],look:[23,18,12,22,29,25,11,27,0,28,30],standard:[17,0,20,21,25,30],camber:30,tangentialstiff:14,specifi:[14,0,16,1,3,17,4,19,20,7,29,23,25,11,27,12,26,30],snippetcustomjoint:0,spg:30,pxvisualdebuggerflag:14,computepenetr:1,ehol:26,save:[14,16,23,4,12,29,11,27,19,26,30],rather:[14,0,1,3,6,18,28,20,22,9,25,11,12,30],discuss:[13,18,12,7,29,10,26,0],comput:[20,7,10],doblock:14,internalfaceindex0:7,internalfaceindex1:7,load:[14,16,13,19,23,10,26,12],convexdescpolygon:26,github:8,enon:[0,13],endors:15,pxcctnonwalkablemod:14,interpenetr:7,major:[8,9,26,17],estim:[0,20],vital:[8,30],continu:[10,17],toggl:[3,30,32],individu:[14,16,1,2,12,20,7,22,0,30],consult:[29,30],eeasi:18,getbroadphasecap:29,influenc:[14,16,30,7,32],eremove_unreferenced_vertic:6,groupcollisionflag:23,concept:[8,16,10,13],addforceatpo:[23,16],setlocalpos:[14,12,30,26],guid:10,nbrow:[12,26],setcontactmodifycallback:7,setsuspforceapppointoffset:30,nxcollisiongroup:23,previou:[14,16,6,19,7,29,24,11,12,30],spec:12,screenshot:[18,5],ellips:20,heightscal:[12,26],fifth:30,fifti:28,echannel_tire_lat_slip:[14,30],arrang:[14,26,21,22],about:[13,17,18,12,20,7,10,26,2],getactiveactor:16,externalrefer:0,gettaskmanag:25,eanalog_input__accel:30,high:[16,9,0,7,8,29,11,3,30],bufferoverflowoccur:1,convexbox:29,angl:[18,0,20,30],actor0:[14,0],pxmetadataflag:27,orthogon:[0,26],translat:[16,20,19,1,23,0,12,31],valid:[14,16,18,12,1,29,11,19,26,30],pxvec3:[14,16,1,17,18,12,20,21,7,9,23,11,0,26,30,31],less:[0,16,30,18,19,1,21,7,9,26,12,22],scope:[8,18],artifact:[12,30,0],createcloth:14,chassismass:30,setchildpos:20,eremoved_actor_0:[21,6],eremoved_actor_1:6,estatic_aabb_tre:13,smp:12,variant:[16,26,27],accord:[16,0,21,7,8,11],given:[14,0,16,1,9,19,20,21,7,29,25,11,12,26,30],author:[4,19,30],throughput:21,pathto30repxfil:19,dispar:1,isreleas:6,nbactor:29,pxdebuglin:2,etire_frict:14,judici:30,usererrorcallback:17,physxcooking_:3,without:[14,0,6,4,19,5,20,7,21,8,29,24,11,18,27,12,26,30,15],interchang:[26,6],fetchresult:[0,16,13,28,7,23,10,11,12],longer:[14,16,6,4,19,23,7,29,18,0,30],nbvert:26,getnbcolumn:26,categoris:30,myfiltershad:30,othershap:29,etireload:14,has16bittriangleindic:26,pxbatchquerydesc:[14,30,11],setprismaticjointflag:0,pxobstaclecontext:[14,18],edisable_active_edges_precomput:[12,26],eposit:[14,1,11],nbcontactpair:25,onpvddisconnect:32,erestitut:0,capac:[9,25,12],keep:[17,16,30,6,18,19,20,7,21,9,11,0,26,22,31],sm3:9,sourc:[17,12,5,23,19,29,15,26,27,0,30],heightfieldgeom:1,ps3:23,flippedcontact:7,move:[12,10,7],ruler:30,intempbuff:27,illustr:[14,16,32,18,19,5,1,25,26,30],gettirelateraldir:[14,30],pxcreatepvd:[4,32],construct:[14,16,17,0,20,26,30],exercis:30,enotify_touch_ccd:7,enumtyp:27,fast:[18,3,7,23,24,26,11,30],minimpuls:0,partak:0,shadow:1,"short":[12,30],black:12,depenetr:[12,1],configur:[12,20,7,10,17],setvisualizationparamet:[0,2],joint:[10,7],localframe1:0,localframe0:0,produc:[14,0,16,1,30,17,4,19,20,7,9,24,11,18,27,28,26,12],avoid:[32,6,4,0,20,7,21,18,25,11,27,30,31],anyth:[16,11],through:[12,20,7,17],propag:[30,20,31],articul:[10,7],posit:[10,7,17],addaggreg:29,bodybtoworld:0,localscaledvertex:12,indexbuff:26,instuct:26,stick:[18,30],convexvert:26,stale:11,strategi:[18,12,26,11,30],mem:14,nxcontactpairflag:23,second:[0,16,30,23,18,19,5,1,21,7,29,28,12],simul:[20,7,10,17],handl:[14,16,32,23,18,20,21,29,24,26,30],unoptim:12,reconnect:13,aren:7,debug:[10,17],createconvexmeshsaf:26,scenequeriesupd:11,"05_01":30,etank_wheel_7th_from_front_left:14,esuppress_eager_scene_query_refit:24,overwrit:21,gscene:[9,25,11],temporari:[9,25,11,7,12],euv:[1,11],pxvehiclesetsweephitrejectionangl:30,setvisualdebuggerflag:14,weaker:30,sluggish:13,huge:26,kind:[14,23,18,0,1,7,29,28,30],getfootposit:18,elev:[18,30],getconjug:20,meshdesc:[14,26],primari:[19,5,20,7,25,3],pxboxcontrol:18,extradatastream:7,overlapaabbshap:23,twistdriv:0,queue:[12,21],gain:[30,11],setprojectionlineartoler:0,sudden:0,px_serial_object_id_invalid:19,therefor:[0,16,32,9,19,20,7,29,23,25,11,27,12,26,30],match:[14,19,30],pxvehicleantirollbar:13,process:[14,17,16,1,13,9,28,20,7,29,0,26,27,12],droop:30,unexpect:12,pxsimulationfiltershad:29,compos:[8,17],abov:[16,1,9,18,12,6,7,21,29,23,11,27,0,26,30,15],pxactivetransform:24,physx_:3,flavor:[16,30],anywai:18,compon:[16,13,18,12,25,9,10,0],imag:5,gettirefrict:[14,30],myvehicl:30,concis:5,north:16,emax_distance_en:0,pxshapeext:[14,26,6],expand:[18,30,26],pxsolverconstraintprepdesc:16,nbcontact:7,differ:10,counter:[14,16,30],pxconvexmeshcookingresult:26,concaten:26,size:[14,0,30,2,17,4,19,1,7,9,23,25,11,18,27,28,26,12,32],consist:[14,0,1,17,4,19,20,21,23,26,12,30],customclasstyp:27,customtypemetadataobject:27,contactcach:16,nbheader:16,slowli:[18,19,7,13],pxcreatebasephys:[4,26],aspect:[12,25,26,21,27],freedom:[0,16,20,30],classifi:19,snippethellogrb:9,detail:[7,29,25,11,12,14,6,0,1,23,26,31,13,4,8,16,19,17,21,22,27,28],frictionpair:30,px1d:0,opposit:[30,1],glmultmatrixf:17,rendermateri:[26,21],bias:[0,30],dealloc:[19,25,11,27,17],how:[14,0,16,1,13,9,18,6,20,7,8,29,23,26,27,28,12],hardwar:[12,21],gettyp:[14,26],newton:30,setgroup:23,getwheelgraph:30,content:[13,4,19,5,21,18,10,12],polyhedron:26,pxplanegeometri:26,navig:[18,30],etank_wheel_front_left:14,nonwalkablemod:18,isparticlebas:6,ratio:[0,16,20,30],muscl:0,setphasesolverconfig:14,nonetheless:[8,18],x86_32:3,pxcontactpair:[14,7,6],glpushmatrix:17,getcurrentgear:30,target:[14,0,16,4,19,20,7,3,25,26,27,28,30],constantblocks:[14,29,30,7,23],mmaxomega:30,averag:29,esap:[29,24],twitchi:[0,30],self:[29,25],unavoid:30,settim:23,equantize_input:26,law:[8,0,16,30],elast:16,kinemat:[20,7,10],pxprismaticjointflag:0,door:[0,7],eacceler:0,meshactor:29,margin:[30,1],pxpvdtransport:[4,32],getstepoffset:18,doom:18,exact:[16,1,23,18,19,20,7,9,11,12,30],formerli:[14,23],setdominancegroup:16,pxarticul:[14,28,20],nxuserraycastreport:23,wait:[14,25,21],physxapi:[8,25],fetchcollis:25,vehwheelqueryresult:14,mfrontwidth:30,destroi:[14,17,4,0,20,8,29],cumbersom:31,small:[0,16,18,28,1,21,7,26,12,30],trip:30,unlockwrit:21,pxdefaultfileoutputstream:[14,19],pxbaseflag:27,thread:[10,7,17],year:18,wheelsimfilterdata:30,pxtransform:[14,0,16,1,17,12,20,7,29,23,11,19,26,30],pxbroadphasecap:29,collectiona:19,issu:[10,7,17],collectionb:19,enormalized_tireload:14,diagon:[23,16,26],computegraphchannel:30,detect:10,ediff_type_ls_frontwd:30,sooner:7,highli:[8,0],ggpuloadhook:4,pxmeshgeometryflag:[1,26],pxrepxinstantiationarg:27,removeobstacl:18,copyright:15,fail:[0,16,4,19,21,7,18,26,12,30],orient:[1,0,20,8,26,30],princip:16,dat:19,contrast:20,fragil:25,impleament:32,easili:[14,16,6,18,9,24,30],worri:7,otheractor:7,fundament:[29,23,0,28],newer:[14,19,24,6],merg:[14,24,13],algorithm:[10,7],had:[18,23,24,30,6],vcurrent:0,raycasthitbuffers:14,asset:[19,16],reserv:[17,19,7,23,15,11],pxsetasserthandl:17,problem:[18,12,21,7,25,0,30,31],staticstructur:11,pxtriangl:1,under:[14,13,3,6,18,12,7,8,9,0],emax_nb_wheel_channel:14,bulk:4,chain:[0,20,21,19,29,12],tireforceshaderfn:30,shrunken:12,gravitydirect:30,anew:16,querymemori:30,readi:[16,30,21],ppu:14,approx:30,win64:5,getgeometri:[23,26],unnam:19,demand:21,rigidbodymass:30,pxgeometrytyp:26,likelihood:[30,7],sqdesc:30,specif:[14,17,16,1,13,3,6,18,12,20,29,15,11,0],eprecise_sweep:[14,1],esuspforc:14,lower:[0,20,26,7,30],compil:[14,3,6,12,5,17,29,24,19],wheelqueryresult:[14,30],autom:30,appropri:[17,0,21,7,12,30],snippetvehiclenodr:13,empti:[14,16,6,18,29,27],bound:7,pxcreatecontrollermanag:18,power:[30,20],pyramid:[18,26],threshold:[0,16,20,7,30],pxvehiclegearsdata:13,place:[14,0,16,13,3,17,18,12,5,1,7,8,29,19],drivesimdata:30,stai:[8,29,16,23],mytelemetrydata:30,awai:[16,18,0,7,26,31],upright:[16,26],trigonometri:30,socket:[0,20,32],word3:[30,11],word2:[30,11],word1:[29,30,11],pxcapsuleobstacl:18,populateregion:29,recreat:[19,20,12],latter:[19,30,12],paper:7,accident:22,sweepdirect:11,interlud:30,edrive_model_standard:14,migrat:[8,10,13],materialindex:23,minvelocityit:16,keyhol:7,immov:0,hope:18,setfootposit:18,straightforward:[8,30,31],pxconstraint:[14,0,16],getnbwheelgraph:14,ediff_type_open_4wd:30,vast:2,analysi:[3,30],setcontactoffset:[12,7,23],setanaloginput:30,pxcontrollerbehaviorcallback:[18,23],prioriti:[0,21,23],pxinitextens:[4,23],hullpolygon:26,subsect:16,processcallback:25,brake:[0,30],focusvehicl:30,elarge_triangl:26,"50th":12,rpm:30,ecct_can_ride_on_object:18,multipli:[18,12,16,30,17],categor:12,obstaclehandl:18,samplenorthpolebuild:26,nbpositioniter:16,settorestst:30,vmotion:0,es16_tm:[12,26],px_profile_zon:24,idiom:17,edeleted_shape_trigg:6,mplane:26,gettirelatslip:14,geometricerror:0,setdriveposit:0,analyz:[16,25],procedur:[19,30],recommend:[17,32,3,6,4,12,20,7,29,24,23,11,18,0,26,30,31],"20kg":30,mtd:[12,1],suggest:[30,26],dampingratio:30,setgearup:30,over:[12,7],releaseobject:[19,6],woken:[14,16],cb2ca:20,drivetorqu:30,quicker:30,pxjointlimitcon:0,pxvehicledrive4wcontrol:14,guarante:[16,1,30,19,20,21,7,11,28,26,12],mbia:30,samplesm:26,abcdabcdabcdabcd:12,gkeysmoothingdata:30,own:[17,16,32,3,6,18,19,20,7,21,8,29,23,25,0,30],dir:30,pxscenequeryext:6,additionali:12,adapt:[23,27],subordin:27,e4th_from_front_right:14,slider:0,backward:[14,30,1,17],overlapshap:11,serializationregistri:30,getworldbound:[14,1],intervent:18,radial:16,enumer:[14,17,0,29,27,30],radian:[0,20,30],pxtolerancesscal:[32,17,4,7,24,26,30],recov:[18,19,12],approach:[0,16,6,18,19,20,7,29,23,25,26,12,30],emax_num_engine_channel:[14,30],rendersphereactor:26,fiber:14,clientid:14,mfrontbia:30,eanalog_input__handbrak:30,halfext:14,vehicledrivablesurfacetotirefrictionpair:30,background:[19,21],pxcontrollerfilt:18,simulationeventcallback:25,samples1:26,trivial:[29,23],streamlin:24,mcmoffset:30,pxvehicleclutchdata:13,createbatch:14,efront_left_wheel:14,compress:[14,30,26,7],act:[14,16,32,0,7,30],free:[4,12,5,20,18,25,0,30],pxu8:[26,27],nxshape:23,pxobserv:14,erigid_dynam:[29,19],fastest:26,hovercraft:30,isotrop:0,tri:[12,1,26,32,0],pxcreatecontactconstraint:16,maxedgelength:18,addlocalforceatlocalpo:16,msubmarineactor:[29,7],setgeardown:30,deliv:30,meet:[12,1,21,30],eenable_stabil:12,halfsideext:18,hyperthread:29,rendercapsuleactor:26,simd:[8,16,26],pxbroadphaseext:29,invalid:[3,10],degrad:[29,11],gpadsmoothingdata:30,multipl:[14,0,16,1,13,17,28,20,7,8,29,23,10,26,12],isrigidactor:6,theorem:7,placement:[12,30],snippetloadcollect:13,rigidactor:17,surfacetirepair:30,proport:[0,20,7,8,9,30],theoret:30,shdfnd:14,numcol:26,risk:7,etank_wheel_8th_from_front_left:14,emax_nb_drive4w_analog_input:14,patch:[19,16,30,21,9],evalu:26,pxmateri:[16,17,19,29,26,30],freed:[30,25],five:30,maintain:[16,17,18,19,20,8,11,0,30],pxrenderbuff:2,pxcreatephys:[32,17,4,6,23,25,26],tankdrivesimdata:30,espr:0,rebuilt:11,"60fp":12,oscil:[0,30],elaps:[18,28,23,12],numprimsperleaf:24,braketorqu:30,expens:[1,17,0,20,7,25,11,12,26,30,31],createpruningstructur:11,pxarticulationcach:10,"27m":12,raycastmultipl:14,einternal_error:17,registerdeletionlisten:14,pxcookingparam:[6,4,12,9,24,26],posetosweep:1,aboxactor:26,graviti:[14,13,17,18,20,7,9,10],expend:30,done:[18,28,20,21,22,29,11,27,19,26,12],pxcollectionext:[14,19,6],shorter:30,pxplaneequationfromtransform:26,visual:[10,7,17],reconcil:21,heavi:[9,16,30,0],big:[18,19,31],recipgrav:30,bia:30,gcook:[23,11],nxinitcook:23,period:[30,16,20],echannel_tire_long_slip:[14,30],highlight:[14,24,6],setmassandpreservenaturalfrequ:30,bit:[16,3,17,18,12,29,25,11,23,26,30],flag:[20,7],essenti:[23,30],balloon:23,paragraph:11,shorten:11,clutch:13,tighter:[12,30,26],contrari:[18,7],nx_physics_sdk_vers:23,torsion:30,certain:[14,16,17,4,12,20,7,22,29,0,30,31],app:[28,30,3],trajectori:12,pxfilterdata:[14,29,23,7,13],unnotic:0,pxfindoverlaptrianglemeshutil:[14,6],arcad:7,echeck_zero_area_triangl:26,othercompletiontask:25,medium:21,probabl:[18,29,30,28],worldvertex:12,spheregeom:1,broadli:11,offer:[16,2,23,19,5,20,22,29,25,11,12,30,31],prompt:[3,5,24],defin:[1,2,17,18,20,8,9,10,26,27,13],unifi:[14,12,24,6],shiftorigin:31,accur:[14,16,1,18,0,20,7,8,30],raycasttouchbuffers:[14,30],getobject:19,natur:[18,0,25,30],edeleted_shape_oth:6,time:[12,20,7,10,17],swim:7,pxvehicledifferentialnw:30,flatter:30,cpudispatch:[9,13],wheelshap:30,worst:11,itself:[32,18,7,23,25,30],acycl:0,bottl:18,were:[14,16,1,23,19,20,21,7,29,25,11,27,28,26,30],"\u03c0":0,rework:[14,30],ecommit_disabled_build_dis:11,seek_end:19,setclutchdata:30,setcontinu:[25,21],esuppress:[29,30,23],pxd6jointdriv:0,mathemat:[0,30],label:30,depth:10,getfaceindex0:7,scroll:30,pxvisualdebugg:[14,6],pxphysx:14,shareabl:[14,19,6],graphsizex:30,pxvehiclesetmaxhitactoracceler:30,setrevolutejointflag:0,buffer:7,pxfixedsizelookupt:30,bitmask:11,"_build":8,design:[13,17,12,20,8,3,10,0],postfiltershad:11,uncontrol:30,repxcollect:14,pxqueryflag:[14,13],nbtri:26,px_def_bin_metadata_extra_item:27,peculiar:16,static_cast:[19,16,20,11],breakabl:[0,20],onpvdconnect:32,setmaxdist:0,getsimulationstatist:22,actorstream:19,flat:30,getnumwheelgraph:14,schedul:[12,30,21,23],pxprocesspxbasecallback:[23,27],quantiz:10,mfronteftrightsplit:30,forinternalus:0,eanalog_input_accel:[14,30],pxd6motion:0,substep:[12,23,10,7,13],meantim:14,etank_wheel_4th_from_front_left:14,rudimentari:8,web:30,constant:[16,18,0,29,11,28,30],throttl:30,angvel:16,typenam:[25,17],initialpos:11,pxdefaultpvdsockettransportcr:[4,32],opt:30,mdampingratefullthrottl:30,setanalogbrak:30,edit:30,pxbvh33midphasedesc:[12,24,26,6],px_serial_align:27,pxsweepcallback:[14,11],current:[7,29,25,11,12,17,3,20,26,30,32,4,8,9,28,16,18,19,21,22,27,0],tall:7,graphsizei:30,code:[20,5,7,29,11,14,17,28,1,23,15,26,13,8,9,10,16,18,12,3,24,27,0],unpos:30,pxoverlapcallback:[14,11],pxjointlinearlimit:[14,0],main:[14,1,6,18,19,20,21,8,29,25,11,28,30],limb:[18,7],restoffset:7,align:[17,18,12,1,19,29,23,25,0,30],maxjumpheight:18,arbitrari:[23,18,1,7,29,11,30],vari:[12,1,30,0],need:[7,29,25,11,12,14,6,0,1,23,26,30,32,4,8,16,18,19,17,21,24,27,3],pxvehicleupd:[14,30],jointforc:20,geom:[30,1,26],pxheightfielddesc:[12,26,6],pxgpuloadhook:[24,13],trgwheel:30,share:[20,17],osx:[5,27],pxcreatedynam:19,massless:16,www:30,hasblock:14,sample_fre:29,pxrigiddynamiclockflag:16,px_new_repx_seri:27,lock:[20,13,10],templat:[23,27,6],subject:30,wheelmoi:30,eassume_no_initial_overlap:[14,1],treasur:29,perhap:[12,30,6],drivabl:13,liberti:30,myscen:30,uservalue0:12,spin:7,pxspatiallocationcallback:23,format:[14,17,19,5,7,25,26,12,30],physxvehicl:[14,3,27],chassisshap:30,around:[0,16,1,18,12,20,8,29,26,19,30],incompat:[19,1],pxvehicleautoboxdata:13,px_max_real:16,lighten:12,chunk:[29,19],nx_default_sleep_energi:23,ongo:25,edeleted_actor_1:6,edeleted_actor_0:6,prevent:[17,18,12,21,7,25,11,0,30],solverprep:0,maxhit:[14,1],mminestoexplod:7,snippetvehiclecontactmod:30,subclass:[4,21,7,23,25,11],sequenc:[0,16,13,17,28,10,12],necessarili:[18,7],lighter:[0,7],chassisdata:30,writeallproperti:27,parentpos:20,represent:[17,19,8,26,0,30],validateconvexmesh:26,resweep:7,const_cast:30,pxvehicledrivablesurfacetotirefrictionpair:[14,30],queu:[11,21],sleep:10,pxvehicle4wenable3wtadpolemod:[14,30],gimbal:[0,20],nxuserentityreport:23,"\u215b":24,mine:[29,7],pxcapsuleclimbingmod:18,satur:30,idea:[12,30],inwrit:27,roll:[18,13],when:[20,7,17],applydamag:11,pxbase:[14,0,17,19,6,27,28],abl:[14,0,9,16,1,6,4,19,20,7,21,29,24,25,26,18,12,30],pxphysicsinsertioncallback:[4,12,6],writer:21,easiest:16,signal:[14,25],tend:[18,19],projecttoa:0,getvisualdebugg:14,updateb:25,pxquerycach:[14,11,6],corner:[18,12,1,13],graph:[0,25,30,17],updatez:25,border:26,eanalog_input_brake_left:14,pvddatastream:32,alon:[8,12,25],intuit:[0,30,11],divert:30,belong:29,connector:0,getnb:27,pxscenequeryhit:[14,6],compat:[14,3,17,19,1,9,12,30],ecct_slid:18,achiev:[14,0,16,9,4,19,23,21,7,29,26,27,12,30],convexgeom:29,physxcor:17,phors22:30,etank_wheel_4th_from_front_right:14,increas:[16,32,18,12,21,7,0,30],diverg:12,assumpt:[30,7],strip:[4,26],pxvisualizationparamet:[0,24,2],etank_wheel_7th_from_front_right:14,px1dconstraintflag:0,einternal_has_impuls:7,answer:32,gear:13,inrigiddata:16,other:[20,5,7,29,11,13,14,17,0,1,23,15,26,2,8,9,10,16,18,12,6,27,3],squar:[16,30,26],setbreakforc:0,addforceatlocalpo:16,physx_64:24,sprungmasscoordin:30,sit:30,pxqueryfilt:14,stepoffset:18,caterpillar:30,edeprecated_32_compat:6,ihit:14,setglobalpos:16,sim:[10,13],fopen:19,afterthought:23,region:[10,13,1],ascen:26,gentli:30,weight:30,lowest:[30,21],getphysxgpudllnam:[4,24],copi:[14,19,20,7,22,29,25,11,0,26,30],worthi:25,launch:[14,5,32],worker:[12,16,25,21,17],pxtriggerpair:29,seri:[30,27],vice:[29,12],"boolean":[23,1,11,21,30],pose2:1,platform:[16,17,18,12,5,23,24,26,27,3],actor:[12,20,7,10,17],wasn:23,ident:[14,16,17,0,1,29,26,19,30],vehicle18w:30,slice:26,getdistancelimit:24,combin:[16,1,23,18,0,20,7,22,9,11,27,3,30],outcom:16,onwak:[16,25,21],nxutillib:13,setdigitalsteerright:30,critic:[19,30,21],formal:[14,19,30],bypass:[29,16,11],mphysic:[4,14,16,21],createmateri:[19,16],createlink:20,refus:13,getrenderbuff:[23,2],veh4wactor:30,pxcontactrecord:16,pxscenequerycach:[14,6],symbol:23,microsoft:[3,24],shaperenderactor:26,uncaught:12,reli:[4,23,25,27,30],tireid:30,leav:[18,19,21,7,11,30],quit:[14,16,18,12,7,30],gen:6,account:[16,18,12,20,21,7,9,25,30],filetoobject:27,port:[18,23,32],lead:[16,18,19,20,21,7,8,11,12,30],leaf:26,eprefilt:13,config:[14,24],soon:[18,29,1],pxvehicledrivesimdatanw:30,lean:30,settireforceapppointoffset:30,etank_wheel_2nd_from_front_left:14,getphys:[14,29,26],indirectli:[16,21],moment:[0,16,30,7],"float":[17,18,12,7,23,26,0,31],px_unus:30,etank_wheel_5th_from_front_right:14,ebvh34:[12,26],pairhead:[25,7],snippetseri:13,wherea:16,typic:[16,17,18,19,20,21,7,29,25,11,0,26,30],markdirti:0,latest:21,rigid:[20,7,10,17],linkpos:20,four:[3,1,21,30],purpos:[2,17,18,19,1,8,29,15,0,30],mytrimesh:26,updatemassandinertia:[14,16,20,26],pxd6axi:0,startpos:30,wherev:20,shoulder:0,pole:16,human:[19,28],entrant:21,pure:[0,30],quarter:[26,7],spike:[12,22],nxcapsuleshapedesc:23,telemetrydata:30,poli:16,behavior:[10,17],releaseexclusiveshap:6,pxvehiclewheelqueryresult:[14,30],numer:[18,12,1,30,23],tell:[29,26,11,7,30],num_vehicl:30,guess:30,perpendicular:20,point:[2,17,18,12,20,7],vertexlimit:26,uservalue1:12,samplebridg:18,also:[20,7,29,25,11,12,14,17,0,1,23,26,30,31,32,8,28,16,18,19,21,27,3],respons:[14,16,17,18,19,7,9,11,27,0,26,30,31],densiti:[23,16,26,11,30],getmaxnbcontactdatablocksus:[12,25],primit:[26,2],doe:[14,17,9,16,2,6,18,19,20,25,7,21,8,29,24,0,11,12,26,13],blockdist:11,pxdefaultsimulationfiltershad:[29,23,9],respond:30,sinc:[14,0,16,32,2,6,18,19,17,7,21,8,23,25,11,28,26,12],quantizedcount:26,prismat:[20,13,10],begun:21,etransmit_scenequeri:32,sink:[26,7],eanalog_input__brak:30,meshgeom:1,sent:[14,32,18,19,21,7,25],damag:[12,15,30,7],tradit:7,collectforexport:14,dispos:4,sens:[17,19,1,7,25,0,30],term:[12,20,7,26,0,30],gaug:30,pxcach:16,select:[18,12,1,7,24,11,30],send:[14,16,32,6,12,25],mdampingr:30,nx_sta_frict_sc:23,factor:[16,2,18,12,23,11,0,26,30],maccuraci:30,trigeom:29,referenc:[19,23,17,21,28],damp:[14,16,0,20,23,30],simplist:[29,25,30],ith:26,deltaheight:12,pxmat44:[14,17],wake:[14,0,16,25,24],actorinputdata:19,setdistancelimit:[0,24],pxdominancegrouppair:[14,16],geomtosweep:1,pxagain:11,setrunonspu:14,favor:[23,20,21],triangleindex:26,contract:15,member:[16,17,12,1,7,22,29,11,27],d3dmatrix:17,numchassismesh:30,convexcollect:19,pxarticulationjointdrivetyp:20,incorrect:[0,16,1,11,3],befor:[14,0,16,1,6,18,19,20,7,21,29,23,25,11,27,12,26,30,32],transport:[4,32],setinvmassscal:[14,12],often:[16,32,17,18,12,20,21,7,29,25,11,0,26,30],createarticul:20,repx:10,entir:[0,16,3,23,19,20,21,7,9,11,12,26,30],hadinitialoverlap:[14,1],angular:[16,12,20,7,0,30],capsul:12,bodi:[20,7,10,17],solverconstraintdesc:16,edist:[24,11,6],programm:21,deviat:30,einflation_incremental_hul:[24,6],"33m":12,tort:15,order:[14,17,16,1,6,4,19,20,7,21,25,29,24,0,11,18,27,12,26,23],activeactor:16,incorpor:0,erear_right_wheel:14,forcestreamcapac:9,dcc:19,setanalogaccel:30,profit:15,isol:19,ereport:6,ediff_type_open_frontwd:30,mostli:30,sizeof:[23,12,20,29,11,27,26],job:[25,21],profil:17,stuck:[18,12,30],smoother:[30,7],edrivetank:14,"true":[14,0,16,1,3,9,4,19,20,21,7,29,23,25,11,18,28,26,30,32],cell:[26,11],anymor:[14,16,18,19,24,11],correspond:[14,17,1,2,3,6,4,19,20,21,23,9,24,0,11,27,28,26,30,31],pxu16:[0,30,26],everi:[16,32,17,4,19,20,7,22,8,0,30],clang:[24,27],camera:[5,30],setanaloghandbrak:30,worth:30,sourcewheelid:30,hasnextcontact:7,createvehicleactor6w:30,maxnbobjectsperregion:24,"5x5":26,pxdeserializationcontext:27,pxtrianglemeshcookingresult:26,"transient":[9,30],wheelwidth:30,exchang:23,setmaxdepenetrationveloc:12,hfdesc:26,kei:[25,23,20,30],suppress:[14,0,16,1,11],state:[14,13,17,18,12,20,7,8,9,23,0,31],pxvec2:17,createreducedcoordinatearticul:20,oppos:[4,19,24,0],invok:[18,23,30,11],dead:0,been:[14,0,16,1,30,6,4,19,20,7,21,25,29,24,23,11,18,12,26,22],statu:[26,1,11,21],emax_num_drivetank_analog_input:14,uniqu:[18,19,30,27,6],met:[0,15,21],enotify_touch_persist:24,getnbtriangl:26,elimits_are_forc:0,pxvehiclemodifywheelcontact:30,view:[32,2,18,5,21,8,30],pushback:14,gtirefrictionmultipli:30,mdampingratezerothrottleclutchdisengag:30,decoupl:[12,30,23],econstrain:18,mind:[0,30,22],relev:[23,16,30],physxgpu:[4,3],pxconstraintconnector:[0,23],disablewheel:30,penalti:[1,26],addclient:32,distinguish:[14,11,21],pxconvexflag:[9,26,12],fewer:[11,7],divid:[14,30,26],pxphysic:[14,16,32,6,4,19,17,7,29,23,11,0,26,30],got:[24,21,22],pxtransformfromplaneequ:26,contactdist:[0,16,20,7,17],consol:[19,21,17],deepli:[12,16],meshpos:1,why:18,eanalog_input_steer_left:[14,30],pxjointlimitpair:0,dst:7,goe:7,wheelqryfilterdata:30,e8th_from_front_left:14,pxarticulationaxi:20,word0:[14,29,30,11,23],findoverlaptrianglemesh:1,onadv:6,reduc:[10,7],record:[14,16,32,2,19,20,8,25,11,0,30],numwheel:[14,30],area:[2,18,21,26,11,31],mupratio:30,pxcontactmodifypair:7,abp:[29,9],framework:[28,5],singl:[14,17,1,13,29,0,6,5,20,7,9,23,10,26,12],triindic:26,pxvehiclepadsmoothingdata:30,instantan:[16,21],maximum:[14,16,1,3,9,18,12,20,7,29,23,25,11,0,26,30],deltaangularveloc:[16,7],fetch:[23,22],dotouch:14,wheelcount:14,eggert:30,createrenderobjectfromshap:26,susplinedir:14,modif:10,emine_link:29,welcom:[10,13],similarli:[14,1,23,18,0,20,21,7,29,32,11,30],pre:[14,17,19,23,24,25,11,27,26],render:[2,17,18,12,5,10,26,13],reason:[0,16,17,4,19,20,21,7,18,23,25,12,30],output:[32,3,17,4,0,5,1,29,24,26,27,19,31],reinforc:8,angular1:0,pxvehicledrivedyndata:30,getconstraintflag:0,pxraycastbuffern:14,onccdcontactmodifi:7,vulner:11,eno_block:[14,13],snippethelloworld:10,ediff_type_ls_4wd:30,gettetheranchor:14,intel:17,nxuseralloc:23,physxgpu_x64_renam:4,pxvehicle4wenable3wdeltamod:[14,30],integ:[12,26,17],older:[14,19,24,6],pxclientbehaviorbit:14,longitudin:30,logic:[23,12,17,29,25,11],where:[20,7,29,25,11,12,14,17,0,1,23,26,30,31,2,8,16,18,19,21,27,3],poor:[18,30,7],color:30,px_force_inlin:[14,30],ejoint_fram:0,filtergroup:29,insert:[10,7],submittask:21,ediff_type_open_rearwd:30,establish:19,d3dxmatrix:17,car:[18,7],off:[16,18,0,7,29,26,3,30],transform:[14,0,16,1,17,12,20,9,24,23,11,28,26,30],repxserializerimpl:27,px_call_conv:30,transfer:32,destin:19,myccdcontactmodif:7,xmlmemoryalloc:27,fix:[13,18,12,20,9,10,26],tireforceshaderdata:30,oper:[14,32,17,19,20,21,7,29,11,12,26,30,31],gettireloadfilterdata:30,linkag:30,etank_wheel_6th_from_front_left:14,pxscenequeryhittyp:14,etank_wheel_9th_from_front_right:14,turbin:16,emerg:[30,1],setdigitalbrak:30,fit:[18,15,1,11],outputflag:11,getpolygondata:26,oncontrollerhit:18,mnumratio:30,exploit:30,mediat:30,ensur:[14,17,9,16,1,6,25,19,20,7,21,29,0,26,12,30],nextcontact:7,phor:30,open:[19,30],overlai:2,meter:[16,30,17],reorgan:14,surround:[18,30],"default":[20,7,29,11,12,14,17,3,1,23,26,30,32,4,8,9,16,18,19,6,21,24,27,0],snapshot:[19,20,30],untest:1,mpeaktorqu:30,yangl:20,inher:30,incident:15,axl:30,pxvehicleackermanngeometrydata:13,esteer_right_control:14,semant:[14,7,13],repxidtorepxobjectmap:14,treat:[16,0,1,7,11,30],meaning:[30,1],pxps3config:14,slippag:30,two:[14,0,16,1,9,18,19,20,21,7,8,29,23,25,11,27,12,26,30],properli:[16,18,12,22,29,11,19,30],finish:[28,25,11,21,30],liabil:15,swingdriv:0,axi:[14,1,13,23,18,0,20,7,29,10,26],pxrigidbodydata:16,inspect:[8,19,30,32],updatea:25,max_id:19,regress:12,across:[19,30,11],commun:18,repxobject:14,undefin:[14,1,18,19,20,11,0,26,30],depict:30,esolve_contact:7,especi:[14,12,21,7,29,11,3,30,31],overload:[14,17],plateau:30,advis:[15,30,27],startaft:21,exit:[25,1],addition:[32,18,21,8,29,25,11,30],mcrab:21,emax_num_drive4w_analog_input:14,tirelongforcemag:30,pxcontrollerdesc:18,stretch:[14,12,26],getglobalpos:[14,16,26,17],dot:[14,6,12,1,23,24,0],createobstaclecontext:18,getscenepvdcli:32,ccdthreshold:7,collect:[14,16,13,6,18,5,8,29,11,27,26],attribut:[0,21,29,26,11,30],eanalog_input_brak:[14,30],tireload:30,nx_bounce_threshold:23,museautogear:30,carrawinput:30,setposit:18,setangularveloc:[14,16,25],movement:[16,1,18,12,20,29],pxbatchquerypostfiltershad:11,lowend:27,relationship:[16,19,20,0,28,30],pair:[20,7],pxlocationhit:14,coupl:[18,29,30,7,22],customproperti:27,gettiredrivablesurfacemateri:14,grip:30,landscap:30,highest:[0,30],collis:10,forcedynamictreerebuild:11,"const":[14,16,1,2,6,4,12,17,7,21,29,23,25,11,18,27,0,26,30,31,32],perform:[10,17],dominancegroup:16,px_physics_vers:[4,24,32,17],displac:[18,30],updatevehicl:14,materialindex0:[12,26],repeat:[12,30],px_max_sweep_dist:[1,11],desc:[14,18,12,7,23,11],decompos:[14,0,20],eany_hit:[14,13],nxusercontactreport:23,sever:[14,16,1,9,18,12,20,29,25,11,27,30],sample_new:26,simpl:[14,0,16,1,17,18,19,5,20,7,8,29,11,12,26,30],analyt:[30,20],fraction:16,eanalog_input_steer_right:[14,30],nextpatch:7,epostfilt:13,getdiffdata:30,rag:[19,20,21,0],indic:[12,20],mayb:[18,12,22],maya:19,convexstream:19,snippetvehicletank:13,cookconvexmesh:26,rai:[8,23,1,11,7],neglig:[15,6],shaderdata:30,snippetjoint:0,partial:[16,0,21,22,9,30],raw:30,mmoi:30,gphysic:[9,30,11],dream:30,alreadi:[0,32,18,12,5,21,29,11,19,26,30],getgeometrytyp:23,samplebuff:26,generateexamplefil:19,dread:0,pxvehiclewheelssimdata:[14,13],sensit:[29,17,28],eanalog_input_handbrak:[14,30],stage:[12,30,21,3],hand:[17,19,5,25,0,30],characterist:[12,30,11,0],initi:[17,18,12,20,7,9,26],neutral:30,activegroup:11,wheelssimdata:30,between:[20,7,29,25,11,12,14,17,0,1,23,26,30,4,8,16,18,19,6,21,24,28],echannel_wheel_omega:[14,30],unlock:[20,13],traffic:7,know:[1,23,18,20,29,26,30],pxvehicledrivetanksmoothanalograwinputsandsetanaloginput:30,birth:0,pxsamplesubmarin:29,newjointposit:20,assert:10,vehicl:[13,3,12,7,8,9,10,0],formula:[30,1],compar:[6,18,12,1,7,9,24,26,23,30],subset:[4,0,20,11,3,26],strict:[15,30],cudamanag:24,irrespect:[9,16],"throw":16,direct3d:17,remaind:30,manoeuvr:30,pxerrorcod:[14,17],createzero:14,everywher:[4,18],isspher:23,ackermann_steering_geometri:30,etank_wheel_5th_from_front_left:14,mysteri:3,come:[16,12,21,7,8,29,3,30],bump:[18,30,7],notifi:[29,25],wheelgraphposi:30,violat:[0,20],mcook:4,standalon:[24,26,6],deadlin:12,body0vel:0,wide:[0,16,20,30,23],bottom:[18,16,30],createrigiddynam:[14,29,16,30,26],manag:10,pxscenequeryflag:[14,6],ecommit_disabled_build_en:11,physxgpu_:3,closest:[12,1,13,0],phors21:30,e8th_from_front_right:14,prototyp:[29,30,19],harder:30,serializecollectiontobinari:[14,19],encompass:8,explicitli:[14,16,1,2,19,20,21,23,24],recogn:17,budget:30,pacejka:30,elock_angular_x:16,edisable_mesh_valid:26,pxvehiclepostupd:30,holder:15,capsuledesc:23,largest:18,setinverseinertiascale0:14,obsolet:6,wheelmass:30,perman:30,doanyhit:14,galloc:11,crabpo:21,interfer:30,enotify_threshold_force_found:7,reader:[8,21],getcontrol:18,settessflag:12,lose:[29,30],dru:11,lockread:21,taskid:21,snippetprunerseri:11,exemplari:15,defer:[30,11,17],steerangl:[14,30],bumpi:30,triangleindexbuff:1,push_back:[14,7],similar:[16,1,18,12,20,23,11,0,26,30],userflipedg:12,lost:[14,16,32,19,7,9,26],like:[14,0,16,30,17,18,28,20,7,29,23,25,11,27,19,26,12,31],cb2w:20,userprofilercallback:32,unambigu:30,roll_bar:30,ememori:[25,32],pxzero:[14,0,17],row:[16,17,12,26,0,30],suffix:3,manipul:[23,16,21],getnbbroadphaseregion:29,encapsul:[29,26,12],inlin:23,pxdefaultmemoryinputdata:26,px_profile_stop_crossthread:24,decreas:[12,16,30,7],singleton:24,focus:30,emesh_multipl:1,pxinputdata:[14,19],pointdist:1,our:[16,32,4,3,8,11,12],pxcollectforexportsdk:14,out:[14,16,17,4,12,21,7,29,15,11,18,0,26,30],mstiff:30,induc:30,sqresult:30,sharabl:19,pxgetgaussmapvertexlimitforplatform:[24,6],mcamberatmaxcompress:30,dangl:19,pxvehicletiredata:[14,13],getlin:2,sweepresult:30,addit:[17,18,12,20,7,9,10,26,27,3],section:[14,16,17,18,12,20,21,29,23,25,11,19,26,30],path:[14,16,1,4,19,20,3,24,11,0,26],gnbqueryhitsperwheel:30,sharp:26,pxsweephit:[14,1,11],nxjointlimitsoftdesc:23,denomin:30,conceiv:30,pxcontrollercollisionflag:[14,18],central:7,mscene:[28,16,32,21,23],halfheight:[18,23,26],setdigitalaccel:30,trackoutstandingalloc:[4,25],nxustream:23,multithread:[8,10,13],nxscene:23,unmodifi:1,ekil:[29,12],substepcount:25,versu:30,srcmetadata:19,simpli:[14,0,16,1,2,6,4,28,20,25,29,24,23,11,18,12,30],determinist:[12,16,17],pxjointconcretetyp:0,object:10,ecooking_perform:26,getconcretetyp:[14,19,17],wheelssimdata4w:30,twist:[0,20,23],versa:[29,12],setjointtyp:20,win:[0,24,3],vehicleid:30,freespin:0,gpu:10,nx_assert:23,secur:13,recomput:[12,30,11],eremoved_shape_trigg:[29,21,6],penal:30,reorder:[16,26],pxdefaultpvdfiletransportcr:32,pxfoundationvers:24,mcudacontextmanag:21,counterpart:[14,18,30],terrainwidth:12,pxcloseextens:4,index:[1,13,18,12,20,10,26,27],counteract:20,eblocking_hit:14,setflag:29,efix:20,pxcollect:[14,19,27],six:[0,30],minimis:30,http:[30,32],frictiontyp:14,pxconstructsolverbodi:16,physxmetadatagener:27,zone:[30,32],pxscenedesc:[14,17,16,9,12,6,7,21,29,24,25,11,23],echannel_norm_tire_aligning_mo:[14,30],setsubstepcount:30,setmassspaceinertiatensor:[16,30],pxvehiclechassisdata:30,gdefaultallocatorcallback:4,assess:23,mwheel0:30,getsusplinedir:14,extractcontact:[14,7,6],gettirelongslip:[14,30],pxvehicledrivesimdata4w:30,should:[20,7,29,11,12,14,17,0,1,23,26,30,32,2,8,9,16,18,19,6,21,3,24,27,28],emesh_ani:[1,11,6],samplesubmarinefiltershad:29,outward:[30,1],createaggreg:29,efront_left:[14,30],d6joint:0,distro:8,potenti:[14,16,32,18,19,1,21,7,29,11,12,30],"30hz":12,getsimulationfilterdata:14,wheelmap:30,setsuspensiondata:30,verbos:19,former:[14,23,30],combo:14,setsimulationfilterdata:[14,29,16,30],filterdata1:[14,29,30,7,23],cpu:[16,9,12,17,21,8,29],remark:[14,30],substeps:25,cpp:[5,26,27],meshpreprocessparam:26,rang:[14,16,1,12,20,7,8,23,11,27,0,26,30],singular:0,customtypeautogeneratedmetadataobject:27,perfectli:[18,30,25,21,7],thrust:30,matrix:[23,16,20,17],createvehicleactornw:30,etrigger_default:[14,29,7,23],waitforquit:14,pxconstraintshadert:0,quitissignal:14,pxsetphysxdelayloadhook:4,deactiv:0,pxscenequeryfilt:14,isrigiddynam:6,setscenepvdflag:32,sprungmass:30,matric:[14,23,17],busi:15,owner:[15,27],capabl:[9,23,12,17,8,29,25],anb:12,speedup:11,ever:[12,16,25,30],skateboard:30,inner:[12,30],pxcontrol:18,createserialobjectid:19,ani:[14,0,16,1,13,9,18,28,5,20,7,8,29,23,26,12,15],mlongitudinalstiffnessperunitgrav:30,totallinkcount:20,persistentcach:11,mpvd:[4,32],eanalog_input_brake_right:14,collisiongroup:23,field:[18,20,17],discover:21,addtorqu:[14,16,25],circumst:[12,16,7],pxfiltershad:0,hidden:10,pxcreatejointconstraint:16,apex:[21,6],invisiblewallheight:18,sub:[25,29,23,30,12],sum:[30,20,7,31],awak:[0,16],eequal:0,pxvehiclecomputesprungmass:30,criteria:20,interpol:[12,30,0],disclaim:15,chassissimfilterdata:30,ignor:[14,16,1,6,18,19,17,21,29,0,30],awar:[18,12,1,21,23,30],howev:[14,17,16,1,2,9,18,19,20,25,7,21,8,29,23,0,26,12,30,31,15],front:[30,1,17],version:[14,3,6,18,12,5,17,7,8,29,24,11,23,26],eswing2:0,eclutch_slip:14,eswing1:0,srcassets:19,getforc:0,kilogram:30,wikipedia:30,mix:[3,26,21],statement:[26,7],quot:30,transit:[14,16,30,6],entiti:[8,29,18],ccdfiltertest:14,pxvehiclecopydynamicsmap:30,"\u215e":24,script:[3,24,27],effect:[14,0,16,6,18,28,1,7,23,24,11,12,26,30],phenomenon:18,min:20,px1dconstraint:0,sixtieth:28,opportun:30,srcbinfil:19,edisable_ccd_resweep:7,program:[8,4,5,17],indirect:[15,30],remap:26,settwistlimiten:20,gsharedindex:25,kept:[18,0,16,1],pxtypeinfo:27,"81f":9,bend:14,pxsolvercontactdesc:16,upgradecollect:14,undrivable_surfac:30,condit:[0,1,21,7,3,15,19,12],nxusernotifi:23,phxsx:19,echannel_tireload:[14,30],twice:[19,30,11],registercustomclassrepxseri:27,messag:[14,9,32,17,5,29,25],hfscale:26,processor:29,inherit:[14,17,0,23,26,27],pxvehiclesuspensiondata:13,efast_inertia_comput:26,chassi:30,materialindex1:[12,26],e7th_from_front_left:14,e1st_from_front_right:14,expir:18,quad:[0,1],pxfrictiontyp:14,entri:[29,20,30,17],ascii:19,filterdata0:[14,29,30,7,23],librari:[14,1,3,6,4,19,17,25,21,8,18,24,23,11,27,0,26,31],constitut:7,physxcook:[4,14],etank_wheel_1st_from_front_left:14,centroid:1,nbcolumn:[12,26],devic:9,setactorpairflag:23,serv:[9,11],pxsetphysxcookingdelayloadhook:4,pseudocod:14,sampl:[10,7,17],"75f":[14,30],pxsimulationstatist:22,devis:30,collision_flag_wheel_against:30,heightsampl:12,pxcreatefound:[4,23,24,6],internaldriveiter:20,setinvinertiascale1:16,predict:[18,16,30],eimpact:14,memorybuff:27,found:[14,17,1,29,18,19,5,20,21,9,25,11,27,0,30],automat:[14,16,32,18,19,5,20,21,7,29,24,25,11,27,0,26,30],px_max_nb_wheel:[14,30],funtion:14,pxvehiclewheelssimdta:30,eengine_drive_torqu:14,stamp:30,evehicle_type_nodr:14,setanalogst:30,hopefulli:30,argument:[14,1,17,0,6,29,25,11,26,30],myconvexmeshshap:26,esend_sleep_notifi:[16,25],error:10,chassisconvexmesh:30,renderobject:16,bvh:11,pxvehicletelemetrydata:[14,30],complet:[14,13,18,28,1,7,10,12],glass:1,rotat:[16,13,17,18,0,20,23,26],accuraci:[18,12,16,20,30],tirecontactactor:14,nonphys:0,renam:[14,23,24,6],isinair:[14,30],eenable_gpu_dynam:[9,12],scienc:30,dynam:[10,7],px_generate_static_librari:3,chassiscmoffset:30,thing:[18,19,7,25,12,30],rowscal:[12,26],"80kg":30,think:[18,30],batchqueryfilterdata:11,resolv:[14,0,16,18,19,21,7,26,27,12,30],camberangl:30,mitsubishi:30,involv:[14,0,16,13,9,28,23,7,29,11,19,12],setseparationtoler:20,tree:20,applic:10,estandard:[14,30],linkgeometri:20,param:26,px_window:21,caution:[12,30],maxnbstaticshap:11,drag:32,gdelayloadhook:4,overlapcapsuleshap:23,reinterpret_cast:[0,7],"16m":12,edebug:32,tirecontactpoint:14,nbconstraint:16,mechan:[14,0,16,17,18,19,20,21,7,8,29,23,25,11,27,12,30],unscal:12,physxcommon:[13,17],full:[7,8,9,26,11,30],refitbvh:26,getbinarymetadata:27,which:[20,7,29,25,11,12,14,17,0,1,23,26,30,4,8,9,16,18,19,6,21,22,3,24,27,28],pleas:[14,0,16,2,6,4,19,7,22,23,24,11,12,26,30,31],isoverlap:1,whose:[16,1,17,18,0,20,21,7,29,26,30],gcarpadsmoothingdata:30,rapid:[19,16,30],curv:13,setinvinertiascal:12,pipelin:[9,12,21,28],iscloth:6,geom1:[16,1],geom0:[16,1],"28tank":30,etank_wheel_3rd_from_front_right:14,getdata:26,physapi:25,stiffen:0,"new":[5,7,29,25,11,14,17,28,20,23,26,31,8,9,16,18,12,6,21,24,27,0],echannel_brake_control:[14,30],etank_wheel_8th_from_front_right:14,again:[16,2,6,4,19,20,7,29,25,26,18,12,30],pxinitvehiclesdk:[14,30],targetposit:[0,20],setdigitalsteerleft:30,reportassertviol:23,addbroadphaseregion:29,enotify_threshold_force_lost:[21,7],eestim:30,neg:[12,1,26,7,30],ewheel_omega:14,autowak:[14,16],occup:11,setsolveriterationcount:16,unphys:30,obstacl:[12,10],settireloadfilterdata:30,occur:[16,12,20,21,7,11,0,30],same:[20,7,29,25,11,12,14,17,3,1,23,26,30,32,8,9,16,18,19,6,21,24,0],pxfilterobjecttyp:29,magnitud:[14,16,18,0,7,30],analogu:27,lift:30,header:[10,17],implement:[14,17,1,13,9,4,19,20,6,7,23,25,29,24,0,11,18,27,12,26],basetask:13,drawwalldec:11,best:[10,7],pointfromuv:1,paramet:[20,7,17],pname:27,measur:[12,30,22],life:[16,30,7],cook:10,pivot:12,pxccdcontactmodifycallback:[23,7],pxvehicleupdatesinglevehicleandstoretelemetrydata:30,appear:[16,18,12,5,7,23,0,30],throughout:[30,20],pxgdynamicsmemoryconfig:9,introduc:[14,16,6,18,28,20,7,29,26,12,30],race:[0,30,21,3],jounc:30,edeleted_shape_0:6,emit:[9,26,7],bake:23,incomplet:13,jointacceler:20,inch:30,etg:16,interleav:[25,21],stronger:[23,30],support:[20,5,7,29,11,12,14,17,0,1,23,26,30,32,4,8,9,16,18,19,6,21,3,24,27,28],gather:12,emax_nb_drive_channel:14,adher:0,hfshape:26,coeffici:[23,16,20],unus:[4,12,25,27],schema:19,particularli:[16,0,29,25,26,30],mdampingratezerothrottleclutchengag:30,imbal:30,forward:[13,18,28,20,25,12],pxvisualdebuggerext:[14,25],pxsolvertyp:16,heightfielddesc:26,user:10,reliabl:[30,6],remot:[29,2,19],cab:25,customclassrepxseri:27,setdriveveloc:[0,20],summari:[30,22],ramp:18,echannel_steer_right_control:14,smaller:[14,18,12,7,26,11,30],can:[20,5,7,29,25,11,14,17,0,1,23,26,30,31,32,2,4,8,9,16,18,19,6,21,22,3,24,27,12],choreograph:30,oncontactmodifi:7,pxmeshoverlaputil:[14,6],unrespons:30,linearvelocityactor1:7,gap:26,practic:[20,7,10],pxrevolutejointcr:0,globalpos:[16,7],ricochet:7,"1500kg":30,lifetim:19,thin:[1,7],here:[14,0,16,1,30,4,19,20,7,29,25,11,18,27,28,26,12,32],larger:[14,16,12,20,7,26,0,30,31],externaldriveiter:20,desir:[16,18,12,7,29,24,25,11,0,26,30],px_define_typeinfo:27,std:[14,7],pxvehicleautogeneratedmetadataobjectnam:27,insid:[17,23,4,1,7,29,25,26,30],eenable_active_actor:16,convexshap:29,onshapehit:18,stl:23,immut:[29,16,9],notic:[0,15,30,7],batchqueri:30,pvdsdk:32,khz:30,closer:[11,31],snippetvehicle4w:13,node:0,getpvddatastream:14,getsweepqueryresultbuff:30,issleep:[14,23,16],pxspheregeometri:[14,29,23,26,19],raycastani:[14,6],left:[17,18,19,5,27,30],"\u215d":24,etransmit_contact:[14,32],post:[28,25,11,0,17],overal:[12,16,21,7],nearest:11,settireforceshaderdata:30,fidel:25,descript:[10,26,13],setqueryfilterdata:[30,11],playback:19,susptraveldirect:30,granular:21,auto:10,pxvehicledrive4wrawinputdata:30,pressur:11,right:[16,17,18,19,5,15,11,26,30],mmaxfilterednormalisedload:30,ebvh33:26,comfort:16,didn:19,pxsphericaljointflag:0,button:30,espher:[20,26],getconstraint:0,drop:[32,0,7,9,25,26,12,30],"enum":[14,16,17,18,6,29,24,11,26,30],acquir:21,pxpruningstructur:[12,10,13,6],charact:[20,10],push:[18,0,16,30,7],you:[14,17,16,25,3,9,4,6,20,7,21,22,29,23,0,11,18,28,26,12,32],durat:30,emax_num_title_char:14,penetr:[10,7],ekinemat:[29,16],registerbinarymetadatacallback:27,engag:30,longitudinaltireforc:30,stiff:[14,0,20,30],pxdefaultalloc:4,contactstream:14,swaptolowlodvers:30,edrive_model_speci:14,gettaskid:21,pxvehicleautogeneratedmetadataobject:27,setoverlaprecoverymodul:18,ralli:30,eenable_swept_integr:14,traction:30,distribut:[3,15,10,13,17],mmaxdroop:30,appguid:21,clean:[4,26,6],solut:[18,19,5,1,7,3,24,25,11,12,30,31],replac:[14,6,18,23,9,24,11,30],mcamberstiffnessperunitgrav:[14,30],hfgeom:[12,26],pxqueryhit:[14,6],adopt:30,nxwheelshap:30,contactcorrelationdist:14,pxvehicledrivetank:14,imaginari:20,chassisqryfilterdata:30,gettiredrivablesurfacecontactnorm:14,echannel_norm_tire_lat_forc:[14,30],dramat:16,convent:13,constructor:[14,17,23,11,27,30],conveni:[14,16,32,18,19,1,29,26],aaaabbbbccccdddd:12,runclang_:27,volumegrowth:18,nxphysic:23,predefin:[4,26,7],buffers:[1,11,7],"_aligned_malloc":[23,25,17],setkinematictarget:[0,16,25],end:[16,3,23,18,0,1,7,29,25,12,30],getsusplinestart:14,einvalid_oper:17,vanish:32,vnext:0,"60th":12,settargetorient:20,queri:[10,17],modern:8,collisionflag:18,xml:[14,19,3,27,23],hash_map:23,eplane_shift:26,mysimulationeventcallback:0,test:17,px_delet:14,add:[14,0,3,9,19,20,21,7,29,11,27,12,30],pxforcemod:16,along:[14,16,1,6,18,19,20,7,29,24,23,11,27,0,26,30],hover:30,mnbmateri:19,pxvec4:17,filtershad:[14,9,7,23],wobbl:16,triangleneighbour:26,heart:18,vanilla:30,pxserial:[14,19,30,27],plug:19,difficult:[18,9,23,30,12],remov:[13,17,18,12,20,10,26,0],exampl:[20,7,29,25,11,14,17,0,1,23,26,30,32,2,4,8,9,16,18,19,21,22,3,24,27,12],pxvehicledrivablesurfacetyp:30,hardcod:11,pxraycasthit:[14,30,1,11],bring:[8,0,30],pxbroadphaseregion:29,localpos:12,meshcontactmargin:16,fill:[16,18,19,1,11,26],pxserializerdefaultadapt:27,friend:27,ejounc:[14,30],usercallback:11,shapea:[19,23],xcm:30,pxclientbehaviorflag:14,registerdeletionlistenerobject:14,compound:[12,7],binari:[3,10,17],artist:30,aggress:30,erear_left:[14,30],unintuit:0,tag:[18,23],mbatchcomplet:14,earth:16,climb:10,wrap:21,edeprecated_trigger_trigger_report:13,followjoint:19,mean:[14,16,9,18,19,20,21,7,29,25,11,12,26,30],cookingparam:26,tirecontactnorm:14,turn:[16,13,9,3,21,7,8,29,25],implicitli:[14,19,0],setsleepthreshold:[23,16],fals:[14,6,18,28,1,29,26,27,3],etank_wheel_6th_from_front_right:14,rough:[16,30],pcm:10,jointveloc:20,uncoupl:30,"while":[14,0,16,30,2,22,18,19,5,20,7,21,8,29,23,25,11,28,26,12],cleanup:14,mfrictionvsslipgraph:30,sse2:26,ebroken:0,thereaft:[11,17],pxpvdinstrumentationflag:[4,12,32],anim:[18,28,25],topic:[0,16],deadlock:21,particl:[14,10,13],exert:16,resourc:[4,12,16,25,21],monitor:12,nxspringanddampereffector:23,fall:[16,32,9,18,20,29,26,30],mthread:14,pxsolveconstraint:16,tcp:32,eis_releas:27,normalrejectangl:30,einternal_contacts_are_flip:7,outsolverbodydata:16,settiredata:30,swing1:0,daili:3,emploi:[14,30,21],settireforceshaderfunct:30,px_def_bin_metadata_item:27,prefer:[14,2,17,4,12,21,25,26,3],gamepad:30,collision_flag_ground_against:30,few:[18,12,5,7,11,30],platforms_as_obstacl:18,control:[20,7,10,17],smoothli:[18,0,30,7],pxmat33:[14,17],command:[19,5,24,3],gettwistangl:24,htm:30,eas:19,proce:[26,25,11,30],advers:9,guserphysxtyp:27,ediscrete_respons:14,capsuledens:26,stutter:7,sethalfforwardext:18,textur:30,plagu:18,result:[14,17,16,1,13,3,9,18,6,20,7,29,23,10,26,28,0,12],context:[14,5,21,9,25,27,30],settypepairfrict:30,"export":[19,30,27],unknown:[18,23],ptype:27,pxplatform:6,unabl:30,pxsetprofilercallback:32,etransmit_constraint:[14,32],nbsprungmass:30,"60hz":30,gettiredrivablesurfaceshap:14,prefiltershad:[30,11],"\u215c":24,struct:[14,16,32,9,4,0,29,11,27,23],raycast:10,lookup:[23,30],getlinkindex:20,minimalist:5,substitut:[23,15],chapter:[0,16,17,4,28,1,21,22,12],stabil:[12,20,7,25,26,0,30],pierc:1,disabledpropertyentri:27,variat:[28,7,27,12],promot:[15,30],react:[30,25,32],spheric:[18,20,13,10,1],setgrav:16,outweigh:21,folder:[23,5,27,3],invest:17,contactrecord:16,setdrivetyp:20,maxoverlapsperexecut:14,rear:30,experienc:[14,18,30],echannel_normalized_tireload:14,mtorquecurv:30,dynamicstructur:11,customtypeautogeneratedmetadataobjectnam:27,per:[14,0,16,9,18,28,20,21,7,25,29,24,23,11,12,26,30],heavier:[0,7],prohibit:11,mbatch:14,troubleshoot:10,eowns_memori:27,trough:12,convert:[14,17,18,12,23,11,27,26],edeprecated_body_joint_group:24,magenta:18,pxsetphysxcommondelayloadhook:4,convers:[14,19,30,26,23],pxvehiclenw:30,setgearsdata:30,applycach:20,tirealignmo:30,windmil:16,determin:10,samplebas:[29,21],converg:[0,16],maxlesepar:30,emine_head:29,cluster:[29,11,21],iresult:14,levit:18,gbatchqueri:30,analogv:30,smoke:30,swap:[25,11],dimensionless:30,against:[17,18,12,7,9,26,27],pathtonewrepxfil:19,enotify_touch_lost:21,echannel_accel_control:[14,30],hamper:30,prefix:[14,23,17],channel:30,among:[14,16,0,5,25,30],vehicle4w:30,offset:[12,20,7],suspspringforc:14,enodr:14,softwar:[23,15],toi:[16,7],further:[14,16,1,9,19,6,7,21,29,25,11,26,30,31],getverticesformodif:26,too:[2,3,17,18,19,7,29,11,28,26,13],legal:[16,20,21,7,23,25,30],stiffer:30,solvertyp:[14,16],incallback:27,follow:[20,7,29,25,11,12,14,17,0,1,23,15,26,30,31,32,2,4,9,16,18,19,6,21,24,27,28],sap:[29,9],zonestart:32,pxmin:12,strictli:[18,30],custom:[10,7,17],rustl:11,successfulli:[14,19,25,32,30],fresh:30,nbtouch:[14,11],nbbatchhead:16,nbactiveactor:16,disableshapeinscenequerytest:29,press:30,stabl:[12,0,7,30,17],contactpoint:[7,6],pxscene:7,setdriv:[0,20],popular:[12,17],filenam:[19,25,32,27,17],tolerancelength:16,pxvehicledrive4wsmoothdigitalrawinputsandsetanaloginput:30,blue:[18,30,21],twenti:28,onconstraintbreak:[0,25],getsleepthreshold:16,pxactorclientbehaviorflag:14,improv:[14,16,12,21,7,11,0,26,30],customtypeextensionapi:27,eforc:16,python:24,affair:25,approxim:[14,16,17,12,1,7,9,26,30],exempt:29,nxcookinginterfac:23,directli:[14,0,16,30,2,17,4,19,20,7,21,18,24,23,11,12,26,22],inaccur:30,prep:9,getwheelshapemap:14,strength:[0,20,30],readili:30,concess:12,"final":20,drivesimdata4w:30,pxregisterheightfield:[4,26,6],pxvehiclesuspensionraycast:30,hadblockinghit:11,maco:19,revisit:[29,26],never:[13,18,12,20,21,29,11,3],straighten:30,jointposit:20,mfinalratio:30,pxsolverbodydata:16,pxheightfieldsampl:[12,26],pxarticulationreducedcoordin:20,ideal:[18,29,30,0],momentum:[8,0,30,7],scientif:30,setwheelshapemap:[14,30],knee:0,createserializationregistri:[14,19,30],instanc:[14,0,2,23,4,19,1,21,22,29,25,11,27,12,26,30],linear0:0,linear1:0,meshsizeperformancetradeoff:[12,24,26,6],getdebugrender:23,pxconstraintsolverprep:0,thephys:26,shall:[30,15,25,21],impos:[30,25],nxactor:23,width:13,ecollision_down:18,pxuserrefer:14,apexscen:21,difficulti:[0,30],bigger:[18,16,7],zangl:20,einequ:0,sphereshap:14,eventu:[29,16,30,21],cuboid:30,pxconstraintalloc:16,estat:[14,13],parentattach:20,conceptu:30,eexclude_kinematics_from_active_actor:16,vector:[20,10,17],createheightfield:[12,26,6],e9th_from_front_right:14,pound:30,pxvehiclecompon:30,restdist:7,velocitytarget:0,whole:[18,29,16,11,19],crt:3,structur:[16,1,30,23,12,20,21,7,29,24,25,11,27,0,26,22,31],lifo:21,registerobserv:14,pxcreaterepxobject:27,pxsweepqueryresult:30,overlaphitbuffers:14,offlin:[12,26],echannel_jounc:[14,30],nearbi:[18,11],amount:[0,16,2,9,18,28,1,7,22,29,23,25,26,12,30],numshap:29,gvehicleinputdata:30,invdt:16,provid:[2,17,18,12,20,7,9,27],"void":[14,16,32,17,18,19,23,21,7,29,25,11,27,0,26,30],gsteervsforwardspeedt:30,wheelraycastprefilt:30,challeng:[30,26],trade:[29,20,26],light:[0,16,30,7],all:[20,7,29,25,11,12,14,17,0,1,23,15,26,30,31,32,2,4,8,9,16,18,19,6,21,3,24,27,28],scalar:[20,7],ebend:14,pxoverlaphit:11,priorit:0,introduct:10,hadblock:11,pxpruningstructuretyp:[10,13,6],recurs:[18,25],getwheelcentreoffset:30,neighbor:26,pxvehiclesetupdatemod:30,maxnbdynamicshap:11,howstuffwork:30,ellipt:[0,20],beyond:[13,0,20,29,24,12],evolv:8,bullet:[1,11],wheelradiu:30,e3rd_from_front_right:14,adaptive_forc:23,linearmotionveloc:16,setprecisesweep:18,flip:[26,7],aid:[14,30],emax_num_wheel_channel:[14,30],preparedata:0,eimpuls:16,exportnam:19,demonst:30,damper:30,org:30,highpeakpress:30,taskmanag:[23,10,13],userraycasthitbuff:14,extract:10,evolut:18,getmass:30,tricount:26,myrenderobject:16,audio:[8,30],comment:16,mrearleftrightsplit:30,corrupt:21,surface_type_tarmac:30,echannel_steer_control:30,cycl:[0,30],eenable_two_directional_frict:14,mnbthread:21,readallproperti:27,pxcudacontextmanagerdesc:[9,21],arithmet:[8,9],pxtrianglemeshgeometri:[29,26],skin:[18,13],swaptohighlowvers:30,techniqu:[30,20],px_max_flt:0,danger:30,explicit:[14,6,20,21,8,23,24],mmaxnormalisedload:30,maxdist:[1,11],higher:[12,20,3],delai:[10,13],sprung:30,"__line__":[25,17],mod:0,enorm_tire_long_forc:14,virtual:[14,16,32,17,4,28,21,23,25,11,27,0],moi:30,bb2w:0,px_delete_repx_seri:27,createcustomclass:27,kinet:16,abil:[0,30],valuabl:[30,7],role:[29,30,0],"0xffffffff":[23,1],dimension:[8,16],roli:16,outorderedconstraintdesc:16,inread:27,fly:[18,3],overli:30,pxcreatecudacontextmanag:[9,21],defici:23,pxconcretetyp:[19,27],pxrepxobject:27,getanyhit:14,cudacontextmanagerdesc:[9,21],overwritten:27,mestimateiter:30,unnecessari:[21,27],zero:[14,16,1,17,18,0,20,21,7,23,11,26,30,31],glean:29,userdata:[14,16,17,18,19,7,11,30],myactor:[29,23,26,17],distinct:[0,25,30],pxconstraintbatchhead:16,invinertiascal:[16,7],pxarticulationjointtyp:20,ecommit_enabled_build_en:11,"break":[14,16,17,19,20,7,25,11,0,26,30],profound:30,pxdominancegroup:16,snippetcustomprofil:32,centreofmass:30,flushscenequeryupd:11,typeid:0,pxarticulationmot:20,pxhitcallback:[23,11],trg:30,ca2w:20,isblock:14,vanishingli:30,enorm:[1,11],each:[20,7,29,25,11,12,14,17,3,1,23,26,30,8,9,28,16,18,19,6,21,0],wider:[16,30],gave:19,varieti:[0,30],colscal:26,isarticulationlink:6,geometri:[12,10,7,3,17],nbbodi:16,geompos:1,edynamic_aabb_tre:13,undocu:17,shapepos:11,reentrant:7,getactivetransform:24,pxfindfaceindex:1,trimeshid:19,pxpvd:[4,32,6],"try":[18,12,21,22,0,30],obligatori:30,nxrai:23,accumul:[28,16,30,11],cylind:0,actuat:[8,20,13],unregisterrepxseri:27,statickinefilteringmod:[16,7],motor:[0,20],idx:27,suspdata:30,pxdefaultmemoryoutputstream:26,amplitud:30,pick:[30,11],trail:18,amaterialarrai:26,anoth:[14,0,16,17,18,12,1,21,29,23,25,26,27,19,30],ekeepbia:0,pool:[12,25,21],setwheelcentreoffset:30,etire_lat_slip:14,gdefaulterrorcallback:4,unregistercustomclassbinaryseri:27,pairwis:[23,7],edisable_preprocess:0,physxmetadata:27,grant:21,evenli:30,emax_nb_title_char:[14,30],processtouchinghit:14,setnam:19,nxstream:23,optim:[0,32,3,23,19,1,21,7,29,11,12,30],setheight:18,echannel_handbrake_control:[14,30],echannel_gear_ratio:[14,30],solitari:29,generate_project:[3,24],piec:[18,26],read:[14,16,32,4,28,21,7,25,27],gl_modelview:17,bvhdesc:11,dofstart:20,wheeltodis:30,rememb:[18,19,30],pxshape:[14,16,17,18,12,1,7,29,24,23,11,19,26,30],pxaggreg:29,createscen:[9,23,21,7],deepest:29,squish:16,ought:30,ride:[18,30],subsequ:[14,16,18,12,21,7,23,25,11,0,30],goal:[12,30,21,7],slide:[18,0,20,7],classtyp:21,host:9,regim:30,contigu:30,setdrivetorqu:30,echannel_tire_frict:[14,30],dynamictreerebuildratehint:[12,11],manufactur:30,part:[17,3,20,7,10,12],hole:[26,7],hold:[14,16,18,19,1,21,7,23,0],validatetrianglemesh:26,draw:17,eremoved_shape_0:[21,6],eremoved_shape_1:6,reveal:30,minpositionit:16,px_serial_ref_kind_pxbas:27,outbatchhead:16,appendix:30,reach:[16,17,18,12,21,7,29,25,26,0,30],pxactorclientbehaviorbit:14,drivetargetangl:20,pxvehicle4w:30,resolverefer:27,keyboard:30,doc:30,maxraycastqueri:14,pars:[7,6],dof:[0,20],live:19,centric:11,pdf:30,getboxgeometri:26,pxjointlinearlimitpair:14,don:30,efront_right:[14,30],getconcretetypenam:[27,17],pxfilterobjectistrigg:[29,23],actual:[16,4,19,18,12,30],eproject:0,method:[20,7,29,25,11,14,17,28,1,23,26,30,31,32,4,16,19,6,21,22,27,12],differenti:13,pxactortypeflag:[14,6],servic:15,assort:23,pxvisualdebuggerconnect:32,suit:[19,20],vertex:[9,10,12],attach:[14,17,19,20,7,29,11,0,30],handleoverflow:14,pxconstructstaticsolverbodi:16,mtype:30,pxfilter:29,lag:0,readbuff:26,etouch:[14,11],system:[10,7,17],createfrictionpair:30,linearvelocityactor0:7,eptr:27,interrupt:[29,15],eremoved_shape_oth:29,childpos:20,addcollisioncapsul:14,even:[14,0,16,1,2,17,18,19,20,7,21,8,29,23,25,11,12,30,31,15],releas:[10,7,17],append:27,possibl:[20,7,29,25,11,12,14,0,1,23,15,26,30,4,9,28,16,18,19,21,22,3],purposefulli:30,touch:[14,16,13,18,12,1,7,29,10,26],pxcreatecook:[4,23],ball:[0,5,20],simfilterdata:30,scheme:[16,23,12,7,29,24,30],pxmodifiablecontact:7,contribut:[18,0],susplinestart:14,pxconstraintinvmassscal:0,pxvehicl:27,overlapsphereshap:23,"125f":30,createcollectionfromxml:[14,19],seem:[0,30,31],tensor:[12,16,26,23],somewhat:[0,17],parameter:30,settwistlimit:20,onconstraintreleas:0,x87:17,pxconstraintsolverbodi:16,convexmemory128:19,nx_timestep_vari:23,pxscenequeryfilterdata:[14,6],gpudynamicsconfig:9,npc:18,stall:21,robot:[8,3,20],regist:[14,16,32,6,4,25,26,27],vehicleupdatemod:30,einitial_overlap:14,pxheightfieldmateri:26,createconstraint:0,defaultmateri:[29,30],sync:[14,18,12,7,9,11],faceindex:[1,11],noth:[16,6],onsleep:[16,25,21],note:[20,7,29,25,11,12,14,17,0,1,23,26,30,31,2,4,8,16,18,19,21,27,3],isparticlesystem:6,gcc:24,relax:30,project:10,ba2w:0,esubmarin:29,pxcudacontextmanag:24,wildli:30,e2nd_from_front_left:14,pxtriggerpairflag:[29,21,6],assum:[0,16,19,20,21,7,29,11,12,26,30],xmlwriter:27,setmot:[0,20],impulsivetorqu:16,px_physx_core_api:16,fine:[14,12,30,18],histori:[19,30],eenable_one_directional_frict:14,find:[16,3,17,12,1,7,29,11,27,19,26],ccd:10,addforc:[14,16,25],latslip:30,pxvehicledrive4wsmoothanalograwinputsandsetanaloginput:[14,30],geometryqueri:11,trimesh:19,pxrigiddynam:[14,0,16,13,17,18,28,23,7,19,29,24,11,27,12,26],deletionev:14,horizont:[18,30],highlimit:20,ontrigg:[29,25,21],eprismat:20,eenable_ccd:[14,7],indefinit:[12,16],refcount:21,pxbatchqueri:[14,30,11],classic:[8,16],"byte":[25,19,23,17,12],reoder:12,myob:14,setswinglimit:20,restitutionmodifi:0,dispatch:[9,21,12],pxconstraintinfo:0,actora:[23,16],wheelsdyndata:30,actorb:[23,16],efre:0,instantiate4wvers:30,actormemory128:19,setparentpos:20,pxvehicleupdatemod:30,geometr:[14,0,23,26,11,30],onli:[2,17,18,12,20,7,9,27,3],disguis:30,writedata:27,getpvdconnect:14,pxvehicewheelssimdata:30,pxvehiclekeysmoothingdata:30,real:[16,32,18,19,20,7,8,26,28,30],pxbatchquerymemori:[14,11],jointdesc:16,mesh:[10,17],pxcollectforexportscen:14,room:14,fulfil:[30,20],repeatedli:1,aabbtre:11,root:[0,5,20,27,3],pxclosevehiclesdk:[14,30],pxsweepbuff:[14,11],pxvehiclegraphtyp:14,miss:[12,30,7,27,23],likewis:14,rapidli:30,createexclusiveshap:[17,16,6,19,20,29,24,23,12,26,30],surfacetyp:30,safeti:[14,30,1],anmim:25,actor1:[14,0],pretti:[30,31],"static":[14,0,16,3,23,4,28,7,8,29,24,11,18,27,12,26],focu:30,portion:[12,30,7,23],readabl:19,mnbvert:26,circl:[30,26],ask:[8,29,7],cull:[12,1,11,2],pxserialobjectref:14,runonspu:14,oncontact:[29,25,21,7],profilerdata:32,mwidth:30,getphysxcommondllnam:[4,24],ediff_type_ls_rearwd:30,max_num_surface_typ:30,pxqueryreport:14,getlinearveloc:[0,16],setu32collisionflag:23,echannel_engine_rev:[14,30],gram:30,cmake:10,cours:[8,30],seidel:[8,10,13],artefact:[12,7],watertight:7,wheelidx:30,layout:[19,30,26,17],pxbvhstructur:[10,13],believ:[8,30],els:[14,23,16,30,11],take:[14,0,9,16,30,6,18,19,17,25,7,21,8,29,24,23,11,28,26,12],physxcommon_x64_renam:4,carsim:30,patchstreamcapac:9,eventnam:32,pxserialz:14,sweep:[18,12,10,26,7],mcrabbodi:29,enter:[29,30],steep:[18,13],physxcommon_:3,someth:[18,30,25,26,32],ultim:16,sequenti:[30,25],px_float_point_precise_math:3,rubberi:0,setgeometri:26,maneuv:30,exclud:[16,11],commit:[11,17],spatial:[17,0,6,29,11,30],buf:[14,26,11],bug:[8,12],px_buildsnippet:3,getpvdconnectionfactori:14,idl:[18,28],gravitationalacceler:30,egpu:9,mydellisten:14,iqueri:14,submit:[14,23,25,21],exclus:[19,21,24,27,0,12],msprungmass:30,onreleas:14,triangul:26,pxqueryhittyp:[14,30,11],"01f":[0,1,30],enotify_touch_found:[29,24,7],mcamberatmaxdroop:30,could:[16,1,23,18,19,20,7,22,29,11,27,26,30],skeleton:23,setupnondrivablesurfac:30,edg:[14,18,12,7,26,0,30],wheelcentrecmoffset:30,slerpdriv:0,registercustomclassbinaryseri:27,occasion:[11,21],pxconvexmeshgeometri:[29,30,26,17],toward:[18,0,20,26,30,31],unlik:[16,1,23,20,7,8,9],fuso:30,drawback:7,setsusptraveldirect:30,sdkroot:17,surfacemateri:30,almost:[8,29,30,21,32],e5th_from_front_right:14,rest:[0,30,26,21,7],knock:16,server:32,x64:24,invis:10,setactorflag:16,name:[17,18,28,6,15,8,29,24,23,27,12,26],boxshap:14,allevi:12,spawn:[18,29],dictat:18,inconveni:16,clock:30,camberangleatmaxcompress:30,quantit:22,perspect:21,mbuf:27,failur:[3,16,21]},filenames:["Manual/Joints","Manual/GeometryQueries","Manual/DebugVisualization","Manual/BuildingWithPhysX","Manual/Startup","Manual/HelloWorld","Manual/MigrationTo34","Manual/AdvancedCollisionDetection","Manual/Introduction","Manual/GPURigidBodies","Index","Manual/SceneQueries","Manual/BestPractices","Manual/Index","Manual/MigrationTo33","Manual/License","Manual/RigidBodyDynamics","Manual/API","Manual/CharacterControllers","Manual/Serialization","Manual/Articulations","Manual/Threading","Manual/Statistics","Manual/MigrationFrom28","Manual/MigrationTo40","Manual/Simulation","Manual/Geometry","Manual/ExtendingSerialization","Manual/RigidBodyOverview","Manual/RigidBodyCollision","Manual/Vehicles","Manual/OriginShift","Manual/VisualDebugger"],objnames:{},titles:["Joints","Geometry Queries","Debug Visualization","Building with PhysX","Startup and Shutdown","Snippets","Migrating From PhysX SDK 3.3 to 3.4","Advanced Collision Detection","Welcome to PhysX","GPU Rigid Bodies","NVIDIA PhysX SDK 4.0 Documentation","Scene Queries","Best Practices Guide","User's Guide","Migrating From PhysX SDK 3.2 to 3.3","PhysX License","Rigid Body Dynamics","The PhysX API","Character Controllers","Serialization","Articulations","Threading","Simulation Statistics","Migrating From PhysX SDK 2.x to 3.x","Migrating From PhysX SDK 3.4 to 4.0","Simulation","Geometry","Extending Serialization","Rigid Body Overview","Rigid Body Collision","Vehicles","Scene Origin","PhysX Visual Debugger (PVD)"],objtypes:{},envversion:43})
\ No newline at end of file
+Search.setIndex({terms:{"final":28,pxconvexmeshcookingresult:5,vc14win64:[19,16,1],"return":[0,18,20,16,2,22,3,4,5,6,7,26,9,10,11,13,29,30,27,31,8],similarli:[0,20,22,3,4,6,28,8,27,11,31,26],applic:25,although:[19,22,3,28,26,11,12,30,31,29],serv:[15,26],happen:[0,18,20,3,5,8,11,12,29,30,31,26],autobox:3,sse:7,factori:2,pxactorclientbehaviorbit:4,enotify_threshold_force_found:31,page:21,getvertic:5,focusvehicl:3,bool:[0,27,20,2,22,3,4,5,26,9,10,11,29,8],book:3,precis:[30,5,1,8],wheel:[4,17,22],express:[3,32,12,8],profound:3,meshpos:20,massscal:31,solverbodydata:29,markdirti:22,zangl:28,unbreak:22,eowns_memori:2,failur:[6,1,29],crack:31,sqhitbuff:3,arrai:[20,22,3,4,5,26,28,12,30,31,29],reporterror:[0,7],humanoid:28,done:[2,5,6,28,8,9,11,12,30,26,24],explos:26,explor:[19,24],terraindesc:30,gettiredrivablesurfacetyp:4,physxcommon_x64_renam:10,parent:[11,28,22],massmatrix:28,push:[3,31,29,22,8],getpvdconnect:4,shortest:[28,22],unregisterseri:2,pxconstraintsolverbodi:29,explod:31,wheelradiu:3,physxgpu_:1,regard:[0,6],offlin:[30,5],consult:[3,11],refus:17,centimet:[3,31,7],slope:[17,8],pathto30repxfil:12,staticstructur:26,direct:[20,32,22,3,5,6,8,27,28,30,31,26],bug:[30,21],rigidactor:7,addcollisioncapsul:4,ecrab:11,unfortun:[3,27,11,29,8],locost7:3,sdkroot:7,conceiv:3,pxfindoverlaptrianglemeshutil:[4,13],filtershaderdata:[27,26],same:[3,4,5,6,13,8,15,26,28,30,31,0,1,7,11,12,16,18,20,21,22,29,27],prior:[3,4,7,8,27,28,30,32],overlaphitbuffers:4,pxgeometryqueri:[5,25],discard:[27,15,22,26],condit:[20,1,22,6,12,30,31,32],minimalist:19,pxarticulationcach:25,interrupt:[11,32],mmaxdroop:3,hasnextpatch:31,nxrai:27,etire_long_slip:4,groups2:27,guidelin:17,expos:[3,27,20,28,7],hand:[18,19,22,3,7,12],identifi:7,flippedcontact:31,necessarili:[31,8],sample_fre:11,readallproperti:2,cull:[30,20,14,26],obstacl:[30,25],obvious:[3,8],advancecoeffici:31,ship:27,size_t:[18,12,2,7],setrigidbodyflag:[4,31,29],classtyp:6,drift:[28,22],down:[25,13,8,7,31,17],overlapsphereshap:27,has16bittriangleindic:5,digit:3,nxcontactpairflag:27,pxsimulationfiltershad:11,mani:[18,27,22,3,4,6,7,8,9,10,11,12,30,31,26,24],stair:8,eclutch_slip:4,computegeneralizedmassmatrix:28,area:[23,5,6,8,14,26],subtl:31,pxclothcollisiondata:4,toolkit:5,larg:[27,20,21,22,3,4,5,28,7,8,9,10,11,12,14,29,30,15,26,24,31],howev:[18,20,21,22,3,4,5,6,28,7,8,27,23,11,12,14,30,15,29,31,32],distribut:[17,32,1,25,7],attempt:[20,22,3,6,8,11,30,15,31],snippethelloworld:25,limitpair:22,attach:[22,3,4,28,7,26,11,12,31],invok:[3,27,8,26],pendulum:3,eremoved_shape_trigg:[11,6,13],pxdefaulterrorcallback:[10,7],win:[16,1,22],corpor:32,collisiongroup:27,gener:[30,28,25,31,7],standard:[18,22,3,6,7,28],getnbcontactblocksus:30,you:[0,18,1,22,4,5,6,28,7,26,9,10,11,13,29,30,27,15,8,24,31],eenable_active_actor:29,roll_bar:3,isparticlesystem:13,pretti:[3,23],jacobian:28,pxvehicledrivedyndata:3,mine:[11,31],mind:[3,24,22],inner:[3,30],thephys:5,nxgetcookinglib:27,tether:4,updat:[30,28,25,31],overlap:[31,25],movement:[20,11,29,28,30,8],best:[31,25],want:[22,5,6,7,26,27,11,30,31,29],fundament:[9,27,11,22],setconstraintflag:22,pxmeshgeometryflag:[20,5],pleas:[22,3,4,5,23,13,26,27,10,12,14,29,30,31,16,24],fluid:27,messag:[0,18,19,4,7,11,15],pxcontrollerfiltercallback:8,persist:25,efre:22,hadinitialoverlap:[4,20],product:[20,21,3,6,14,32],getsuspensionforc:[3,4],gphysic:[3,15,26],flushqueryupd:26,repeat:[3,30],reorgan:4,templat:[27,2,13],how:[27,20,21,2,22,4,5,28,13,8,9,11,30,15,29,17,31],largest:8,etransmit_scenequeri:0,linkpos:28,inalloc:2,polyhedron:5,ask:[11,21,31],unnatur:[17,8],alignextradata:2,setwheelrotationspe:3,sum:[3,23,28,31],removerefer:[18,6],nbwheelqueryresult:4,mtoeangl:3,mcrabbodi:11,experi:[3,30],screenshot:[19,8],sub:[3,18,11,27,30],equickhul:13,scalabl:21,namespac:[0,4,2],were:[18,27,20,2,3,4,5,6,28,26,9,11,12,31,29],pxarticulationjointreducedcoordin:[28,16],serializ:[4,17],substitut:[27,32],pxplanegeometri:5,peculiar:29,coher:[3,6,26],demand:6,parentpos:28,prioriti:[27,6,22],getnbwheel:4,cach:[28,25],ispenetr:20,pxcreatebasephys:[10,5],eplane_shift:5,ccdfiltertest:4,valu:[30,28,31,7],good:[0,19,20,21,22,3,28,32,8,11,30,26],einvalid_paramet:7,fold:4,subsequ:[18,22,3,4,6,8,27,29,30,31,26],elbow:22,softwar:[27,32],userprofilercallback:0,debug:[25,7],cycl:[3,22],pxvehicle4wenable3wdeltamod:[3,4],confus:[4,8],wheelmoi:3,frictiontyp:4,flatter:3,manipul:[27,6,29],rowscal:[30,5],mbatch:4,halfheight:[27,5,8],drivabl:17,heightsampl:30,pxps3config:4,nxcapsuleshapedesc:27,pxcontrollerbehaviorflag:8,routin:[27,5,29],eprevent_climbing_and_force_slid:8,pxcontactmodifypair:31,drivetargetangl:28,gamepad:3,advantag:[18,21,3,4,6,8,9,28,30,15,29,31],hopefulli:3,pxrigidbodi:[4,26,29,9,30,31,16],flatten:31,edrive_limits_are_forc:22,"export":[3,12,2],quantiti:[20,22],phxsx:12,suffix:1,island:[30,31,29],troubleshoot:25,licens:[25,17],pxtoler:7,imag:19,miscellan:15,nbmateri:[30,5],dist:20,pxjointlimitpair:22,findappobjectfromid:12,shdfnd:4,pxscenequeryimpacthit:4,pxraycastqueryresult:[3,4],exist:[0,20,21,22,3,4,7,8,27,11,30,31,26],enon:[17,22],vice:[30,11],hierarchi:[5,17,7],echannel_steer_right_control:4,ehas_drive_force_limit:22,nbcontact:31,overlapshap:26,cast:25,emax_num_engine_channel:[3,4],graph:[3,18,22,7],setdrivenwheel:3,rather:[18,20,1,22,3,4,13,8,9,28,30,15,26,24],thread:[31,25,7],iqueri:4,"case":[30,28,25,31],customtypemetadataobject:2,mediat:3,scanforobstacl:6,bigger:[31,29,8],eall:[0,10,28],getdebugrender:27,appar:31,bqthread:4,dynamicstructur:26,processor:11,msprungmass:3,eequal:22,puttosleep:[4,29],solvehint:22,emax_nb_title_char:[3,4],flavor:[3,29],snippetvehiclenodr:17,closestpoint:20,einflation_incremental_hul:[16,13],startindex:20,removeobstacl:8,cudacontextmanagerdesc:[15,6],level:[19,20,1,22,4,5,7,8,27,28,30,31,29,17],pxcontrollerhit:4,frustrat:31,momentum:[3,31,21,22],speak:[30,11,8,26],recogn:7,getshap:[11,5],spu:[27,25,17],idbuff:12,gettirefrict:[3,4],those:[18,28,2,22,3,5,6,23,7,8,10,11,15,29,31],decad:8,reduct:22,coefficientmatrix:28,remark:[3,4],emax_nb_drive4w_analog_input:4,nthread:4,pxpvdsceneflag:0,press:3,stuck:[3,30,8],pure:[3,22],getcoefficentmatrixs:28,pxrigidbodyext:[4,5,29,27,28,30],maxraycastsperexecut:4,snippetraycastccd:31,struct:[0,2,22,4,26,27,10,11,15,29],commoninit:28,filtergroup:11,mstrength:3,tiresurfacetyp:4,setphasesolverconfig:4,onreleas:4,offset:[30,28,31],gettolerancesscal:[15,6],mpvd:[0,10],python:16,customtypepropertyinfonam:2,pxconstraintinvmassscal:22,pxgetphys:12,einitial_overlap_keep:4,substanti:[4,22],tireforceshaderfn:3,nbcolumn:[30,5],circl:[3,5],gsteervsforwardspeeddata:3,subtract:[23,30],dll:[21,1,25,15,16,17],ongo:18,depend:[0,18,20,16,1,2,22,3,4,5,6,28,7,26,10,11,12,29,30,31,8],surfacetirepair:3,loss:[3,32],reconnect:17,wall:[31,25],lost:[0,4,5,29,12,15,31],pxclothstretchconfig:4,tradit:31,elev:[3,8],shaperenderactor:5,penal:3,epost_solver_veloc:31,computevelocitydeltafromimpuls:[31,29],maxraycasthit:4,jitter:31,jump:8,weak:3,edisable_active_edges_precomput:[30,5],escal:14,gcarpadsmoothingdata:3,wors:[28,5,6],minresponsethreshold:22,worri:31,getnbcolumn:5,einternal_has_impuls:31,likewis:4,recipwheelradiu:3,work:[18,27,20,16,1,22,3,4,6,28,7,8,9,10,11,13,29,30,31,26,24],issubordin:2,pxplane:7,dereferenc:11,word:[27,31,8,7,26],safe:[18,22,3,6,7,8,11,30,31],buf:[4,5,26],createheightfield:[30,5,13],collid:[18,3,5,29,27,11,31,8],usercallback:26,along:[20,16,2,22,3,4,5,28,13,8,27,11,12,29,31,26],primari:[18,19,1,28,12,31],parallel:[18,27,22,3,4,6,29,9,11,30],horsepow:3,seamin:31,camber:3,cookconvexmesh:5,configur:[30,28,25,31,7],recordmemoryalloc:10,side:[18,19,20,3,5,25,7,29,27,12,15,17],consid:31,idiom:7,"const":[2,3,4,5,6,13,8,10,14,26,28,30,31,0,7,11,18,20,22,23,29,27],pxscenequeryfilterdata:[4,13],pxrigidbodyflag:[4,31,29],bend:4,unpos:3,projecttoa:22,createcollectionfromxml:[4,12],getactivescen:4,internaldriveiter:28,address:[23,27,2,8],echannel_handbrake_control:[3,4],othercompletiontask:18,pxsolverbodi:29,power:[3,28],cloth:[27,25,17],sink:[31,5],funtion:4,addclient:0,touchdist:26,mytrimesh:5,impact:[0,20,22,4,8,11,14,29,30,31,26],densiti:[3,29,5,27,26],someth:[0,3,5,18,8],eremoved_actor_1:13,eremoved_actor_0:[6,13],modern:21,bodi:[28,25,31,7],off:[1,22,3,5,8,11,31,29],ith:5,scenedesc:[15,6,31],smp:30,lack:[3,28,8],pxsweepbuff:[4,26],batch:[30,25,17],emax_nb_vehicle_typ:4,shell:[20,29],surfac:[20,22,5,27,28,30,31,16,17],uservalue1:30,uservalue0:30,heavier:[31,22],unregisterrepxseri:2,emine_link:11,flip:[31,5],decrement:[3,6,13,7,29],upward:[3,8],settireforceshaderfunct:3,iskindof:[2,7],gtirefrictionmultipli:3,plugin:22,"int":[18,4,7],pxfixedsizelookupt:3,econtact_default:[3,27,11],mlatstiffi:3,user_releas:13,px_float_point_precise_math:1,constrain:[22,3,29,10,28,12,31,8],aspect:[18,30,5,6,2],handler:[0,7],list:[0,18,20,3,4,29,27,11,12,32],inf:[20,26],settessel:8,pull:[3,22],pxgpuloadhook:[16,17],ekil:[30,11],pxmeshqueri:[5,25],createreducedcoordinatearticul:28,pxvisualdebugg:[4,13],easier:[3,10,22,13,30],clientid:4,wheelcentrecmoffset:3,removeaggreg:11,makefil:[16,1],pxrigidbodydata:29,px_delete_repx_seri:2,remain:[20,22,3,5,28,7,8,11,29,31,26],pxi16:30,museautogear:3,troublesom:8,convex:31,onc:[18,3,6,13,8,28,12,29,30,31,26],pxvehiclewheelssimdta:3,room:4,introduc:[3,4,5,28,13,8,9,11,29,30,31,16],pxtolerancesscal:[0,3,5,7,10,31,16],sensit:[9,11,7],pxarticulationflag:28,each:[3,4,5,6,13,8,9,15,26,28,30,31,1,7,11,12,18,20,21,22,29,27],context:[18,19,2,3,4,6,15],physxmetadatagener:2,localframe1:22,guid:25,favor:[27,28,6],desc10row:5,pxregisterarticul:10,pointdist:20,pxcontactpairhead:[18,31],globalpos:[31,29],outward:[3,20],resist:[3,28,22,29],createrenderobjectfromshap:5,simplest:[18,19,3,26,9,11],standalon:[16,5,13],sens:[18,20,22,3,7,12,31],signifi:22,idea:[3,30],variat:[9,30,31,2],etwist:[28,22],mbatchcomplet:4,edetect_ccd_contact:31,nasti:22,pxpairflag:[3,4,6,27,11,31,16,24],domain:21,hfactor:5,sample_alloc:11,pointrejectangl:3,edeprecated_32_compat:13,mcamberatrest:3,invmassscal:[31,22,29],createmateri:[12,29],tessel:25,fetchresultsfinish:18,x64:16,penetr:[31,25],symmetr:27,dyn:[4,29],suspforceappcmoffset:3,dispatch:[30,15,6],eenable_one_directional_frict:4,xml:[12,4,27,1,2],repxupgrad:4,gpadsmoothingdata:3,applycach:28,setglobalpos:29,benefit:[18,15,21,6,22],adaptive_forc:27,invisiblewallheight:8,modul:[30,25],egpu:15,registerdeletionlisten:4,far:[18,20,22,3,5,13,23,11],slowli:[17,31,12,8],customtypeautogeneratedmetadataobjectnam:2,gavoidedphysxtyp:2,skill:3,nxsphereshap:27,createcach:28,background:[6,12],vc140:[16,1],mthread:4,emax_num_vehicle_typ:4,recov:[30,12,8],userraycastresultbuff:[3,4],analogu:2,busi:32,pxfilter:11,pxsolvercontactdesc:29,liabl:32,travel:[3,8],promot:[3,32],wheelcenteractoroffset:3,action:[3,11,8],generateexamplefil:12,inward:3,matter:[3,8],echannel_accel_control:[3,4],halfext:4,cell:[5,26],mfrontrearsplit:3,pxvehicle4wenable3wtadpolemod:[3,4],underw:4,px_serial_ref_kind_pxbas:2,rectangular:[30,5],end:[18,20,1,22,3,28,8,27,11,30,31,29],createexclusiveshap:[3,5,28,13,29,27,11,12,7,30,16],startup:[19,5,17,25],getrigidactorshapelocalboundslist:26,restrict:[18,20,3,6,28,8,27,11,29],incorpor:22,echannel_tireload:[3,4],gather:30,stiff:[3,4,28,22],echannel_norm_tire_lat_forc:[3,4],othershap:11,einvalid_oper:7,pxscenequerycach:[4,13],win64:19,servic:32,hasnextcontact:31,pxarticulationjointtyp:28,effector:28,blade:29,physxcooking_:1,pxheightfieldflag:[30,5],pxdefaultmemoryinputdata:5,slide:[28,31,22,8],current:[2,3,5,6,8,9,10,15,26,28,30,31,0,1,7,11,12,24,18,21,22,29],version:[19,16,1,4,5,7,8,27,21,13,30,31,26],further:[18,20,3,4,5,6,13,26,23,11,12,15,29,31],awar:[20,3,6,8,27,30],alon:[18,30,21],wake:[18,4,16,22,29],data:[25,7],awai:[22,23,5,29,31,8],settargetorient:28,awak:[22,29],untest:20,patchstreamcapac:15,descriptor:[27,4,5,2,8],getupdirect:8,removeactor:29,pxmetadataflag:2,pxactivetransform:16,test:7,nx_timestep_vari:27,pxrigidstat:[9,30,29,22,26],acquir:6,arm:22,earlier:[20,3,4,5,28,30],spin:31,pre:[18,2,4,5,7,26,27,12,16],roli:29,ccdthreshold:31,std:[4,31],pin:15,code:[2,4,5,8,9,15,26,28,30,31,1,7,11,16,17,19,20,21,22,25,29,27,32],pxactor:[27,4,7,8,9,11,12,13,31,29],stl:27,discret:[21,4,8,9,15,31],pxvehicleextensionapi:2,half:[20,5],system:[28,25,31,7],expir:8,crash:[30,12,11,1,6],enabl:[0,19,20,21,22,3,4,5,28,7,8,27,10,11,12,13,14,30,15,29,31],reject:[3,11,5],pxbvhstructur:[25,17],pxhitbuff:[27,26],halv:5,susplinelength:4,wheelconvexmesh:3,show:[20,2,22,3,4,5,6,29,9,11,12,30],freeli:[3,22,8],nx_assert:27,etank_wheel_7th_from_front_right:4,lockclothreaddata:4,pxdeletionlisten:4,pxgeometri:[20,5,7,29,27,11,26],answer:0,shadow:20,disguis:3,ekinemat:[11,29],targetveloc:22,tirelongitudinaldir:4,px_def_bin_metadata_extra_item:2,longitudinaltireforc:3,srcwheel:3,unregistercustomclassrepxseri:2,pxrigiddynamiclockflag:29,pxheightfieldgeometri:[30,5],embed:2,twice:[3,12,26],grow:[3,15],mactor:2,"50m":30,x86_32:1,onoriginshift:22,dai:1,portion:[3,27,31,30],getnbcontrol:8,defin:[20,21,2,5,25,7,8,28,14,15,17],pxvehicledifferential4wdata:17,global:[22,3,4,5,8,27,10,11,14,29,30,15,26],interpol:[3,30,22],pxsimulationstatist:24,createscen:[27,15,6,31],tirecontactactor:4,contactstreamcapac:15,platforms_as_obstacl:8,contactoffset:[27,31,8],pxraycasthit:[3,4,20,26],refilt:27,pars:[31,13],pvdclient:17,need:[2,3,4,5,6,13,8,10,26,28,30,31,0,1,7,11,12,16,18,20,21,22,29,27],"default":[2,3,4,5,6,13,8,10,15,26,28,30,31,0,1,7,11,12,16,20,21,22,29,27],swing2:28,getrigiddynamicactor:3,pxconstraintconnector:[27,22],climb:25,due:[20,21,22,3,8,23,12,29,30,31,26],caution:[3,30],adjust:[22,3,4,7,23,30,15,31],customclassrepxseri:2,singl:[19,20,22,4,5,25,28,13,27,11,7,30,15,17,31],startaft:6,maxnbcontactdatablock:[18,30],terrainwidth:30,nx_bounce_threshold:27,permit:[18,32,22,3,6,13,29,30,31,16],numchassismesh:3,drive:[31,25],planetoid:8,pxrigiddynam:[27,16,2,22,4,5,7,8,9,11,12,29,30,31,26,17],setvisualizationparamet:[22,14],pxclothfabr:4,neg:[3,30,20,5,31],window:[18,20,1,2,7,26,27,12,15,16],across:[3,12,26],settorestst:3,entir:[1,22,3,5,6,26,27,28,12,30,15,29,31],analysi:[3,1],constraintbuffercapac:15,center:[20,22,3,5,29,30,31,8],esteer_right_control:4,"new":[2,4,5,6,13,8,9,15,26,28,30,31,7,11,16,18,19,21,22,23,29,27],synchron:[30,17,12,25],eventnam:0,eventu:[3,11,6,29],e5th_from_front_right:4,wrong:[3,30],oncontactmodifi:31,imbu:29,uncontrol:3,ecompute_convex:5,mem:4,travers:[22,26],debugg:[30,25,14,7],startpos:3,pxcachealloc:29,particl:[4,17,25],met:[32,6,22],pxconstraint:[4,22,29],collectforexport:4,believ:[3,21],pxmat44:[4,7],einitial_overlap:4,typic:[18,22,3,5,6,28,7,8,11,12,29,31,26],hull:[30,15,5,13],updatevehicl:4,linker:10,knock:29,copyright:32,sacrific:[28,29],click:19,getpvddatastream:4,buildtool:1,traction:3,drove:3,compris:[19,28,21,6],fdani:[4,26],minimis:3,enorm_tire_aligning_mo:4,mtype:3,substepcount:18,afford:3,upper:[3,4,28,22],deltaheight:30,interleav:[18,6],centroid:20,pxclosevehiclesdk:[3,4],rule:[18,20,2,3,6,7,8,11,12,29,31,26],bouncethreshold:29,itself:[0,18,3,8,27,31],exit:[18,20],pxvehicleupdatecmasslocalpos:3,blend:3,plain:[4,12],real:[0,21,3,5,8,9,28,12,31,29],former:[3,4,27],wai:[3,4,5,6,13,8,9,26,28,30,31,0,1,7,11,12,18,19,20,22,29,27,32],physxvisualdebuggersdk:1,pxreal:[20,22,3,4,5,26,9,28,30,31,29],recip:2,elimits_are_forc:22,shiftorigin:23,differenti:17,bump:[3,31,8],visibl:[18,21,3,6,8,30,26],ediscrete_respons:4,trigeom:11,big:[23,12,8],somewher:[3,8],bring:[3,21,22],bia:3,geom0:[20,29],geom1:[20,29],childpos:28,ebrake_control:4,edisable_ccd_resweep:31,pxshapeflag:[4,11,31,14,8],bit:[18,1,3,5,7,8,27,11,29,30,26],runprofil:6,variou:[0,18,21,22,3,4,8,27,12,30,29],audio:[3,21],field:[28,7,8],pxcontrollerfilt:8,balanc:[3,4,31,6],estiff:4,px_assert:[5,2],rais:[18,4,13,29,11,30,15,24,31],simultan:[22,3,4,6,11,12,31],actuat:[28,21,17],technolog:[3,27,21],wobbl:29,settransform:29,submiss:6,setdriv:[28,22],rigidli:22,influenc:[0,3,4,29,28,31],minehead:11,placement:[3,30],sqdesc:3,broadphas:[30,11,5,24,16],multipl:[27,20,21,22,4,5,25,28,7,29,9,11,30,31,17],raii:6,actual:[3,29,10,12,30,8],backward:[3,4,20,7],"import":[0,18,21,2,3,5,6,7,8,27,10,12,13,30,31,29],reset:[18,3,4,29,28,26],emax_num_title_char:4,unrealist:[3,30,8],enotify_threshold_force_persist:31,eprevent_climb:8,situat:[18,22,3,6,8,27,12,30,31,29],maxedgelength:8,cours:[3,21],storag:[3,5,7],px_def_bin_metadata_item:2,gain:[3,26],assert:25,principl:7,practic:[28,25,31],obsolet:13,disconnect:[0,3],renderbaseactor:5,longitudinalslip:[3,4],"public":[0,18,1,2,4,6,7,10,31],state:[21,22,23,4,7,8,27,28,30,15,17,31],abil:[3,22],pxsetprofilercallback:0,setvisualdebuggerflag:4,pxbounds3:[20,5,26],setmassandupdateinertia:30,graphsizei:3,contribut:[22,8],flt_max:22,maximaljointforc:28,batchqueri:3,individu:[20,22,3,4,29,28,14,30,31,24],prismat:[25,28,17],nbhit:[4,26],rotat:[22,5,7,29,27,28,8,17],setwheeldata:3,coefficientmatrixs:28,north:29,"_build":21,enotify_touch_found:[11,16,31],singleton:16,infinit:[20,22,29,9,31,8],act:[0,22,3,4,29,28,31],effici:[3,5,6,8,11,12,14,29,31,26],underli:[22,8],pxraycastbuff:[4,26],hadblock:26,getlin:14,px_check:[18,2],encount:[22,3,4,8,30,26,24],pxvehicleupd:[3,4],pxactortypeflag:[4,13],deploy:1,mymateri:[27,11,5],dcc:12,live:12,ragdol:[30,11,28,22,31],echannel_engine_drive_torqu:[3,4],goe:31,count:25,insid:[18,20,3,5,7,27,10,11,31],aboxshap:5,setrunonspu:4,deposit:28,newli:[4,6,13,26,11,12,30,31,16],previou:[3,4,13,26,12,29,30,31,16],px_serial_object_id_invalid:12,lie:[5,22],setdigitalsteerright:3,extrem:[3,30,15,20,31],efront_left_wheel:4,impulsiveforc:29,lib:[12,16,1],sequenti:[3,18],glass:20,perform:[25,7],popul:[25,20,17,22],srcmetadata:12,pxvehicleconcurrentupdatedata:3,understand:[21,22,3,29,11,30,8],client:[9,16],e4th_from_front_right:4,windmil:29,sim:[25,17],meshsizeperformancetradeoff:[16,30,5,13],fabric:[4,13],nextitemset:31,nxuseroutputstream:27,capac:[18,30,15],pxvehicleisinair:4,total:[3,4,8,23,11,30,15,26,31],setqueryfilterdata:[3,26],stepper:9,sprungmasscoordin:3,fall:[0,3,5,28,8,11,15,29],revisit:[11,5],pxdefaultmemoryoutputstream:5,steer:[17,22],refactor:4,secondari:3,primarili:7,erestitut:22,bvhdesc:26,maxhit:[4,20],solverprep:22,malloc:[18,27,12,7],blockdist:26,onsleep:[18,6,29],pxgetgaussmapvertexlimitforplatform:[16,13],common:[2,22,4,5,25,7,8,28,29,30,16,17],richer:[16,13],es16_tm:[30,5],overlai:14,tankdrivesimdata:3,nx_ignore_pair:27,emesh_multipl:20,extrud:[13,8],enorm_tire_lat_forc:4,numrow:5,obtain:[20,22,5,6,13,29,10,28,12,7,30],tire_type_norm:3,grip:3,px_serial_align:2,acquirerefer:[13,7],optim:[0,20,1,22,3,6,26,27,11,12,30,31],etank_wheel_5th_from_front_left:4,pxallocatorcallback:[18,27,2,7],intempbuff:2,resolv:[2,22,3,4,5,6,8,12,30,31,29],swept:[3,4,20,26],unit:25,throughput:6,hitflag:20,pxphysicsapi:[27,1],getindexbuff:5,scienc:3,groups1:27,computekinematicjacobian:28,unmodifi:20,susptraveldirect:3,pxbaseflag:2,shrink:[18,30],inputdata:12,static_cast:[28,29,12,26],elapsedtim:8,subclass:[18,6,26,27,10,31],defici:27,glanc:3,regardless:[4,5,7,8,11,29,26],scale:[30,31,25,14,7],eenable_pcm:[30,15,31,13],endian:12,cosin:8,pxinitextens:[27,10],nxshape:27,snowman:29,intro:3,lose:[3,11],angularmotionveloc:29,dispos:10,pxvehicledrive4w:4,auto:25,minheight:30,pxphysicsinsertioncallback:[30,10,13],trackoutstandingalloc:[18,10],abbrevi:20,convexbox:11,vehicl:[21,1,22,25,30,15,17,31],resolverefer:2,furthermor:[18,4,21,6,14],pxi32:4,extern:[20,4,29,28,12,8,17],reduc:[31,25],merchant:32,behav:[3,28,31,22,29],pxconvexmeshcookingtyp:[16,13],anywai:8,inaccur:3,pxvec3:[20,22,3,4,5,6,7,8,27,23,28,29,30,15,26,31],door:[31,22],doom:8,bypass:[11,29,26],detail:[2,4,5,6,13,9,10,17,26,30,31,7,11,12,24,18,20,21,22,23,29,27],etireload:4,next:[18,22,3,4,6,7,8,27,12,13,29,30,31,26],packjointdata:28,likelihood:[3,31],elimit_en:22,correlationdist:29,latenc:[3,30,6],edisable_mesh_valid:5,shrunk:30,tiresurfacemateri:4,obstaclehandl:8,compart:17,counter:[3,4,29],limb:[31,8],associ:[22,3,4,5,6,7,29,15],pxvec2:7,seek_end:12,broken:[6,22],renam:[27,4,16,13],pxqueryflag:[4,17],hook:[27,10],radiu:[22,3,4,5,8,27],dostuff:4,slightli:[22,3,4,5,28,8,27,11,30,31],advers:15,shapetransform:30,vanilla:3,alter:[3,28],pxvehiclesetupdatemod:3,incur:[30,15,28,20,29],bulk:10,throttl:3,dampingratio:3,spike:[30,24],snapshot:[3,28,12],maintain:[21,22,3,7,8,28,12,29,26],penalti:[20,5],pxcomputeheightfieldpenetr:[20,16,13],unlockwrit:6,pxrigidactor:[20,22,4,5,7,17],snowbal:[29,8],"80kg":3,straight:[3,8],mysimulationeventcallback:22,node:22,prop:2,tune:25,cb2ca:28,eanalog_input_thrust_left:4,heighfield:[25,15,17],"function":[2,5,25,7,8,28,30,31],artist:3,push_back:[4,31],eprecise_sweep:[4,20],angular0:22,nx_skin_width:27,gettriangl:[20,5],createcollect:[4,12],overal:[30,31,6,29],unbal:26,produc:[20,16,2,22,3,4,5,7,26,9,10,28,12,29,30,15,8,31],myob:4,setscenepvdflag:0,evalu:5,mandatori:[31,6,26],outputstream:4,"2nd":[3,8],last:[20,16,3,4,13,8,9,29,31,26],getscen:[4,11],cct:[30,25],erigid_dynam:[11,12],immut:[11,15,29],popular:[30,7],"transient":[3,15],crouch:8,eis_releas:2,resultbuff:4,pxserializationregistri:[3,4,12,2],imaginari:28,runtim:25,concaten:5,encod:[3,30,11,22],setgrav:29,impuls:[28,29,31,22,8],maxheight:30,glean:11,light:[3,31,22,29],epostfilt:17,pxcctnonwalkablemod:4,contactimpuls:[31,13],getphysicsinsertioncallback:[30,10,5,13,26],glpopmatrix:7,lockread:6,oncomshift:22,thousand:15,getfootposit:8,usergethfshap:5,trail:8,meshweldtoler:5,affect:[3,4,6,13,29,28,30,15,16,31],copyinternalstatetocach:28,econtact_event_pos:31,abp:[11,15],instal:[27,16,12],expect:[3,5,26,27,10,11,12,30,31,29,24],px1dconstraintflag:22,detect:25,efront_right:[3,4],setprojectionlineartoler:22,mytrimeshshap:5,extent:[3,11,5,22,8],drivesimdata:3,coeffici:[27,28,29],omit:[10,20,1],pass:[18,22,3,4,5,6,28,7,8,27,10,11,12,13,29,30,31,26],dash:30,instuct:5,assess:27,jointforc:28,pxcudacontextmanagerdesc:[15,6],extend:[25,8],own:[0,18,21,1,22,3,6,28,7,8,27,11,12,13,31,29],nextcontact:31,weaker:3,stage:[3,6,1,30],pxconstraintflag:[4,22,13],elast:29,aforement:8,guserphysxtyp:2,structur:[18,20,2,22,3,5,6,28,26,27,23,11,29,30,31,16,24],setscenequeryfilterdata:[3,4],linux:[19,15,16,12,2],pxarticulationreducedcoordin:[28,16],record:[0,18,21,22,3,4,26,28,12,14,29],termin:[4,28,31,10],iscomplet:18,figur:[3,6,12],param:5,guess:3,eshap:12,stateless:22,uniform:[27,31,29],src:[3,2],fifo:6,bodybtoworld:22,row2:5,row1:5,row0:5,retain:[32,29],dimens:[3,30],imperfect:[3,8],mnbvert:5,vnext:22,ineffici:3,ppu:4,strategi:[3,30,8,5,26],debri:[30,11,5],thrust:3,sweep:[30,31,5,25,8],pxprofilezon:16,repx:25,wheelsdyndata:3,gettyp:[4,5],advis:[3,32,2],getbroadphasecap:11,setprismaticjointflag:22,linearmotionveloc:29,contactdist:[28,31,22,7,29],meshactor:11,deviat:3,protect:[6,7],report:25,infrastructur:22,telemetri:17,mnbmateri:12,baseflag:2,notori:8,setjointtyp:28,anticip:3,scientif:3,unbound:[4,17,25,22],save:[2,3,4,5,26,27,10,11,12,30,29],convexvert:5,suspend:[3,11],"3rd":8,setcontactmodifycallback:31,outsolverbodydata:29,legal:[18,3,6,29,27,28,31],trg:3,tri:[0,30,20,5,22],http:[0,3],wikihow:3,reason:[18,22,3,6,7,8,27,10,28,12,30,31,29],match:[3,4,28,12],strong:[3,11,28,8],setgeardown:3,seidel:[25,21,17],samplecustomgrav:[29,8],outcom:29,retriev:[20,22,5,28,7,8,11,29,30,16,24],llindex:28,fluiditi:31,efront_right_wheel:4,smoother:[3,31],migrat:[25,21,17],"enum":[16,3,4,5,7,8,11,13,29,26],edeleted_shape_oth:13,string:[12,7],warranti:32,midphas:[5,26],priorit:22,govern:[3,18,5],retest:3,sampl:[31,25,7],bitmask:26,exchang:27,orient:[20,21,22,3,5,28],exportextradata:2,rectangl:5,registerseri:2,care:[22,3,6,7,29,9,11],vulner:26,crab:[11,6],dispar:20,simplist:[3,18,11],mmaxnormalisedload:3,pxserial:[3,4,12,2],thin:[20,31],cleanup:4,introduct:25,oper:[0,3,4,5,6,28,7,26,23,11,12,30,31],invdt:29,computejointforc:28,mirror:13,has16bitindic:5,restitut:[22,25,8,28,31,17],clutch:17,mcrab:6,effort:[3,31,8],snippetimmediatemod:29,ellips:28,xmlmemoryalloc:2,camberangl:3,open:[3,12],scheme:[3,29,27,11,30,31,16],econstrain:8,columnscal:[30,5],schema:12,apart:[18,31,5],port:[0,27,8],wiki:3,equantize_input:5,getwheelshapemap:4,hfshape:5,setup:[3,4,6,26,11,25,16,17],scalar:[28,31],quantifi:3,px_profile_zon:16,behind:[3,20],outermost:30,result:[27,20,1,22,4,5,25,28,7,8,9,11,13,30,15,29,17,31],nextpatch:31,nbheader:29,pxdelayloadhook:[16,17],innov:21,sudden:22,max_num_tire_typ:3,prototyp:[3,11,12],particularli:[18,22,3,5,29,11],erevolut:28,intrus:1,mask:[11,8],slip:3,copi:[18,22,3,4,5,28,26,11,12,31,24],exce:[18,22,3,29,30,15,31],addition:[0,18,21,3,6,28,8,11,26],pxbvh33midphasedesc:[16,30,5,13],heightfieldgeom:20,mass:[5,25,7,28,30,31,17],tangentialspr:4,nxshapedesc:27,might:[18,16,1,3,5,6,13,8,11,12,14,29,30,31,26,24],squar:[3,5,29],shapea:[27,12],shapeb:[27,12],pxbatchquerypostfiltershad:26,restoffset:31,fragil:18,maxconstraint:22,shift:[25,31,5,17,22],"__file__":[18,7],pcm:25,capsuledesc:27,demonstr:[18,22,3,4,5,8,12,15,29,31],benefici:[18,1],instanti:[3,4,5,12],ebvh33:5,classic:[21,29],colscal:5,eout_of_memori:7,project:25,imbal:3,spheric:[25,20,17,28,8],isserializ:[4,12],vcurrent:22,eassume_no_initial_overlap:[4,20],amd:7,isvehicleinair:4,attribut:[22,3,5,6,26,11],counterpart:[3,4,8],e9th_from_front_left:4,relev:[3,27,29],estatic_aabb_tre:17,fclose:12,setdriveveloc:[28,22],group2:[27,29,26],group3:26,group1:[27,29,26],bake:27,group4:26,computelinearangularimpuls:31,ebroken:22,convexdesc:5,cleartorqu:[18,4],motion:[22,3,5,28,8,9,11,30,31,29],samplebuff:5,rubberi:22,displai:[18,15,12],everi:[0,21,22,3,7,29,10,28,12,31,24],inspect:[0,3,21,12],veloc:[31,25],pxvehiclewheelsdyndata:[3,4],pxvehicledrive4wsmoothdigitalrawinputsandsetanaloginput:3,displac:[3,8],hullpolygon:5,getnam:6,sampleallocat:6,chassiscmoffset:3,remedi:3,unsign:[27,4,12],pxvehicledifferentialnwdata:3,e7th_from_front_left:4,sequenc:[22,25,7,29,9,30,17],hamper:3,pxdeserializationcontext:2,getsusplinedir:4,longer:[22,3,4,13,8,27,10,11,12,31,29],nx_max_angular_veloc:27,impos:[3,18],tempt:[9,8],summari:[3,24],createcloth:4,skip:[20,8,12,13,26],vertex:[30,15,25],pxbatchquerydesc:[3,4,26],setsuspforceapppointoffset:3,angular1:22,partak:22,assist:[30,22],unavoid:3,setrigiddynamicflag:11,pxvehicleautoboxdata:17,setlat:3,squish:29,fail:[22,3,5,6,8,10,12,30,31,29],physxsampl:5,speed:[0,1,3,5,7,26,27,11,29,31,8],pxcapsuleobstacl:8,lowest:[3,6],found:[18,19,20,2,22,3,4,6,28,7,8,11,12,15,26],pxvehicleautogeneratedmetadataobjectnam:2,"1st":8,createserializationregistri:[3,4,12],run:[18,19,21,1,2,22,3,4,6,28,26,9,11,12,14,29,30,15,16,24,31],still:[18,1,3,4,5,6,23,13,8,10,29,30,15,16,31],espr:22,pxvehiclewheelqueryresult:[3,4],eenable_stabil:30,defer:[3,7,26],euser_releas:4,pxbvhstructuredesc:26,collision_flag_ground:3,greater:[20,3,5,8,9,31],appli:[20,21,22,4,5,25,28,7,8,11,14,30,15,17,31],filtershaderdatas:27,choos:[3,4,31,22,30],overst:21,pxprismaticjointflag:22,mnumratio:3,unlock:[28,17],pxspatialoverlapcallback:27,reiniti:7,pxbroadphasecallback:[27,11],nearer:3,nxjointlimitsoftdesc:27,nxu32:27,easi:[19,21,3,7,8,10,12,31],isparticlebas:13,spring:[3,4,28,27,22],enhanc:[25,17],exhibit:[15,28],enodr:4,resiz:[30,8,26],objecttofileimpl:2,createpruningstructur:26,mitsubishi:3,pxvehiclecomputetireforc:3,physx_:1,e8th_from_front_right:4,addarticul:28,nbvert:5,pxtrianglemeshdesc:5,getwakecount:29,deadlock:6,terraindata:30,elarge_triangl:5,pxphysic:[0,22,3,4,5,7,26,27,10,11,12,13,31,29],depth:25,plateau:3,abov:[20,32,2,22,3,5,6,13,8,27,11,29,30,15,26,31],getconjug:28,control:[28,25,31,7],solid:[20,29],somewhat:[22,7],addcollect:[4,12],emax_distance_en:22,terrain:[3,30,5],egraph_type_engin:4,populateregion:11,region:[25,20,17],transforminv:28,realli:[3,11],absent:15,x86_64:16,dumpbinarymetadata:[4,12],describ:[18,20,21,3,5,28,26,27,10,11,12,30,31],getdiffdata:3,interlud:3,pxcreatecontrollermanag:8,pxvehicleantirollbar:17,enumtyp:2,pxrepxobject:2,transit:[3,4,13,29],normalrejectangl:3,distanc:[20,5,25,8,28,30,15,17,31],step:[30,31,25],matrix:[27,28,7,29],isrigidbodi:13,cheaper:[3,31,22],nbvelocityiter:29,dramat:29,vari:[3,30,20,22],px_call_conv:3,relativepos:5,keep:[22,3,5,6,7,8,23,28,12,13,29,15,26,24,31],pxtoolkit:[4,5,10],pxrevolutejointflag:22,inform:[25,7],thick:[5,13],"char":[0,18,2,3,4,6,7,10],pxvehiclecomputesprungmass:3,magnitud:[22,3,4,29,31,8],customtypeextensionapi:2,instead:[20,16,1,22,3,4,5,28,13,8,27,10,11,12,29,30,31,26],arbitrari:[20,3,8,27,11,31,26],forbidden:18,nv_use_debug_wincrt:1,special:[0,20,21,2,22,3,4,5,28,7,26,27,11,12,29,31,8,32],interest:[25,28,17,31,14],prep:15,nxwheelshap:3,pxcontactpair:[4,31,13],typeid:22,ebuild_enabled_commit_dis:16,mcudacontextmanag:6,pxrevolutejointcr:22,forcelimit:[28,22],topolog:[5,26],etight_bound:[30,5],construct:[22,3,4,5,7,29,28],script:[16,1,2],physxcor:7,actorinputdata:12,freespin:22,elong:3,breakpoint:7,sign:[30,10],etank_wheel_2nd_from_front_right:4,appguid:6,myobserv:4,inherit:[2,22,4,5,7,27],shorten:26,autowak:[4,29],braketorqu:3,erear_right:[3,4],sixtieth:9,ontrigg:[18,11,6],gaug:3,ever:[3,18,30,29],back:[0,18,20,22,3,4,6,28,8,27,11,29,30,15,16,31],intern:[30,28,25],fseek:12,pxquat:[3,4,5,7,27,28,30],easiest:29,execut:[18,19,22,3,4,6,26,27,11,30],overwritten:2,mrearleftrightsplit:3,etank_wheel_1st_from_front_left:4,lowlimit:28,angular:[22,3,29,28,30,31,16],chang:[21,22,25,7,8,28,30,31,17],linearveloc:31,invinertiascal:[31,29],chanc:[23,31,6,26],amplitud:3,edriv:4,fals:[20,1,2,4,5,13,8,9,11],otheractor:31,parentattach:28,nxutillib:17,wheelgraphposi:3,concept:[25,28,21,17,29],pxident:[3,4,5,7,29,12,30],ediff_type_ls_4wd:3,assum:[22,3,5,6,28,26,11,12,30,31,29],black:30,epre_solver_veloc:31,math:25,shapegroup0:27,shapegroup1:27,recomput:[3,30,26],signatur:[10,20,16,22,13],few:[19,3,8,30,31,26],createobject:2,pxsceneflag:[30,4,15,17,31],serializecollectiontoxml:[4,12],ralli:3,pxvehiclesetbasisvector:3,collision_flag_wheel:3,getnbbroadphaseregion:11,isspher:27,getcontactpoint:31,safeti:[3,4,20],pairhead:[18,31],zoneend:0,recommend:[0,28,16,1,22,3,5,23,7,26,27,10,11,13,30,31,8],machin:6,versa:[30,11],mod:22,pxcontactpairpoint:31,pxconstraintshadert:22,esuppress_eager_scene_query_refit:16,pname:2,within:[18,21,22,3,4,5,6,28,7,8,11,12,29,30,15,26,31],patholog:26,setgearup:3,versu:3,centrifug:28,lockwrit:6,plu:[2,7],procur:32,right:[19,32,3,5,7,8,12,29,26],integ:[30,5,7],thumb:[3,31],regist:[0,18,2,4,5,13,29,10,28],backfac:20,intel:7,linearvelocityactor0:31,consider:[28,25,31],gimbal:[28,22],"switch":[1,3,4,5,6,8,11,12,30,16],maxsweepsperexecut:4,filterflag:4,ebest_poss:3,pxvehiclemodifywheelcontact:3,pxscenequeryhit:[4,13],inter:[31,6],dynamictreerebuildratehint:[30,26],francisco:8,abl:[18,20,22,3,4,5,6,28,13,8,10,11,12,29,30,15,16,31],ehandbrake_control:4,shader:[15,17,31],actormemory128:12,propag:[3,23,28],etank_wheel_2nd_from_front_left:4,procedur:[3,12],pxsolverbodydata:29,graphic:[31,25,7],lateralslip:4,react:[0,3,18],util:[18,3,5,7,27,10,28,12],modif:25,arithmet:[15,21],revolut:[25,28,17],pxmeshoverlaputil:[4,13],nxusercontactreport:27,pxcloseextens:10,refer:25,arrang:[24,4,5,6],setanaloghandbrak:3,recoveri:[28,25],sweepshap:26,einternal_contacts_are_flip:31,scene:[28,25,31,7],runclang_:2,totallinkcount:28,reveal:3,setmass:[3,30,29],impli:[32,12],timestep:[22,3,4,29,27,8],echannel_clutch_slip:[3,4],time:[30,28,25,31,7],meshgeom:20,button:3,getmass:3,bitwis:26,setswinglimiten:28,just:[18,22,3,5,28,7,8,10,11,12,29,30,31,26],capsuledens:5,sphereshap:4,trip:3,volum:25,word0:[3,4,11,27,26],deal:[9,30,31,12,8],word2:[3,26],createbatchqueri:[3,4,26],continu:[25,7],constantblocks:[3,4,11,27,31],equal:[3,30,6,22,29],nxusertriggerreport:27,cctfiltercb:8,aggreg:[30,25,17],equat:[22,3,5,7,29,26],max_id:12,vehiclewheelqueryresult:[3,4],pxvehiclesuspensionsweep:3,boxshap:4,user:25,altern:[0,18,22,3,4,5,7,8,10,11,12,13,29,30,31,16],egpu_compat:[30,15],gone:[27,16],rememb:[3,12,8],should:[2,3,4,5,6,13,8,9,14,15,26,28,30,31,0,1,7,11,12,16,20,21,22,29,27],hope:8,element:[3,30,7],gcook:[27,26],pictur:8,heapcapac:15,aren:31,triggershap:11,mminnormalisedload:3,larger:[22,3,4,5,29,23,28,30,31],elock_angular_x:29,efix:28,getenginerotationspe:3,convert:[2,4,5,7,8,27,28,30,26],pxcollet:12,eenable_two_directional_frict:4,ewheel_omega:4,markserializedmem:2,familiar:[28,22],navig:[3,8],converg:[28,22,29],unexpect:30,pxbinaryconvert:[4,12,2],pxvehicl:2,ecooking_perform:5,myccdcontactmodif:31,overlapaabbshap:27,faster:[0,20,22,3,4,5,8,11,12,30,26],d6joint:22,eeasi:8,check:[31,7],accordingli:[3,23,13],brake:[3,22],recurs:[18,8],paragraph:26,pxrigiddynamicflag:11,wider:[3,29],minpositionit:29,sort:[3,31,8],col:30,decemb:[25,17],snippetvehiclewheelcontactmod:3,setcontinu:[18,6],collis:25,finish:[3,9,6,18,26],prefer:[18,1,4,5,6,7,10,14,30],bear:[3,22],eabp:[30,11,16],pivot:30,posetosweep:20,wrap:6,whose:[20,22,3,5,6,28,7,8,11,31,29],mysteri:1,createcontrol:8,though:[18,3,7,8,27,23,11,30,29],trianglecount:20,estat:[4,17],launch:[0,4,19],friendli:29,unrespons:3,categor:30,"float":[22,23,5,7,8,27,30,31],numbound:26,pxtriggerpairflag:[11,6,13],getactivetransform:16,low:[3,5,11,29,28,30,15],pxqueryreport:4,lot:[5,29,27,30,31,8],nxcollisiongroup:27,statist:[25,17],concurr:[18,3,6,11,12,31],grant:6,eheightfield:11,highpeakpress:3,log:[28,6,7,26],pairflag:[3,4,11,27,31],child:[28,22],lod:[3,6],settodefault:5,race:[3,6,1,22],classifi:12,realiti:6,daili:1,flushscenequeryupd:26,delai:[25,17],etank_wheel_9th_from_front_left:4,crabpo:6,getscenequeryfilterdata:4,pxoutputstream:[4,12,2],contactbuffercapac:15,etank_wheel_9th_from_front_right:4,frequenc:[3,18,30,29],roll:[17,8],heart:8,actorb:[27,29],indirectli:[6,29],role:[3,11,22],mmaxsteer:3,pitfal:30,block:[22,4,5,25,8,9,11,30,17],clamp:[20,3,26,28,30,31],specif:[20,16,1,22,32,4,28,7,8,11,13,29,30,26,17],eactor_ax:14,buffers:[20,31,26],idx:2,interact:[30,28,25,31],corioli:28,two:[18,20,21,2,22,3,4,5,6,28,8,27,11,12,29,30,15,26,31],predict:[3,29,8],issu:[31,25,7],pxcloth:[27,4],robot:[28,21,1],euv:[20,26],clang:[16,2],conflict:[3,1,22,8],entiti:[11,21,8],law:[3,21,22,29],filter:[20,22,25,8,28,15,17,31],feedback:28,graviti:[4,25,7,8,28,15,17,31],unifi:[30,4,16,13],pxdefaultsimulationfiltershad:[27,11,15],evelocity_chang:[3,29],applydamag:26,queryflag:3,strength:[3,28,22],emtd:[30,20],lag:22,stronger:[3,27],treasur:11,pxcreatejointconstraintswithshad:29,frequent:[30,31,26],ba2w:22,consist:[20,22,3,4,5,6,7,27,10,28,12,30],lockparticledata:4,edynam:[4,17],setbraketorqu:3,undocu:7,bodya:22,sharp:5,request:[18,22,6,7,29,30,15,31],desir:[18,22,3,26,5,28,8,11,29,30,31,16],setgeometri:5,lighter:[31,22],segment:[20,5],maxdist:[20,26],alongsid:19,ricochet:31,pxpvdinstrumentationflag:[0,10,30],lighten:30,snippetpxbvhstructur:26,techniqu:[3,28],pxraycastbuffern:4,snippethellogrb:15,grid:[30,11,5,26],overlapcapsuleshap:27,nxphysic:27,wrapper:5,direct3d:7,quat:5,middlewar:6,quak:8,snippetvehiclecontactmod:3,physxgpu:[10,1],moi:3,incred:3,period:[3,28,29],quad:[20,22],setlimitcon:22,cab:18,dread:22,addlocalforceatpo:29,substeps:18,breakabl:[28,22],through:[30,28,31,7],can:[2,3,4,5,6,13,8,10,14,15,26,28,30,31,0,1,7,11,12,16,24,18,19,20,21,22,23,29,27],car:[31,8],"0xcd":2,convexmemory128:12,pxarticulationlink:[9,28,31,22,26],eanalog_input_accel:[3,4],pxvehiclesdk:1,sweephit:20,contactpoint:[31,13],setprecisesweep:8,expend:3,ellipt:[28,22],wildli:3,inlin:27,compon:[18,22,25,8,30,15,29,17],incallback:2,sight:[20,26],"18m":30,year:8,shutdown:[19,5,17,25],certain:[28,22,3,4,23,7,29,10,11,30,31,24],expens:[18,20,22,3,5,7,26,23,28,30,31],linearimpuls:31,pxserialobjectid:[4,12],compos:[21,7],high:[21,1,22,3,26,11,15,29,31],privat:2,additionali:30,px_physx_core_api:29,fastest:5,heightfield:25,ecct_slid:8,pxextendedvec3:8,exert:29,job:[18,6],approx:3,difficult:[3,27,15,30,8],fidel:18,edisable_simul:[11,29],pxscene:31,pxfoundat:[18,3,13,10,7,16],outlin:[30,28],emesh_ani:[20,13,26],customclasstyp:2,commensur:3,anyth:[29,26],raycasthitbuffers:4,intuit:[3,22,26],reentrant:31,vertexlimit:5,paramet:[28,31,7],pxcontrollernonwalkablemod:[4,8],espu_raycast:4,requisit:2,measur:[3,30,24],pxarticulationjointdrivetyp:28,setrevolutejointlimit:22,setparam:5,euler:28,droop:3,meet:[3,30,20,6],normal:[20,3,5,29,30,31,26],customproperti:2,extradatastreams:31,levit:8,querymemori:3,duplic:[3,5],silent:30,pxsweepcallback:[4,26],setmot:[28,22],neither:[22,3,8,27,31,32],degrad:[11,26],foundat:[21,25,7,27,16,17],argument:[18,20,22,3,4,5,7,26,11,13],extractcontact:[4,31,13],opt:3,whether:[20,32,1,2,22,3,4,5,6,13,26,10,11,12,15,29,24,31],thereaft:[7,26],camberangleatmaxcompress:3,sooner:31,forward:[18,8,9,28,30,17],hasimpuls:31,einternal_has_face_indic:13,famili:3,reoder:30,pxconvexmeshgeometryflag:[30,5],statu:[20,5,6,26],tangenti:[28,31],gettiredrivablesurfacecontactpoint:4,snippetcustomprofil:0,repxobject:4,getbinarymetadata:2,either:[3,4,5,6,13,8,10,26,28,30,31,0,1,11,12,16,18,20,21,22,29,27],createbinaryconvert:[4,12],whatev:[28,7],redistribut:25,evehicle_type_drive4w:4,invest:7,senderrormessag:4,snippetvehicletank:17,pxdiagon:[27,29],gfoundat:[0,10,15],mdownratio:3,revers:[3,20,5,6],old:[16,4,5,12,13],getnbtriangl:5,heavi:[3,15,22,29],revert:3,setactorpairflag:27,deseri:[4,11,17,2,7],addobstacl:8,px_physics_vers:[0,10,16,7],createserialobjectid:12,getphysxfoundationdllnam:[10,16],spot:3,vertexindic:20,echeck_zero_area_triangl:5,trimeshid:12,distro:21,sizeof:[2,5,11,26,27,28,30],heurist:2,decid:[21,2,3,6,8,11],getnbwheelgraph:4,pxvehicletiredata:[4,17],reclaim:[18,30],ramif:4,jointveloc:28,red:30,"class":25,pvddatastream:0,pxplatform:13,obb:5,rel:[22,3,5,29,28,12,30,15,31],deriv:[0,22,28,12,31,32],disabledpropertyentri:2,pxcollect:[4,12,2],exploit:3,slow:31,hardcod:26,steerangl:[3,4],mdampingratezerothrottleclutchengag:3,hfdesc:5,avoid:[0,18,2,22,3,6,23,13,8,10,28,31,26],writedata:2,eprofil:[0,30],overcom:[3,30,28,29],boundari:[20,12],eanalog_input__accel:3,meantim:4,border:5,suffer:[23,30,11,22,8],regress:30,logic:[18,7,26,27,11,30],childattach:28,reus:[3,18,30,8],ekeepbia:22,compat:[20,1,3,4,7,12,30,15],compar:[20,3,5,13,8,27,30,15,16,31],maccuraci:3,wheelomega:3,vector:[28,25,7],pxvehiclewheelssimdata:[4,17],erear_right_wheel:4,einternal_error:7,getposit:8,rev:3,entireti:12,pxconstructsolverbodi:29,setsimulationeventcallback:18,e7th_from_front_right:4,between:[3,4,5,6,13,8,9,10,26,28,30,31,7,11,12,16,18,20,21,22,29,27],slot:6,macro:[10,16,7],myfiltershad:3,invalidatecach:8,store:[0,18,20,1,22,3,4,5,6,28,7,26,27,11,12,29,30,15,16],etank_wheel_3rd_from_front_right:4,doblock:4,getstepoffset:8,px_max_nb_wheel:[3,4],watch:[0,3],summar:8,distinguish:[4,6,26],maximum:[18,20,1,22,3,4,5,28,8,27,11,29,30,15,26,31],learn:[19,21],bouncethresholdveloc:27,linear:[8,22,3,29,31,16],cumbersom:23,"short":[3,30],present:[19,22,3,29,12,30],denomin:3,pxvehiclegearsdata:17,pxarticulationjointbas:16,pxmeshmidphas:[30,5],plane:[15,25,31,7],preparedata:22,"_debug":5,target:[18,1,2,22,3,4,5,29,9,10,28,12,31],hash:[27,29],pvd_host:[0,10],contactcount:31,connect:25,sweepresultss:3,midphasedesc:[30,5],pxfindfaceindex:20,enotify_touch_ccd:31,pxvehiclechassisdata:3,pxserializerdefaultadapt:2,pxmateri:[3,5,7,29,11,12],give:[20,22,29,27,11,12,31,24],gcudacontextmanag:15,refitbvh:5,metric:31,take:[18,27,16,22,3,4,5,6,7,8,9,11,21,12,13,29,30,15,26,31],prevent:[18,22,3,6,7,8,30,31,26],wheelssimdata:3,shaderdata:3,setdominancegrouppair:29,road:3,dual:2,setwakecount:[18,4,29],nbtri:5,togeth:[20,21,22,3,28,26,11,12,31,29],candid:[11,26],emit:[15,5,31],pxgeometrytyp:5,vehactor:3,springi:22,aligndata:2,massless:29,pxmat33:[4,7],nutshel:31,plan:0,would:[18,20,22,3,4,28,13,8,27,11,12,30,31,29],pxconstraintbatchhead:29,orphan:12,forget:[3,10],eanalog_input_brake_left:4,essenti:[3,27],skinwidth:[27,16],dedic:[4,20,8],cookingparam:5,nx_default_sleep_energi:27,vmotion:22,thecook:5,wheelcoord:3,lead:[21,3,6,8,28,12,29,30,31,26],obj:2,leaf:5,highlight:[4,16,13],serializationregistri:3,"5x5":5,advanc:25,eimpact:4,layout:[3,5,12,7],emax_nb_sampl:3,initialst:28,smallest:3,height:[30,8],phor:3,leav:[3,6,8,12,31,26],muscl:22,spherepos:20,pxvolumecach:4,setsphericaljointflag:22,until:[18,20,22,3,4,5,6,7,8,9,11,12,29,30,31,26],counteract:28,pxinputstream:12,conjunct:[3,30,15,20,31],externalrefer:22,emax_num_drivetank_analog_input:4,him:8,echannel_jounc:[3,4],faceindic:5,settypepairfrict:3,constitut:31,pxd6joint:[4,28,16,22],mfoundat:10,deadlin:30,hit:[30,25],central:31,trough:30,rendersphereactor:5,ediff_type_ls_frontwd:3,physxfoundation_:[16,1],localframe0:22,maxvert:5,queu:[6,26],setangulardamp:29,batchhead:29,pxcreatepvd:[0,10],getlinearlimit:[16,22],manner:[3,5,8],mai:[2,3,4,5,6,13,8,9,10,15,26,28,30,31,1,7,11,12,18,20,21,22,23,29,27,32],anyhit:13,readbuff:5,set:[2,3,4,5,6,13,8,10,14,15,26,28,30,31,0,1,7,11,12,16,18,20,22,29,27],numid:12,spatial:[22,3,13,26,11,7],fifth:3,load:[4,5,25,29,27,12,30,17],nbtriangl:20,nxm:28,esubmarin:11,aaaabbbbccccdddd:30,independ:[3,4,6,8,11,12,30,15,29],calcul:[20,22,3,4,6,29,28,30,31],middl:8,design:[21,1,22,25,7,28,30,17],triangleindex:5,degtorad:8,unscal:30,packag:[3,6,22,8],typedef:[3,22],ps3:27,latter:[3,30,12],wheeltodis:3,host:15,peak:3,receiv:[18,22,3,4,6,13,26,27,12,31,29],sweepdist:26,becom:[3,4,6,29,27,11,12,30,31],getpolygondata:5,"try":[22,3,6,8,30,24],artifici:[30,11,8],nx_dyn_frict_sc:27,remov:[22,5,25,7,8,28,30,17],materialindic:5,getbroadphaseregion:11,samplesubmarin:[11,6,31],indefinit:[30,29],risk:31,orthogon:[5,22],follow:[2,3,4,5,6,13,8,9,10,14,15,26,28,30,31,0,7,11,12,16,18,20,22,23,29,27,32],setsubstepcount:3,group:[20,4,29,27,11,26],straightforward:[3,23,21],vertic:[20,3,5,8,30,15],sleep:25,engin:[19,21,22,8,9,11,30,27,17],eptr:2,setanaloginput:3,swap:[18,26],pxsetphysxgpuloadhook:10,pxsetphysxcommondelayloadhook:10,userraycasttouchbuff:[3,4],hardwar:[30,6],pdf:3,e1st_from_front_right:4,dofstart:28,mgearuppress:3,osx:[19,2],quicker:3,ecapsul:5,setmaxdist:22,quitissignal:4,ani:[19,20,21,22,32,4,5,28,8,9,11,30,27,15,29,17,31],free:[18,19,22,3,8,10,28,30],interfac:[18,27,21,2,22,4,25,7,26,9,10,28,12,13,14,16,17],consum:31,anatom:[28,22],pxvehiclemetadataobject:2,binarymetadata:12,d3dmatrix:7,anb:30,setradiu:8,place:[19,20,21,1,22,4,28,7,8,11,12,30,31,29,17],pxdefaultfileinputdata:[4,12],fix:[5,25,8,28,30,15,17],wait:[18,4,6],eremoved_shape_oth:11,unless:[18,22,3,13,29,11,12,31],bodyaworldoffset:22,inerti:27,write:[18,2,3,4,6,8,9,10,12,30],etank_wheel_3rd_from_front_left:4,pxcreatedynam:12,nbbodi:29,eface_index:[20,13],reproduc:[32,12],smaller:[3,4,5,8,30,31,26],mypvdclient:0,edrive_en:22,sethalfheight:8,edouble_sid:20,unlik:[20,21,29,27,28,15,31],exempt:11,skeleton:27,nor:[27,32,22,8],mmaxhandbraketorqu:3,unbatch:26,presiz:[18,30],snippetcustomjoint:22,isparticlefluid:13,asynchron:[30,25,17,22],littl:[3,30,29,6,8],api:25,allocateobject:2,pxvehiclewheeldata:17,friction:[25,31,17,8],me485:3,pxvehicledrive4wwheelord:[3,4],addaggreg:11,manual:[0,22,3,6,13,8,10,11,12,29,26],nqueri:4,horizont:[3,8],valid:[20,3,4,5,8,11,12,29,30,26],pxcreaterepxobject:2,convexshap:11,breakag:17,wakeup:[18,4,16,29],occasion:[6,26],facilit:[28,12,26],triangles:20,numtrisperleaf:[30,5,16],evolut:8,ackermann:17,"\u03c0":22,dotouch:4,pxboxcontrol:8,ecommit_disabled_build_dis:26,helper:[20,3,5,6,8,27,11,12,29,30,31,26],sound:[3,8],eprefilt:17,suppress:[4,20,29,22,26],pxscenequeryhittyp:4,differ:25,max_num_surface_typ:3,quantit:24,rai:[27,20,21,31,26],deepli:[30,29],physxcommon:[17,7],sanitybound:30,kilogram:3,quantiz:25,physxmetadata:2,bodyatoworld:22,rendercapsuleactor:5,studio:[19,16,1,7],wheelmateri:3,remaind:3,pxerrorcallback:[27,7],freed:[3,18],unambigu:3,nearest:26,pxserializ:4,legaci:[3,13,8,15,16,31],spec:30,written:[27,28,32,2,26],worth:3,inread:2,releaseexclusiveshap:13,render:[19,5,25,7,8,14,30,17],pxdefaultallocatorcallback:7,aconvexshap:5,occupi:15,link:[3,5,28,9,10,11,12,31,16],edisable_grav:29,gdispatch:15,lock:[25,28,17],pxparticlebas:4,aboxactor:5,start:[18,19,20,2,22,3,4,5,6,28,13,8,9,10,11,12,29,30,27,31,26],addlocalforceatlocalpo:29,inbound:28,edisable_preprocess:22,belt:[31,29],signal:[18,4],gnbqueryhitsperwheel:3,size:[0,18,20,2,22,3,4,5,7,26,9,10,12,14,30,27,15,8,31],submit:[18,4,27,6],geom:[3,20,5],createtrianglemesh:5,snippet:[1,5,25,28,30,15,31],kepler:15,els:[3,4,29,27,26],validateconvexmesh:5,mydellisten:4,consol:[6,12,7],pxvisualdebuggerext:[18,4],getsiz:5,pxvehicledrivablesurfacetyp:3,overwrit:6,pose:[20,22,5,25,8,28,15,17,31],dissip:[3,31],margin:[3,20],doe:[18,16,22,4,5,6,28,7,8,11,21,12,13,14,29,30,15,26,17,31],rough:[3,29],registercustomclassrepxseri:2,eenable_activetransform:16,model:[20,22,25,7,8,28,30,29,17],querycli:4,later:[3,4,5,28,8,11,29,15,26],pxdefaultpvdfiletransportcr:0,anymor:[16,4,26,8,12,29],setlimit:[18,30,28,22],deltaangularveloc:[31,29],pxtriangl:20,pose1:[20,29],pose0:[20,29],pose2:20,getfaceindex1:31,getfaceindex0:31,significantli:[0,22,3,4,5,26,11,12,30,29],setlocalpos:[3,4,5,30],isreleas:13,einequ:22,maxoverlapsperexecut:4,pxdefaultpvdsockettransportcr:[0,10],pxsweepbuffern:4,enableshapeinscenequerytest:11,pxgdynamicsmemoryconfig:15,tip:8,memorybuff:2,compress:[3,4,31,5],samplenorthpolebuild:5,presenc:[30,4,6],escene_query_shap:[11,8],thank:8,arbitrarili:[20,22],pxdeletioneventflag:4,quot:3,px_max_f32:[3,30,28,22],cloud:5,doll:[6,28,12,22],veh4wactor:3,getlatslip:3,deprec:[4,17,25],nvcloth:16,pxrenderbuff:14,proce:[3,18,5,26],perspect:6,encapsul:[30,11,5,16],mspringstrength:3,empti:[2,4,13,29,11,8],pxcreatecontactconstraint:29,curv:17,snippetsplitfetchresult:18,issleep:[27,4,29],wherev:28,why:8,unnecessarili:[31,8],think:[3,8],usersweephitbuff:4,get:[0,18,16,2,23,4,5,6,13,8,27,10,11,12,29,31,26],sluggish:17,thing:[18,3,8,12,30,31],kinet:29,factor:[22,3,5,8,27,28,14,29,30,26],quarter:[31,5],getgpudispatch:[15,6],shrunken:30,invis:25,did:[26,29,27,11,31,8],instrument:[18,1,7],dot:[20,22,4,13,27,30,16],twist:[27,28,22],readextradata:2,wherea:29,unabl:3,startgearchang:3,eimpuls:29,gen:13,gettiredrivablesurfacemateri:4,formal:[3,4,12],themselv:[6,8],pxcapsuleclimbingmod:8,jointposit:28,disableshapeinscenequerytest:11,librari:[18,20,16,1,2,22,23,4,5,6,7,26,27,10,21,12,13,8],endif:[5,6],mark:[18,21,2,3,6,13,8,10,30],invert:8,box:[30,15,14,8],almost:[0,3,11,21,6],increas:[0,22,3,6,29,30,31,8],format:[18,19,3,4,5,7,12,30,31],scenequeri:17,pxphysx:4,lin0:22,lin1:22,constant:[22,3,28,8,9,11,29,26],sometim:[20,22,5,8,14,29,30,26],contactmodifycallback:31,"05_01":3,metr:3,mix:[6,5,1],pxarticul:[9,4,28,16],inaccuraci:22,slight:[3,30,28,8],mradiu:3,overload:[4,7],alwai:[20,16,1,22,3,4,6,13,8,27,10,11,12,29,30,31,26],mfrontleftrightsplit:3,temporarili:[3,27],pxcreatejointconstraint:29,meta:12,autom:3,modifi:[20,22,3,4,5,6,7,8,28,13,29,30,15,26,31],mnbthread:6,min:28,"long":[3,4,6,7,8,12,26],naturalfrequ:3,coulomb:29,getgrav:4,steep:[17,8],fiber:4,initi:[5,7,8,28,30,15,31],line:[18,20,16,1,22,3,4,5,6,7,26,11,12,14,29,30,8],know:[20,3,5,28,8,27,11],restart:20,whenev:[9,11,12,31,26],touch:[20,4,5,25,8,11,30,31,29,17],onccdcontactmodifi:31,closer:[23,26],dominancegroup:29,mcamberatmaxdroop:3,updateobstacl:8,fatalerror:[10,6,29],pxbase:[2,22,4,13,9,12,7],pxcreatecudacontextmanag:[15,6],vast:14,pxsweepqueryresult:3,highli:[21,22],cb2w:28,pxtypeinfo:2,again:[18,3,5,28,13,8,10,11,12,14,30,31,29],proxim:[31,21],linkid:28,hold:[20,22,4,6,8,27,12,31,29],hole:[31,5],newton:3,pointfromuv:20,nx_adaptive_forc:27,pxobstacl:8,lower:[3,28,5,31,22],lend:[27,31],setdrivetorqu:3,maxim:25,pxagain:26,setdigitalsteerleft:3,player:[3,30,31,8],prompt:[19,16,1],px_max_real:29,section:[18,3,4,5,6,28,7,8,27,11,12,29,30,26],stai:[27,11,21,29],createcollectionfrombinari:[4,12],ediff_type_open_rearwd:3,pxtrianglemeshcookingresult:5,pxcontactrecord:29,unregistercustomclassbinaryseri:2,unregist:2,px_def_bin_metadata_base_class:2,leg:[22,8],critic:[3,6,12],damag:[3,30,31,32],wheeltorqu:3,settwistlimiten:28,defaultmateri:[3,11],asleep:[3,29],setseparationtoler:28,echannel_wheel_omega:[3,4],holder:32,numshap:11,stiffen:22,plug:12,jth:5,declar:[4,11,22],setchildpos:28,jointconnectionhandl:0,getbehaviorflag:8,maxit:20,cluster:[11,6,26],codepath:8,snippetdelayloadhook:10,strut:3,featur:[21,22,5,25,8,28,30,15,17,31],singular:22,efast_inertia_comput:5,releas:[31,25,7],msubmarineactor:[11,31],domin:[4,17,25,22],fetchcollis:18,occur:[22,3,6,29,28,30,31,26],susplinedir:4,buffer:[28,31],ebend:4,cpu:[21,6,7,29,11,30,15],ejoint_fram:22,cpp:[19,5,2],fread:12,suggest:[3,5],pxvehicleautogeneratedmetadataobject:2,pertain:10,resetfilt:[4,5,29],samplebridgesset:8,subtyp:7,post:[9,18,22,7,26],"true":[0,18,20,1,22,3,4,5,6,28,26,9,10,11,12,29,27,15,8,31],msize:2,getgeometri:[27,5],"50th":30,pxinputdata:[4,12],halfsideext:8,pxregisterlegacyheightfield:13,prematur:12,computegeneralizedgravityforc:28,ezero_area_test_fail:5,rang:[20,21,2,22,3,4,5,26,27,28,30,31,29],setdrivetyp:28,past:[30,8,26],destin:12,oncontact:[18,11,6,31],overview:25,pxaggreg:11,scenequeriesupd:26,pxqueryfiltercallback:[27,4,20,13,26],transpar:[4,20],runonspu:4,body0vel:22,pxconvexflag:[30,15,5],getdistancelimit:16,thi:[20,1,2,5,25,7,8,28,14,30,15,17,31],slerpdriv:22,gtreasureactor:11,waitforquit:4,amount:[18,27,20,22,3,5,8,9,11,14,30,15,29,24,31],plai:[3,11,29,22,8],meter:[3,7,29],getglobalpos:[4,5,7,29],pxcook:[4,12,13,7,26],ccdmaxpass:31,frame:[21,22,3,4,5,28,8,9,11,12,29,30,15,26,24,31],reconsid:29,decoupl:[3,27,30],magenta:8,task:[27,25,17,30],offer:[18,19,3,28,26,27,23,11,12,14,30,29,24],px_min_heightfield_y_scal:30,samplenorthpol:[5,8],setu32collisionflag:27,getrenderbuff:[27,14],px_binary_serial_vers:12,notifi:[18,11],slower:[3,29,7,8],acut:30,geometricerror:22,acapsuleactor:[27,5],mnbactor:12,suppresstrianglemeshremapt:[30,5],getverticesformodif:5,addactor:[3,4,5,29,27,11,26],share:[28,7],eswept_integration_linear:4,setsteerangl:3,nbcontactdatablock:[18,30],addcollisionspher:4,mstepsiz:[9,27],kinemat:[28,25,31],everywher:[10,8],thu:[20,5,8,11,14,29,30,31,26],capsul:30,detachshap:[11,29],srcasset:12,pxu16:[3,5,22],properti:[2,22,25,28,27,11,14,15,17,31],pxcreatephys:[0,18,5,13,27,10,7],bullet:[20,26],combo:4,linvel:29,convexcollect:12,often:[0,18,22,3,5,6,28,7,8,11,29,30,31,26],swingdriv:22,swing:[3,28,22],illustr:[0,18,20,19,3,4,5,8,12,29],updatez:18,divid:[3,4,5],nxscene:27,taken:[22,3,6,29,15,8],"while":[18,19,21,22,3,4,5,6,28,8,9,11,12,14,29,30,27,31,26,24],treat:[20,22,3,29,31,26],updateb:18,updatea:18,pxrigidactorext:[3,5,28,7,26,27,11,12,13,29,30,16],graphsizex:3,enough:[3,11,8],autogener:2,num_vehicl:3,snippetjoint:22,pxdebuglin:14,nx_timestep_fix:27,approxim:[20,3,4,5,7,29,30,15,31],poor:[3,31,8],observ:[3,4,5,18,7],tire:[4,17],createbvhstructur:26,pxbatchqueri:[3,4,26],exhaust:[3,21],undrivable_surfac:3,mlatstiffx:3,handbrak:3,pxvehicledrivesimdatanw:3,sqresult:3,carefulli:[27,4,8],pxcontrol:8,cheap:11,anmim:18,energi:[21,22,3,29,27,28,31],simfilterdata:3,pool:[18,30,6],onobstaclehit:8,filtermask:11,explicitli:[20,4,6,29,27,28,12,14,16],submittask:6,allevi:30,setupsimdata:3,articulationlink:28,algorithm:[31,25],stick:[3,8],rpm:3,suddenli:[30,24,8],gvehiclescenequerydata:3,pxarticualtionlink:9,posesweptagainst:20,upright:[5,29],camera:[3,19],separationdist:31,pxgaussmaplimit:13,second:[19,20,22,3,6,8,9,11,12,30,27,31,29],tort:32,lean:3,three:[18,21,2,22,3,5,6,8,11,12,29,30,31,26],make:[2,3,4,5,6,13,8,9,10,14,15,26,28,30,31,0,7,11,12,18,20,21,22,29,27],formerli:[27,4],submitbatch:4,hitcount:20,conform:[3,6],setlinearveloc:[18,4,29],pxconstraintvisu:22,chassismateri:3,unintuit:22,pxscenequeryreport:4,pxheightfieldsampl:[30,5],syntax:27,"abstract":[4,6,7],pxshare:[17,13],hash_map:27,plagu:8,nbconstraint:29,drivetargetveloc:28,pxvehiclepostupd:3,simul:[28,25,31,7],solv:[22,3,4,8,28,29,15,16],alreadi:[0,19,22,3,5,6,8,11,12,30,26],correctli:[0,3,29,6,8],pxvisualdebuggerflag:4,mfronteftrightsplit:3,multithread:[25,21,17],pxsweephit:[4,20,26],sole:[3,4,29],major:[15,5,28,7,21],pxregisterheightfield:[10,5,13],tank:17,find:[20,1,2,5,7,26,11,12,30,31,29],fine:[3,4,30,8],const_cast:3,accessor:4,queue:[30,6],environ:31,materi:[5,25,28,30,31,32,17],linkag:3,pxoverlaphit:26,flushsimul:[18,30,29],trace:20,whole:[29,11,8,12,26],univers:22,createcustomclass:2,numer:[3,27,20,30,8],snippetgpuloadhook:10,inclin:3,cooker:[4,5],pxclientbehaviorflag:4,echannel_normalised_tireload:3,"16k":[30,17],pxmeshpreprocessingflag:[30,5,13],edeprecated_trigger_trigger_report:17,eaccel_control:4,physxvehicl:[4,1,2],pxsolvertyp:29,respect:[21,1,22,4,5,6,13,26,28,7,30,15,29,31],motor:[28,22],edeleted_shape_0:13,edeleted_shape_1:13,trigonometri:3,fire:[19,6],e9th_from_front_right:4,pxheightfieldformat:[30,5],handleoverflow:4,treasureshap:11,nxusernotifi:27,enableshapeincontacttest:11,theme:3,chassisshap:3,devic:15,number:[18,19,20,21,22,3,4,5,6,28,7,8,27,11,12,29,30,15,26,24,31],echannel_steer_control:3,temp:15,durat:3,mmoi:3,eengine_drive_torqu:4,fopen:12,pxbatchquerymemori:[4,26],getsimulationfilterdata:4,acapsuleshap:[27,5],consequenti:32,highest:[3,22],mcamberstiffnessperunitgrav:[3,4],tilt:29,slerp:22,removed_shape_oth:13,snippetprunerseri:26,eresolve_contact:4,samplesm:5,edg:[22,3,4,5,8,30,31],trimesh:12,aheightfield:5,worldvertex:30,highlimit:28,pxvehicleupdatemod:3,brief:[25,15,17],testccdfiltershad:31,wheelshap:3,sprung:3,ahead:28,glmatrixmod:7,px_delet:4,jounc:3,createconnect:[18,4],basetask:17,mention:[3,30,15,13,29],reinforc:21,setdigitalaccel:3,geometri:[30,25,31,1,7],clearli:[3,18],setclutchdata:3,pxdefaultalloc:10,memory128:12,maxiter:28,precomput:[30,5,26],serializecollectiontobinari:[4,12],postfilt:[4,26],tunnel:[31,5,16,8],files:12,enforc:[3,4,28,22],toggl:[0,3,1],html:[3,21],wheelsimfilterdata:3,youtub:3,physxgpu_x64_renam:10,adopt:3,pxd6drive:22,physx_64:16,aconvexmesh:5,rudimentari:21,sqrt:[3,5],articul:[31,25],pxserialobjectref:4,snrepxserializerimpl:2,moment:[3,31,22,29],part:[1,25,7,28,30,31],physxcommon_:1,radial:29,typenam:[18,7],oscil:[3,22],newjointposit:28,torqu:[25,28,17,22],pxconstraintproject:22,newsprungmass:3,error:25,"01f":[3,20,22],emodify_contact:31,suffici:[20,3,7,26,27,28,12,30,15,29,31],didn:12,effect:[27,20,16,22,3,4,5,13,8,9,29,30,31,26],constructor:[2,3,4,7,26,27],ramp:8,destructor:[12,22],pxpvdsdk:16,getnblin:14,slopelimit:8,geterrorhandl:0,fast:[1,3,26,5,8,27,31,16],torsion:3,creat:[31,25,7],e3rd_from_front_left:4,deleg:7,undefin:[20,22,3,4,5,8,28,12,26],setactorflag:29,minvelocityit:29,term:[22,3,5,28,30,31],px_max_flt:22,exceed:[20,28,22],turn:[18,21,1,6,29,11,15,17,31],delet:[18,22,3,4,6,8,27,11,30,26],depenetr:[30,20],removeloopjoint:28,eignore_initial_overlap:20,snippetconvert:17,bugfix:12,reloc:8,physxfoundation_x64_renam:10,px_max_num_wheel:4,nonwalkablemod:8,eerror:28,enorm_tire_long_forc:4,minimum:[18,20,21,22,3,26,27,29,30,16],gettimestamp:5,translat:[20,22,23,29,27,28,12,30],fricpair:4,tell:[3,11,5,31,26],specul:25,pximmediatemod:29,e3rd_from_front_right:4,outweigh:6,transform:[27,20,22,3,4,5,7,26,9,28,29,30,15,16],dof:[28,22],materialindex0:[30,5],materialindex1:[30,5],doc:3,length:[20,3,4,5,7,8,27,26],small:[20,22,3,5,6,8,9,30,31,29],px1dconstraint:22,solvertyp:[4,29],via:[18,20,22,3,4,6,7,26,27,14],gl_modelview:7,pxlightcputask:[4,6],pxconstraintsolverprep:22,vehicle18w:3,pedal:3,mimic:3,"_lignv3ptiq":3,generate_project:[16,1],filetoobject:2,cross:[29,8],hello:19,prepar:2,chassismass:3,commerci:8,swaptolowlodvers:3,samples1:5,transmit:0,top:[19,3,5,7,8,10,30],myrenderobject:29,implement:[18,20,16,2,22,4,5,28,7,8,27,10,11,12,13,30,15,26,17,31],trigger:[25,7],physxapi:[18,21],explanatori:18,toe:3,isrigidstat:13,let:[21,22,3,5,8,27,11,30,31,29],without:[19,16,2,22,3,4,5,6,28,13,8,10,11,21,12,30,31,26,32],blocking_sweep:3,too:[1,5,7,8,9,11,12,14,31,26,17],owner:[32,2],mode:[30,25,31,1],pxvec4:7,nv_use_static_wincrt:1,physapi:18,toi:[31,29],kei:[3,18,28,27],ecallback:11,mayb:[30,24,8],sourcewheelid:3,directori:[19,21,1,2,13,27,16],dictat:8,aheightfieldactor:5,suppli:[3,28,5,12,22],scroll:3,pxmax:30,getanyhit:4,categori:3,pxcontrollerdesc:8,aris:[3,23,11,32,29],instanc:[18,20,2,22,3,4,5,6,26,27,10,11,12,14,30,24],ride:[3,8],truck:3,pxrepxinstantiationarg:2,transfer:0,alpha:3,linearli:3,snippetvehiclescal:17,interchang:[5,13],sethalfforwardext:8,build:[28,25,7],cpudispatch:[15,17],jointdesc:29,"16m":30,tirelongforcemag:3,numvehicl:4,otherwis:[32,1,22,3,4,5,8,11,31,26],sweepdirect:26,gdelayloadhook:10,pxcontactstreamiter:[4,31,13],broadli:26,built:[0,19,21,1,26,25,8,12,30,31,16,17],mytelemetrydata:3,cudamanag:16,dstmetadata:12,satisfi:[3,12,22,29],pathtonewrepxfil:12,pxvehiclekeysmoothingdata:3,cmake:25,ifdef:5,normalisedtireload:3,tireid:3,watertight:31,kept:[20,29,22,8],space:[20,21,22,3,4,5,28,7,8,27,11,29,30,26],split:[25,15,5,17,22],reconcil:6,neutral:3,gettireloadfilterdata:3,esend_sleep_notifi:[18,29],huge:5,pxqueryhit:[4,13],stall:6,intersect:[20,3,5,26,27,11],linecolorlow:3,ejoint_limit:22,atrianglemesh:5,separ:[0,20,22,3,4,6,13,8,27,28,12,29,30,31,26],stale:26,github:21,getconcretetypenam:[2,7],disjoint:30,repxextens:4,gettwist:16,convexdescpolygon:5,process:[20,2,22,4,5,28,7,29,9,11,30,15,17,31],imposs:3,reson:29,e2nd_from_front_left:4,pxsimulationeventcallback:[18,22,6,13,29,27,11,31],some:[2,3,4,5,6,8,10,14,15,26,28,30,31,1,7,11,12,24,18,20,21,22,23,29,27],lambda:28,"3333f":3,sync:[4,8,30,15,26,31],miss:[3,27,31,2,30],myconvexmeshshap:5,hfscale:5,nbactiveactor:29,forinternalus:22,bounc:[18,22,3,5,8,31],bound:31,incident:32,addit:[1,2,5,25,7,8,28,30,15,31],eanalog_input_steer_left:[3,4],pxcollectforexportscen:4,edeprecated_body_joint_group:16,validatetrianglemesh:5,hovercraft:3,overlapobbshap:27,exclud:[29,26],implicitli:[4,12,22],addid:12,etank_wheel_8th_from_front_right:4,gravit:[3,4,29],uncaught:30,reliabl:[3,13],pad:3,opengl:7,isinair:[3,4],front:[3,20,7],pxsolverconstraintdesc:29,suspens:3,deform:25,thought:[3,30],hadhit:4,pxcapsulegeometri:[27,5],nbsolverbodi:29,begin:[3,18,31,5,30],physxextens:[2,22,3,4,13,8,12],pyramid:[5,8],immov:[28,22],bb2w:22,taskmanag:[27,25,17],aim:[3,30],pxmetadata:2,pxsolveconstraint:29,glpushmatrix:7,theorem:31,pxscenequeryext:13,aggress:3,"60th":30,organ:4,index:[20,2,5,25,8,28,30,17],gettetheranchor:4,comput:[28,25,31],attachshap:[11,12,7],relax:3,exclus:[2,22,6,12,30,16],convexgeom:11,decis:[3,30],bumpi:3,heightfielddesc:5,pullei:22,theoret:3,iter:[20,22,5,25,28,13,11,30,31,17],angularimpuls:31,createzero:4,opportun:3,come:[21,1,3,6,29,11,30,31],pxraycastcallback:[4,26],linearcapsulesweep:27,setrestoffset:[27,31,5,16],pxqueryfilt:4,said:[3,11,5,12,29],contactcorrelationdist:4,pxbvh34midphasedesc:[30,5,16],perpendicular:28,intervent:8,game:[18,19,21,3,6,28,8,23,11,12,30,31],evolv:21,repxserializerimpl:2,item:18,setheight:8,vert:[30,5],setposit:8,gui:0,homogen:7,mbp:[11,15],renderobject:29,meshcontactmargin:29,guarante:[20,3,5,6,26,9,28,12,30,31,29],"28tank":3,unpackjointdata:28,circumst:[30,31,29],mwidth:3,aabbtre:26,experiment:3,mostli:3,velocitytarget:22,inconsist:[30,6,22],hazard:6,getnbanyhit:4,queri:[25,7],createvehicleactor12w:3,pxvehicletireloadfilterdata:17,equival:[20,22,3,4,5,28,27,11],getsteer:[3,4],accuraci:[3,30,28,29,8],doanyhit:4,style:[27,31,22,29],pxvehiclecopydynamicsmap:3,pxps3configparam:4,search:[3,20,5,26],egear_ratio:4,order:[18,20,16,2,22,4,5,6,28,7,8,27,10,11,12,13,29,30,31,26],clearforc:[18,4],broadphasetyp:[30,15],pxscenedesc:[18,4,6,13,26,27,11,7,29,30,15,16,31],accord:[21,22,6,29,31,26],upgradecollect:4,stabl:[22,3,7,28,30,31],optimum:3,evelocti:28,pvdsdk:0,choreograph:3,taskid:6,disableshapeincontacttest:11,setnam:12,instantiate4wvers:3,gpu:25,serial:[25,7],twitchi:[3,22],baseid:12,pxbatchqueryprefiltershad:26,pxvehicleclutchdata:17,setminccdadvancecoeffici:31,rigid:[28,25,31,7],suspdata:3,like:[18,27,2,22,3,4,5,28,7,26,9,23,11,12,29,30,31,8],setmassandpreservenaturalfrequ:3,convexedgethreshold:5,statickinefilteringmod:[31,29],pxvehiclenw:3,finalizequeri:26,unus:[18,10,2,30],regener:8,manoeuvr:3,"33m":30,flexibl:[4,11,12,29],pxfiltershad:22,readi:[3,6,29],"20kg":3,less:[20,22,3,5,6,8,12,30,15,29,24,31],strict:[3,32],pxu32:[18,20,2,22,3,4,5,6,28,8,27,10,11,12,14,29,30,15,26,31],geomsweptagainst:20,nbcontactpair:18,discover:6,px_new_serializer_adapt:2,ememori:[0,18],gravitationalacceler:3,exact:[20,3,8,27,28,12,29,30,15,26,31],setsuspensiondata:3,merg:[4,16,17],reinterpret_cast:[31,22],option:[19,20,21,2,22,4,5,25,28,13,8,11,12,14,29,30,15,26,17,31],pxarticulationbas:16,pack:28,onadv:13,fulfil:[3,28],setanalogbrak:3,reportassertviol:27,memori:25,quaternion:[27,28,5,22,7],nxjointdrivedesc:27,stretch:[30,4,5],constantblock:[22,3,4,26,27,11,31],getinternalfaceindic:31,pxvehiclesuspensiondata:17,becaus:[18,21,1,22,3,5,6,28,7,8,27,11,12,29,30,15,16,31],centreofmass:3,computelambda:28,getworldbound:[4,20],pxscenequeryfilt:4,init:27,myphys:3,echannel_brake_control:[3,4],phors22:3,pxcreatebatchqueri:4,verifi:[5,2],phors21:3,buildgrbdata:[30,15],pxvehicledrivetanksmoothdigitalrawinputsandsetanaloginput:3,linecolorhigh:3,slider:22,fewer:[31,26],erear_left:[3,4],triplet:5,charact:[28,25],"byte":[18,27,12,7,30],maco:12,cmutil:2,pxcomputetrianglemeshpenetr:[20,16],align:[18,20,22,3,7,8,27,11,12,30],artifact:[3,30,22],setmetadata:12,histori:[3,12],pxu8:[5,2],expand:[3,5,8],other:[2,4,5,13,8,14,15,26,28,30,31,1,7,11,17,19,20,21,22,25,29,27,32],directx:7,px_buildsnippet:1,eanalog_input_steer_right:[3,4],stand:[21,3,5,8,11,12,30],mminfilterednormalisedload:3,twenti:9,exemplari:32,createarticul:28,skin:[17,8],resttireload:3,phase:[28,25],pressur:26,pxheightfield:[5,7],faceindex:[20,26],ca2w:28,camberangleatrest:3,getobject:12,unpack:[28,5,21],drivetyp:27,onwak:[18,6,29],pxpruningstructuretyp:[25,17,13],street:[3,8],local:[20,22,3,4,5,13,26,27,11,14,29,30,31,16],getsleepthreshold:29,pxcontactpairextradataiter:31,denot:[6,26],getsimulationstatist:24,galloc:26,millikenresearch:3,locat:[18,3,7,8,23,28,12,30,31,29],foot:8,emine_head:11,seem:[3,23,22],seen:[3,18,30],px_foundation_vers:[16,13],pxbatchconstraint:29,pxvehicletelemetrydata:[3,4],accur:[20,21,22,3,4,8,28,31,29],pxactorclientbehaviorflag:4,activegroup:26,getdata:5,suspjounc:4,setdigitalhandbrak:3,persistentcach:26,surfacemateri:3,badli:[3,24],analogv:3,strcmp:2,vanishingli:3,skid:3,first:[20,2,22,5,25,28,7,8,27,11,14,29,30,31,26,17],width:17,px1d:22,xmlreader:2,setrevolutejointflag:22,mspringdamperr:3,been:[18,20,16,22,3,4,5,6,28,13,8,27,10,11,12,29,30,31,26,24],ecct_can_ride_on_object:8,etransmit_contact:[0,4],opposit:[3,20],mcentrebia:3,after:[0,18,20,22,3,4,6,28,8,9,10,11,12,14,29,30,27,31,26,24],alloc:[28,7],aheightfieldshap:5,straighten:3,cook:25,cuda:[15,21,6],budget:3,six:[3,22],allow:[2,3,4,5,6,13,8,9,26,28,30,31,0,7,11,12,24,18,19,22,29,27],getnb:2,strip:[10,5],complianc:28,getdefaultmateri:[4,5],refcount:6,onconstraintreleas:22,minim:[18,20,1,22,26,6,8,11,29,30,16],sit:3,ihit:4,trick:3,echannel_norm_tire_long_forc:[3,4],desc:[4,8,27,30,31,26],e1st_from_front_left:4,conveni:[0,20,4,5,8,11,12,29],overridden:7,e5th_from_front_left:4,handl:[0,3,4,5,6,28,8,27,11,29,16],getnbobject:12,determinist:[30,7,29],repeatedli:20,pxsetasserthandl:7,physxfound:17,px_new_repx_seri:2,wish:[3,10,29],rearrang:3,earth:29,gettirelateraldir:[3,4],mfrontwidth:3,convent:17,pxparticlesystem:27,probabl:[3,9,11,8],pxvehicledrivetanksmoothanalograwinputsandsetanaloginput:3,diagon:[27,5,29],loop:[30,25,28,17],exercis:3,put:[1,3,4,5,6,28,8,11,31,26],etg:29,settiredata:3,gettirelongitudinaldir:[3,4],etc:[18,19,21,1,22,3,4,5,28,13,8,11,30,15,29,24,31],pxrigiddynamicactor:3,settim:27,content:[19,25,6,8,10,12,30,17],simpler:[3,27,31,6],root:[19,28,1,2,22],anim:[9,18,8],schedul:[3,27,6,30],possibl:[3,4,5,6,8,9,10,15,26,28,30,31,1,11,12,24,18,20,22,29,27,32],overrid:[0,27,29,26],successfulli:[0,3,12,18,4],nblink:28,pxgeometryhold:25,involv:[27,22,4,26,9,11,12,30,15,29,17,31],getphysxcommondllnam:[10,16],look:[18,27,2,22,3,8,9,11,30,26,24],doublesid:20,pxconstraintalloc:29,nxuserraycastreport:27,predefin:[10,31,5],preset:25,cylind:22,troubl:[23,30,31,8],sticki:31,gtreasurefound:11,htm:3,print:7,createvehicleactor:3,visual:[31,25,7],pxfilterflag:[3,4,11,27,31],pxsolverconstraintprepdesc:29,drivetorqu:3,shape:[25,7],prefiltershad:[3,26],indices32:5,minor:[4,7,8],label:3,setupdirect:8,efast:4,starv:6,px_c_export:[3,29],trepxid:4,echannel_tire_long_slip:[3,4],vehdrive4w:3,stride:[30,5,26],eenable_speculative_ccd:31,etank_wheel_1st_from_front_right:4,pxtask:[4,16],"3x3":[27,29],latslip:3,oncontrollerhit:8,color:3,processtouch:[27,26],config:[4,16],vehicleupdatemod:3,gbatchqueri:3,mcpudispatch:6,msvc:7,myactor:[27,11,5,7],angvel:29,detach:[0,11,7],rise:[3,11],yangl:28,triangul:5,unknown:[27,8],echannel_tire_lat_slip:[3,4],tempor:[25,21,17,8],custompath:10,outstream:12,pxclothfabrictyp:4,createbatch:4,xmlwriter:2,pxtriggerpair:11,unimport:30,suspsprungmass:3,eenable_gpu_dynam:[30,15],pxvehicledrivetankcontrolmodel:4,complementari:3,track:[18,21,3,5,6,23,8,10,28,30,29],getcontrol:8,criteria:28,px_profile_start_crossthread:16,solitari:11,activeactor:29,don:3,enotify_threshold_force_lost:[31,6],pxinitvehiclesdk:[3,4],setanalogaccel:3,flex:[16,13,8],filterdata0:[3,4,11,27,31],air:[3,4],primit:[5,14],commonli:3,eprismat:28,pxscenequeryflag:[4,13],tirealignmo:3,inwrit:2,edeleted_shape_trigg:13,pxmeshcookinghint:5,sprungmass:3,tolerancelength:29,getnbloopjoint:28,pxcapsulecontrollerdesc:8,sinc:[0,18,21,22,4,5,6,7,8,9,12,13,14,29,30,27,31,26],immedi:[4,5,25,8,31,17],semant:[4,31,17],birth:22,org:3,pxtransform:[20,22,3,4,5,28,7,26,27,11,12,30,31,29],usag:25,movabl:22,internalfaceindex1:31,px_profile_stop_crossthread:16,doesn:[3,31,12,2],spread:3,incomplet:17,uniqu:[3,12,2,13,8],relat:[20,3,4,28,8,27,23,11,12,31,26],echannel_normalized_tireload:4,disappear:30,bundl:1,createrigidstat:[30,29],snippetseri:17,preclud:3,framework:[9,19],cachedindex:20,batchqueryfilterdata:26,weld:5,mixtur:[31,22],great:[30,11,5,31,8],getinboundjointdof:28,arcad:31,fresh:3,robust:[21,3,4,5,29,28,30,31,8],enumer:[2,22,3,4,7,11],epsilon:31,insert:[31,25],pxvehicledrivesimdata4w:3,captur:[11,26],profit:32,chassissimfilterdata:3,conceptu:3,pxconstraintsolvehint:22,fulli:[21,22,8,30,31,26],posit:[31,25,7],processblockinghit:4,profil:7,pxarticulationaxi:28,ejounc:[3,4],eccd_linear:4,subject:3,exportnam:12,diverg:30,concess:30,pend:[4,29],sweephitbuffers:4,nxspringanddampereffector:27,pxmin:30,localpos:30,ftell:12,collect:[19,21,2,4,5,13,8,11,29,26,17],outputflag:26,erigid_stat:[11,12],enclos:[20,5,26],made:[18,21,22,3,4,5,6,28,7,8,11,30,31,26],px_window:6,incompat:[20,12],referenc:[9,6,27,12,7],getcontactoffset:8,author:[3,10,12],pxclient:16,samplebridg:8,paper:31,obligatori:3,commun:8,d3dxmatrix:7,nonetheless:[21,8],member:[20,2,7,26,11,30,31,29,24],submarin:[11,31],setinvmassscal:[30,4],eno_block:[4,17],pxvehiclegraphtyp:4,pxvehicledrive4wsmoothanalograwinputsandsetanaloginput:[3,4],notic:[3,31,32,22],www:3,dure:[0,18,20,2,22,3,5,6,28,8,9,11,12,29,30,27,15,26,31],registri:[12,2],shut:[25,17],better:[18,22,3,4,5,6,28,8,27,11,12,30,31,26],input:[18,19,20,22,3,5,7,8,23,28,30,26],trivial:[27,11],unaccept:[27,22],signific:[18,21,3,4,5,6,13,26,27,28,14,30,16],finit:31,easili:[3,4,13,8,29,15,16],began:4,judici:3,prefix:[27,4,7],contain:[19,20,16,1,2,22,3,5,6,28,7,26,9,10,11,21,12,30,27,8,24],mdampingratezerothrottleclutchdisengag:3,ptype:2,what:[30,25,8],against:[2,5,7,8,30,15,31],ccd:25,gap:5,pxgeneratecontact:29,land:8,eany_hit:[4,17],createaggreg:11,method:[2,3,4,5,6,13,9,10,26,28,30,31,0,7,11,12,16,24,18,20,23,29,27],outorderedconstraintdesc:29,third:29,eengine_rev:4,surround:[3,8],newer:[4,16,12,13],geompos:20,stress:22,pxdominancegroup:29,emploi:[3,4,6],compound:[30,31],etank_wheel_6th_from_front_left:4,gettaskid:6,discov:3,mcmoffset:3,name:[27,21,2,32,5,7,8,9,11,13,30,16],vehicle4w:3,meshpreprocessparam:5,closest:[30,20,17,22],setoverlaprecoverymodul:8,swing1:[28,22],blow:11,pxcontactset:31,ithread:4,pxlocationhit:4,dstbinfil:12,bridg:[30,8],cascad:2,remot:[11,12,14],getconcretetyp:[4,12,7],patch:[3,6,15,12,29],gauss:[25,21,17],evenli:3,mupratio:3,steal:6,pxconvexmeshgeometri:[3,11,5,7],question:[0,3,21,27,29],triangleindexbuff:20,speedup:26,readabl:12,microsoft:[16,1],pxvehicle4w:3,support:[2,3,4,5,6,13,8,9,10,15,26,28,30,31,0,1,7,11,12,16,19,20,21,22,29,27],lastli:3,pink:3,constraint:[21,25,8,28,30,15,17,31],slice:5,pxusercontrollerhitreport:[27,8],inarg:2,setminlongslipdenomin:3,difficulti:[3,22],app:[3,9,1],meshcookinghint:[16,30,5,13],equilibrium:[31,29],pxcreatefound:[27,10,16,13],"1500kg":3,patchcount:31,download:0,onpvdconnect:0,inflex:11,mechan:[18,21,2,22,3,4,6,28,7,8,27,11,12,29,30,31,26],pxcontrollerflag:4,stabil:[18,22,3,5,28,30,31],regim:3,chosen:[3,31,22,26],tempbuffercapac:15,screen:3,pxvehicledrive4wcontrol:4,inferior:11,integr:[21,22,4,5,13,29,27,28,7,14,16,17],esimulation_shap:11,dominance1:29,dominance0:29,egraph_type_wheel:4,balloon:27,natur:[3,18,22,8],have:[3,4,5,6,13,8,9,10,14,15,26,28,30,31,7,11,12,16,24,18,20,21,22,29,27],surpris:18,about:[5,25,7,8,28,14,30,31,17],nbrow:[30,5],pxvehiclegraph:[3,4],samplesubmarinefiltershad:11,recipgrav:3,violat:[28,22],setreportallocationnam:[18,7],awkward:5,pxvehiclecompon:3,rim:3,pxarticulationmot:28,edist:[16,13,26],createobstaclecontext:8,aabb:[20,5,8,11,30,26],myvehicl:3,pxvehicledrivetankwheelord:4,nxuserentityreport:27,text:[12,14],aka:6,reli:[3,10,27,18,2],dangl:12,ereport:13,investig:24,bar:17,over:[30,28,31],updirect:[3,8],esuspforc:4,onto:[3,15,22],refit:26,isoverlap:20,endors:32,processhit:4,setinvinertiascale1:29,setinvinertiascale0:29,physxcook:[4,10],wheelraycastprefilt:3,"void":[0,18,2,22,3,4,5,6,7,26,27,11,12,29,31,8],pxquerycach:[4,13,26],view:[0,19,21,3,6,8,14],formula:[3,20],insuffici:[30,15,31],appendix:3,edebug:0,avail:[18,27,20,1,3,4,5,6,13,8,9,10,11,29,30,15,16,31],setinvmassscale0:[4,29],tighter:[3,30,5],coordin:25,setvisualizationcullingbox:14,pxvehicledifferentialnw:3,e8th_from_front_left:4,degener:26,bottl:8,tireforceshaderdata:3,com:[0,3],highend:2,attributes1:[3,4,11,27,31],attributes0:[3,4,11,27,31],averag:11,pxvehiclecomputetireforcedefault:4,mminestoexplod:31,perceiv:[9,30],wheelssimdata4w:3,incollect:2,mcamberatmaxcompress:3,ground:28,includ:[2,3,4,5,6,13,26,28,30,31,0,1,7,11,12,16,19,20,21,22,29,27,32],reorder:[5,29],mean:[18,3,4,5,6,28,8,11,12,29,30,15,26,31],dream:3,deltalinearveloc:[31,29],userflipedg:30,jitteri:[9,17],getphys:[4,11,5],setarticulationflag:28,pxcreatecook:[27,10],amort:[30,25,17],meshshap:11,ultim:29,hyperthread:11,eno_boundary_edg:5,pxdefaultfileoutputstream:[4,12],pxspheregeometri:[27,4,11,5,12],trgwheel:3,lifetim:12,userraycasthitbuff:4,medium:6,adjac:[9,5],efirst_user_extens:2,rear:3,type:25,shapepos:26,setcontactoffset:[27,30,31],pxcontactpairflag:[31,6,13],etire_lat_slip:4,drop:[0,18,22,3,5,30,15,31],pxjointconcretetyp:22,phenomenon:8,filterdata:[3,27,11,26],pxvehiclewheel:[3,4],cylindr:[3,22],unwant:12,hitinfo:20,computeinteract:8,unnotic:22,rest:[3,31,5,6,22],event:[32,22,4,13,8,27,11,30,31,16,17],euse_swept_bound:4,collision_flag_ground_against:3,eweld_vertic:5,liabil:32,pxbroadphasetyp:[30,11,16,15],encompass:21,listen:[0,18,31,29],traffic:31,localscaledvertex:30,maxraycastqueri:4,pxpairfilteringmod:30,ensur:[18,20,22,3,4,5,6,28,7,29,11,12,13,30,15,31],behaviour:13,meshdesc:[4,5],pseudo:[3,30,11],green:[6,8],entrant:6,accept:[1,3,5,29,30,31,16],acceler:[30,28,25,31],where:[2,3,4,5,6,8,14,26,28,30,31,1,7,11,12,18,20,21,22,23,29,27],destroi:[21,22,4,7,10,28],row:[22,3,5,7,29,28,30],drawwalldec:26,isol:12,throughout:[3,28],pxscenequeryupdatemod:[25,16,17],pxpvd:[0,10,13],nonphys:22,workload:6,sphere:[30,15,8],architectur:[30,6],drag:0,sensibl:3,setinvmassscale1:29,rag:[6,28,12,22],edefault:[27,11,31,20,26],getmaterialindex:27,"break":[18,22,3,4,5,7,26,28,12,31,29],nearbi:[8,26],pxconstructstaticsolverbodi:29,caus:[18,32,1,22,3,4,6,28,26,23,11,12,30,31,29,24],asset:[12,29],draw:7,upgrad:[27,16,13,21],pxvehiclesuspensionraycast:3,raw:3,registerdeletionlistenerobject:4,assign:[3,6,29,27,11,12,26],goal:[3,30,31,6],hitbuff:[4,26],aconvexactor:5,areatestepsilon:5,jointacceler:28,frictionpair:3,temporari:[18,30,15,31,26],pushback:4,induc:3,ecommit_enabled_build_en:26,incorrect:[20,29,1,22,26],yield:28,simulationeventcallback:18,detriment:30,titl:3,eproject:22,"81f":15,physic:[28,25],mestimateiter:3,exot:3,numwheel:[3,4],mplane:5,edrive4w:4,assort:27,vehiclefiltershad:3,folder:[27,19,1,2],pxcollectforexportsdk:4,inrigiddata:29,mfrontbia:3,afterthought:27,filtershad:[27,4,15,31],swim:31,nx_sta_frict_sc:27,pxshape:[20,16,3,4,5,7,8,27,11,12,29,30,31,26],collectionscen:12,ought:3,contactcach:29,pick:[3,26],multipli:[3,30,29,7,8],setmaxangularveloc:30,convexinputdata:12,streamlin:16,costli:11,threadbatch:4,partial:[22,3,6,29,15,24],signalquit:4,pxactorflag:[18,11,14,29],echannel_gear_ratio:[3,4],pxconstraintinfo:22,everythingcollect:12,menu:[0,12],stiffer:3,"30hz":30,e6th_from_front_left:4,px_generate_static_librari:1,pattern:[2,22,3,6,7,23],altitud:8,prune:[11,26],ecommit_disabled_build_en:26,meaning:[3,20],focus:3,matric:[27,4,7],mindexbas:5,gdefaulterrorcallback:10,knee:22,vehicledrivablesurfacetotirefrictionpair:3,nbcol:5,specifi:[18,20,1,2,22,3,4,5,28,7,26,27,10,11,12,30,31,29],e2nd_from_front_right:4,feel:[29,17,8],dealloc:[18,12,2,7,26],rework:[3,4],edrive_model_speci:4,getnbvertic:5,descript:[25,28,5,17],disablewheel:3,setfootposit:8,quantizedcount:5,settireloadfilterdata:3,getgeometrytyp:27,setsceneparamint:4,modifysampl:5,blue:[3,6,8],elimin:3,maxnbobjectsperregion:16,teleport:[23,11,29,8],axl:3,broad:25,axi:[20,22,4,5,25,28,8,27,11,31,17],elimit:[28,22],pxarticulationjoint:[28,16],woken:[4,29],pxjointangularlimitpair:[4,22],rcvd:3,connector:22,px_force_inlin:[3,4],anisotrop:[17,22],vehicleid:3,obei:[3,18],"\u215d":16,nxstream:27,longslip:3,echannel_engine_rev:[3,4],setdistancejointflag:22,dir:3,pxvisualdebuggerconnectionflag:18,convers:[3,4,5,12,27],trianglemesh:[10,11,5],drawback:31,sethalfsideext:8,setackermanngeometrydata:3,afterward:30,"boolean":[3,27,20,6,26],achiev:[2,22,3,4,5,6,29,27,10,11,12,30,15,31],rout:0,mfinalratio:3,notabl:20,weight:3,glmultmatrixf:7,contactpatch:[31,13],arriv:6,relationship:[22,3,29,9,28,12],insight:24,add:[1,2,22,3,4,6,28,26,11,12,30,15,31],server:0,five:3,obviou:3,our:[0,21,1,29,10,30,26],convexstream:12,trade:[11,5,28],out:[32,22,3,4,5,6,7,8,10,11,29,30,31,26],pxvehicledrivetankcontrol:4,worthi:18,setgroupcollisionflag:27,begun:6,setcontactreportthreshold:31,computation:3,rapid:[3,12,29],emax_nb_drive_channel:4,restdistacnc:31,contrast:28,startsimul:6,etire_frict:4,getboxgeometri:5,henc:6,mrearwidth:3,deep:31,never:[1,6,28,8,11,30,26,17],self:[18,11],setdrivetarget:28,amateri:5,world:[19,5,25,7,8,28,14,30,31,17],spawn:[11,8],slippag:3,ball:[19,28,22],spg:3,px_debug:7,snippetvehicle4w:17,pxtransformfromseg:5,emax_num_wheel_channel:[3,4],mwheel0:3,mwheel1:3,pxtrianglemesh:[30,5,12,7],automat:[0,18,16,2,22,19,3,4,5,6,28,26,11,12,29,31,8],pvd:[19,21,1,4,25,13,7,14,30,17],hing:[28,22],"null":[18,20,2,22,3,4,5,6,8,10,28,12,26],pound:3,coupl:[3,11,24,31,8],postfiltershad:26,hint:[22,26],symbol:27,info:[3,20,2],particular:[20,32,22,3,4,6,28,8,11,29,30,31,16],complet:[20,4,25,8,9,30,31,17],could:[20,2,3,5,28,8,27,11,12,29,31,26,24],isn:3,output:[0,19,20,1,2,22,23,5,28,7,10,11,12,16],provid:[2,7,8,28,14,30,15,31],"32bit":5,when:[28,31,7],pxhitflag:[20,3,4,13,26,30,16],solver:[21,25,8,28,30,15,17,31],establish:12,hover:3,ratio:[3,28,22,29],dat:12,complex:[0,18,21,22,19,3,8,9,30,15,29,31],simpli:[18,27,20,16,22,3,4,28,13,8,9,10,11,14,29,30,26],pxf32:[3,4,20,30,8],explicit:[21,4,6,13,27,28,16],pxserializationcontext:2,warn:[18,20,2,3,4,5,6,26,30,15,29],localfram:22,iresult:4,pxcontactjoint:28,read:[0,18,2,4,6,29,9,10,31],elsewher:27,fetchresult:[27,22,25,29,9,30,31,26,17],getwheelrotationspe:3,pxcookingparam:[5,13,10,30,15,16],getnbrow:5,clarifi:3,concis:19,chain:[22,6,28,11,12,30],tag:[27,8],hfgeom:[30,5],accomplish:3,both:[0,18,20,16,2,22,3,4,5,6,13,26,9,11,12,29,30,27,31,8],mnbconvex:12,contributor:32,select:[20,3,26,8,30,31,16],convexmesh:5,chassi:3,createshap:[27,4,17,7],commit:[7,26],underneath:3,echannel_suspforc:[3,4],path:[20,1,22,4,5,26,10,28,12,29,16],trystandup:8,eabort:7,command:[12,16,1,19],sent:[0,18,4,6,8,12,31],solverconstraintdesc:29,fifti:9,pxmodifiablecontact:31,onli:[1,2,7,8,28,14,30,15,31],customcreatephys:10,"125f":3,reader:[21,6],shareabl:[4,12,13],"27m":30,filenam:[0,18,12,2,7],fetchresultsstart:18,pxcontactpoint:[4,31],perfect:[3,31,8],platform:[19,1,2,5,7,8,27,29,30,16],enotify_touch_persist:16,collision_flag_wheel_against:3,altogeth:27,eblocking_hit:4,now:[18,27,3,4,5,6,7,8,9,10,28,13,29,30,31,16],errorstream:0,actormass:3,triangleneighbour:5,heavili:3,account:[18,3,6,29,28,30,15,8,31],yet:[27,12,22],px_serial_file_align:12,gettiredrivablesurfacecontactnorm:4,estim:[28,22],onjointbreak:22,rendermateri:[5,6],maxlesepar:3,eblock:[3,4,26],pacejka:3,statement:[31,5],mcook:10,skateboard:3,scope:[21,8],wasn:27,physx3checked_x64:16,pxjoint:[30,4,28,22,29],findoverlaptrianglemesh:20,monitor:30,mgeardownpress:3,here:[0,18,20,2,22,3,4,5,28,26,9,10,11,12,29,30,31,8],cost:[3,28,26,11,30,15,31],purgecontrol:8,uint64_t:0,ewheel:4,deepest:11,pxhalfpi:5,usererrorcallback:7,nxinitcook:27,pxmemzero:28,interpenetr:31,getnumwheelgraph:4,pxplaneequationfromtransform:5,conveyor:[31,29],uniformli:[31,29,26],point:[7,8,28,14,30,31],egearsratio_count:3,ignor:[20,22,3,4,6,7,8,11,12,13,29],uncoupl:3,pxrepxseri:[4,2],pxspatiallocationcallback:27,nbbatchhead:29,registerobserv:4,collectiona:12,"static":[27,16,1,2,22,4,5,28,8,9,10,11,21,29,30,31,26],retarget:2,tradition:8,inflat:[27,30,20,31,13],registerbinarymetadatacallback:2,deletionev:4,volumegrowth:8,pxforcemod:29,consequ:[18,3,4,6,13,8,12,29],updatest:6,writeallproperti:2,granular:6,setshapepairflag:27,quit:[3,4,29,30,31,8],createlink:28,buildtriangleadjac:[30,5],unevenli:3,contact:25,life:[3,31,29],xcm:3,edynamic_aabb_tre:17,among:[18,19,22,3,4,29],setstabilizationthreshold:30,pxvehiclepadsmoothingdata:3,hitactor:4,gpudispatch:[15,6],lifo:6,pxdumpmetadata:4,activ:[22,4,25,8,27,30,15,16,17,31],joint:[31,25],recent:[30,5,29],tcp:0,elock_linear_z:29,dst:31,px_delete_serializer_adapt:2,swaptohighlowvers:3,assumpt:[3,31],pxcontactpairheaderflag:[6,13],tall:31,driven:[3,28,22,8],echannel_tire_frict:[3,4],pxjointlinearlimitpair:4,edrivetank:4,artefact:[30,31],maccumul:9,collisionflag:8,bat:2,ecct_user_defined_rid:8,tirelateraldir:4,gpudynamicsconfig:15,directli:[16,22,3,4,5,6,7,8,27,10,28,12,14,29,30,31,26,24],pxfilterobjecttyp:11,note:[2,3,4,5,6,8,10,14,26,28,30,31,1,7,11,12,18,20,21,22,23,29,27],gram:3,imagin:[3,30],soft:31,recycl:31,efront_left:[3,4],setinverseinertiascale0:4,noth:[13,29],coincid:22,geometryqueri:26,toler:[22,3,5,7,29,10],pxconstraintdomin:4,sure:[2,3,4,5,8,11,14,30],minimpuls:22,stutter:31,setcmasslocalpos:[3,4,22],shall:[3,18,32,6],"60fp":30,eremove_unreferenced_vertic:13,heightfieldpos:20,pxsetphysxcookingdelayloadhook:10,dimension:[21,29],depict:3,secur:17,indexbuff:5,rigidbodymass:3,pxpruningstructur:[30,25,17,13],exactli:[3,28,29,31,8],mindist:8,setinvinertiascal:30,simpl:[19,20,21,22,3,4,5,28,7,8,11,12,29,30,31,26],px_new:4,isblock:4,tool:[16,2,14,21],amaterialarrai:5,stop:[22,3,5,8,27,31],fly:[1,8],simplifi:[4,26,9,28,12,30],tricount:5,esuppress:[3,27,11],basic:[2,25,7,8,14,30],sln:19,pxsimulationfiltercallback:[27,11],khz:3,pxconcretetyp:[12,2],setsleepthreshold:[27,29],dynam:[31,25],pxclothphasesolverconfig:4,anti:17,pxmeshscal:5,pxvisualizationparamet:[16,22,14],emax_nb_wheel_channel:4,computecoefficientmatrix:28,setwheelcentreoffset:3,purposefulli:3,properli:[3,8,11,12,29,30,26,24],smooth:[3,9,31,5,22],linkgeometri:28,beyond:[22,28,11,30,16,17],proport:[21,22,3,28,15,31],shoulder:22,pxvehicleenginedata:17,mtorquecurv:3,snippetloadcollect:17,gettirelongslip:[3,4],pxerrorcod:[4,7],pxprofilercallback:0,smoothli:[3,31,22,8],translatepxbas:2,wheelmap:3,nbpolygon:5,corrupt:6,"0x7fff":30,fuso:3,restdist:31,eremoved_shape_1:13,eremoved_shape_0:[6,13],characterist:[3,30,22,26],analyz:[18,29],manufactur:3,walk:8,overli:3,edeleted_actor_1:13,interv:[9,23],futur:[3,4,28,13,29,11,8],illeg:[18,6],raycast:25,analyt:[3,28],"60hz":3,apexscen:6,lookup:[3,27],getsuspensiondata:3,surprisingli:30,pxzero:[4,22,7],etank_wheel_6th_from_front_right:4,eremove_duplicated_triangl:13,corner:[30,20,17,8],contigu:3,fpu:[7,8],arc:22,createrigiddynam:[3,4,11,5,29],hood:8,chapter:[20,22,6,7,29,9,10,30,24],computepenetr:20,freedom:[3,28,22,29],etrigger_default:[27,4,11,31],linear1:22,well:[28,31],knowledg:[3,28],pxwheelqueryresult:[3,4],cambertireforc:3,pxserialframework:2,divert:3,createconstraint:22,even:[18,20,21,22,3,4,6,28,7,8,27,23,11,12,14,29,30,31,26,32],sample_new:5,pxvehicledrive4wrawinputdata:3,your:[0,16,1,22,5,6,7,8,27,10,11,13,14,29,30,15,26],mtd:[30,20],pxextens:[25,20,17],footprint:[0,1,26],settireforceapppointoffset:3,raycastsingl:4,snippetvehiclemultithread:17,gscene:[18,15,26],qryfilterdata:3,mcamberstiff:4,mrearbia:3,getangularveloc:[22,29],web:3,sap:[11,15],addloopjoint:28,sat:[31,13],per:[18,27,16,22,3,4,5,6,28,8,9,11,29,30,15,26,31],much:[18,27,22,5,8,9,11,12,30,31,29,17],win32:19,column:[30,5,7],purpos:[20,21,22,3,7,8,11,12,14,32],getmaxnbcontactdatablocksus:[18,30],pxsamplesubmarin:11,etank_wheel_5th_from_front_right:4,boxgeom:5,setdampingcoeffici:4,pseudocod:4,eacceler:22,forcedynamictreerebuild:26,adher:22,springmodifi:22,sai:[3,8,27,11,12,30],san:8,soon:[11,20,8],setsusptraveldirect:3,echannel_steer_left_control:4,getconstraintflag:22,interpret:[20,22,4,5,29,26],prohibit:26,perhap:[3,30,13],"25f":[3,5],efix_bas:28,rustl:26,raycastmultipl:4,animateleav:26,materialindex:27,pxuserrefer:4,concern:[6,20,12,13,8],isotrop:22,combin:[20,1,2,22,3,8,27,28,29,15,26,24,31],eenable_swept_integr:4,kind:[27,20,22,3,4,8,9,11,31],repxutil:4,gsteervsforwardspeedt:3,createconvexmeshsaf:5,shorter:3,ecollision_down:8,getwheelcentreoffset:3,px_max_sweep_dist:[20,26],wheelqryfilterdata:3,ang1:22,ang0:22,than:[30,28,1,31,7],maximpuls:[31,22],edrive_model_standard:4,rapidli:3,bvh:26,append:2,maxjumpheight:8,writer:6,e4th_from_front_left:4,rigiddynam:3,chunk:[11,12],getscenepvdcli:0,see:[2,3,4,5,6,13,8,9,10,14,26,28,30,31,7,11,12,18,20,21,22,29,27],pxhullpolygon:5,eanalog_input_brak:[3,4],mpeaktorqu:3,erequire_rw_lock:6,comment:29,seek_set:12,worst:26,ediff_type_open_4wd:3,onshapehit:8,longitudin:3,rebuilt:26,actorcollect:12,setflag:11,computecoriolisandcentrifugalforc:28,zonestart:0,older:[4,16,12,13],quick:[3,24,12],sse2:5,contactstream:4,kinekinefilteringmod:[31,29],centimetr:3,chassisqryfilterdata:3,enginegraphposi:3,suit:[28,12],setdigitalbrak:3,comfort:29,emax_num_drive4w_analog_input:4,bare:30,pxoverlapbuff:[4,26],raycasttouchbuffers:[3,4],manag:25,repxidtorepxobjectmap:4,enginegraphposx:3,mmateri:29,pairwis:[27,31],"catch":30,gave:12,px_unus:3,angl:[3,28,22,8],callback:[31,25,7],unnecessari:[6,2],variabl:[0,21,3,4,8,11,30,31],pxvec:5,tricki:18,getwheelrotationangl:3,useroverlaphitbuff:4,therefor:[0,18,2,22,3,5,28,26,27,11,12,30,15,29,31],gravitydirect:3,nxactor:27,gcc:16,piec:[5,8],necessari:[18,2,22,3,4,6,28,7,8,23,11,12,30,15,29,31],mbia:3,hasblock:4,instantan:[6,29],enorm:[20,26],pxvehiclesetmaxhitactoracceler:3,esteer_left_control:4,foundlostpairscapac:15,pxsphericaljointflag:22,idl:[9,8],represent:[21,22,3,5,7,12],gettirelatslip:4,initialpos:26,undesir:8,solut:[18,19,20,1,3,26,8,23,12,30,31,16],readili:3,earli:31,espher:[28,5],hadblockinghit:26,experienc:[3,4,8],maxqueri:4,mmaxomega:3,"\u215b":16,ideal:[3,11,22,8],mainli:[30,24],belong:11,cover:[2,22,7,29,27,11,30],conserv:[31,22],socket:[0,28,22],mlongitudinalstiffnessperunitgrav:3,setangularveloc:[18,4,29],textur:3,lift:3,mbuf:2,zero:[20,22,3,4,5,6,7,8,27,23,28,29,31,26],processtouchinghit:4,metal:27,setenginedata:3,network:[10,12,17],setsolveriterationcount:29,environment:27,pxactortypeselectionflag:[4,13],setdiffdata:3,actorstream:12,qanda:3,union:[5,22],createconvexmesh:5,targetwheelid:3,elock_angular_i:29,potenti:[0,20,3,4,6,8,11,12,29,30,31,26],eas:12,maneuv:3,setstretchconfig:4,disp:8,unphys:3,addforceatlocalpo:29,evisu:[22,14],resweep:31,pxdominancegrouppair:[4,29],neglig:[32,13],eenable_enhanced_determin:[30,29],pair:[28,31],disc:22,turbin:29,plastic:27,tirefrict:[3,4],ggpuloadhook:10,disk:12,fraction:29,strongli:[1,22,3,5,7,28],forth:[3,29,12,8],subsystem:[10,6],head:29,outbatchhead:29,pxserialz:4,mathemat:[3,22],unconstrain:22,gameplai:[3,8],also:[2,3,4,5,6,8,9,26,28,30,31,0,1,7,11,12,18,20,21,22,23,29,27],pxcontactmodifycallback:[27,31],polygon:8,mycontactmodif:31,testforoverlap:5,choic:[3,5,7,8,11,26],sciencedirect:3,eoutput_forc:22,pipelin:[9,30,15,6],especi:[1,3,4,6,26,23,11,30,31],damp:[22,3,4,29,27,28,16],chassisdata:3,lane:29,valuabl:[3,31],mdampingratefullthrottl:3,pxcreatephysicssdk:27,wheelgraphposx:3,engag:3,move:[30,31,25],pxvehicledrivesimdatatank:3,tree:28,"__line__":[18,7],setwheelshapemap:[3,4],dimensionless:3,emesh_both_sid:[20,13],pxheightfieldmateri:5,createident:4,oppos:[10,16,12,22],harder:3,caterpillar:3,setsimulationfilterdata:[3,4,11,29],frictionoffsetthreshold:29,getconstraint:22,pxcontrollerbehaviorcallback:[27,8],isarticulationlink:13,maya:12,etank_wheel_4th_from_front_left:4,"\u03c9":16,vital:[3,21],adapt:[27,2],onpvddisconnect:0,acycl:22,hfsampl:30,heap:15,pxfilterobjectistrigg:[27,11],extens:[20,21,1,2,22,25,7,8,28,15,17,31],creation:[22,5,7,8,30,17],respond:3,variant:[5,2,29],intermedi:7,"\u215c":16,indic:[30,28],ehol:5,full:[21,3,5,26,15,31],contract:32,respons:[2,22,3,4,5,7,26,23,12,29,15,8,31],definit:[3,12,2],setdominancegroup:29,edeleted_actor_0:13,mdrivesimdata:3,gentli:3,from:[19,20,21,2,22,5,25,7,8,28,30,15,32,17,31],spend:3,held:[18,30,22],forc:[21,25,8,28,15,17,31],pxd6axi:22,form:[0,22,3,5,7,29,28,12,15,32,31],totalmass:3,mismatch:[3,10],pxvehicletyp:4,mmaxbraketorqu:3,pxvisualdebuggerconnect:0,excess:[3,8],nbpair:[31,29],help:[0,18,21,2,22,3,4,5,6,13,30,24],nbtouch:[4,26],spent:[3,8],pxcontrollercollisionflag:[4,8],setswinglimit:28,preced:[3,28,6],non:[16,2,22,3,4,5,13,8,27,10,11,29,30,15,26,31],unoptim:30,doubl:[19,20,8,31,17,15],header:[25,7],setkinematictarget:[18,22,29],immers:31,approach:[18,22,3,5,28,13,8,27,11,12,30,31,29],fetch:[27,24],getdof:28,getwheelgraph:3,getnbcontactdatablocksus:18,cone:[28,22],actor:[30,28,25,31,7],abbot:3,tini:[31,8],prefilt:[27,4],ptr:[18,7],discuss:[22,5,25,8,11,30,31,17],twistdriv:22,gkeysmoothingdata:3,getqueryresultbuffers:3,createfrictionpair:3,particip:[3,11],subset:[1,22,5,26,10,28],camberangleatmaxdroop:3,eanalog_input_handbrak:[3,4],tireforceappcmoffset:3,isrigiddynam:13,pxbroadphasecap:11,mbatchreadi:4,pxscenelock:6,varieti:[3,22],permiss:32,inertia:[22,3,5,29,9,28,30,31],intend:[23,11,12,8],similar:[20,22,3,5,8,27,28,29,30,26],pxobserv:4,nx_default_sleep_ang_vel_squar:27,pxvehiclecopydynamicsdata:3,body1vel:22,materialcollect:12,getlinkindex:28,pxdefaultcpudispatchercr:[15,6],zone:[0,3],impulsivetorqu:29,setrigiddynamiclockflag:29,pxvehicledrivablesurfacetotirefrictionpair:[3,4],mysimulationcallback:31,removebroadphaseregion:11,esap:[11,16],unlockread:6,followjoint:12,theori:[3,32,22],pxcapsulecontrol:8,flat:3,inact:18,appropri:[22,3,6,7,30,31],pxfoundationdelayloadhook:16,planetoler:5,eanalog_input__brak:3,snowmen:29,carrawinput:3,keyboard:3,createregionsfromworldbound:11,smoke:3,flag:[28,31],under:[21,1,22,4,13,8,30,15,17,31],given:[18,20,22,3,4,5,6,28,26,11,12,30,15,29,31],setkinematicsurfaceveloc:29,histor:30,perfectli:[18,3,6,8,28,31],numcol:5,cooktrianglemesh:[27,5],tireload:3,gettaskmanag:18,etouch:[4,26],extradatastream:31,pxsetphysxdelayloadhook:10,customclass:2,gaussmaplimit:[30,16,13],setmaxprojectioniter:28,object:25,erear_left_wheel:4,damper:3,crt:1,reciproc:3,disclaim:32,getphysxgpudllnam:[10,16],vehwheelqueryresult:4,collectionb:12,anew:29,setanalogst:3,meant:24,mmaxcompress:3,pxvehiclesetsweephitrejectionangl:3,close:[20,3,5,26,28,31],below:[18,19,20,22,3,4,5,6,13,8,27,10,28,29,30,15,26,31],forcestreamcapac:15,wheelidx:3,heightscal:[30,5],triangl:[25,8,14,30,15,31],filterdata1:[3,4,11,27,31],affair:18,etank_wheel_7th_from_front_left:4,pxoverlapbuffern:4,invalid:[25,1],wakecountervalu:29,centr:3,regular:[4,5,29,11,30,16],pxccthit:4,enotify_touch_lost:6,esolve_contact:31,gvehicle4w:3,bottom:[3,29,8],setuptanksimdata:3,pxboxgeometri:[27,4,5,29],addforceatpo:[27,29],program:[10,21,7,19],pxstringtabl:12,pxvehicleupdatesinglevehicleandstoretelemetrydata:3,known:[20,3,7,8,27,28,31],etank_wheel_front_left:4,eshift_vertic:5,pxobstaclecontext:[4,8],qualiti:[21,22,3,5,26,27],fit:[32,20,8,26],verbos:12,setusermemori:4,which:[2,3,4,5,6,13,8,9,10,15,26,28,30,31,1,7,11,12,16,24,18,20,21,22,29,27],more:[2,4,5,6,13,8,9,10,15,17,26,28,30,31,7,11,12,24,18,20,21,22,25,29,27],pxvehicledrivetankrawinputdata:3,srcassets:12,danger:3,getsuspjounc:4,nxuseralloc:27,impleament:0,inde:[3,8],pxpvdtransport:[0,10],pxcreatecollect:[4,12],cannot:[18,3,4,5,6,13,8,27,11,12,29,31,26],raycastani:[4,13],determin:25,pxbroadphaseext:11,addtorqu:[18,4,29],updatemassandinertia:[4,28,5,29],pxpi:[3,5,22],getsuspraycast:4,challeng:[3,5],pxqueryhittyp:[3,4,26],analog:[3,22],neighbor:5,evehicle_type_nodr:4,eggert:3,pxccdcontactmodifycallback:[27,31],pxvehicewheelssimdata:3,mdampingr:3,externaldriveiter:28,wikipedia:3,getid:12,solverbodi:29,invers:[31,25],core:[21,1,2,25,8,30,17],clip:[30,31,5,26],dstasset:12,linearvelocityactor1:31,eposit:[4,20,26],implicit:[9,4,22],sharabl:12,custom:[31,25,7],correspond:[27,20,1,2,22,3,4,5,6,23,7,26,9,10,28,12,13,14,15,16],getexternalrefer:22,overhead:[0,20,3,6,26,30,15,29,31],driver:3,maxnbstaticshap:26,outer:[3,30],biggest:0,viewer:14,customtypeautogeneratedmetadataobject:2,targetposit:[28,22],bias:[3,22],computejointacceler:28,embp:11,gsharedindex:18,satur:3,pierc:20,correct:[18,22,5,7,8,28,12,29,30,31,16,17],reserv:[7,26,27,12,31,32],eenable_ccd:[4,31],categoris:3,rope:30,unnam:12,pxcach:29,wheelqueryresult:[3,4],max:[3,28,26,11,12,16],lightcputask:[4,17],manifold:25,neglect:29,ackermann_steering_geometri:3,map:[3,4,20,5,12],srcbinfil:12,channel:3,getsusplinestart:4,etank_wheel_front_right:4,triindic:5,getlinearveloc:[22,29],mratio:3,pxqueryfilterdata:[4,13,26],nbactor:11,access:[22,4,5,25,28,7,8,27,11,12,29,24,31,26,17],latest:6,enter:[3,11],mac:12,numprimsperleaf:16,geometr:[22,3,4,5,26,27],transport:[0,10],pxscenequeryfilterflag:4,registercustomclassbinaryseri:2,took:30,pxcomputemeshpenetr:[16,13],tirelatforcemag:3,x86:16,x87:7,seriou:[11,7],getsweepqueryresultbuff:3,degre:[3,28,29,22,8],previous:[20,16,4,13,8,11,12,29,30,26],pxboxobstacl:8,lowend:2,mwheelssimdata:[3,4],stack:31,usegeodesicteth:4,deem:18,scenario:[22,3,6,26,27,23,30,31],setparentpos:28,diagnos:30,addbroadphaseregion:11,getvisualdebugg:4,eccd_respons:4,left:[19,2,3,7,8,12],tensor:[27,30,5,29],"_aligned_malloc":[18,27,7],inch:3,pxscenflag:27,stamp:3,sourc:[19,2,22,3,5,7,27,11,12,30,32],hierarch:[9,26],human:[9,12],accumul:[3,9,29,26],pxd6motion:22,inelast:29,befor:[0,18,20,2,22,3,4,5,6,28,13,26,27,11,12,29,30,31,8],maxnbdynamicshap:26,susplinestart:4,accident:24,apex:[6,13],succinctli:27,pxd6jointdriv:22,pxhitcallback:[27,26],geomtosweep:20,getactiveactor:29,pxshapeext:[4,5,13],sm3:15,groupcollisionflag:27,wheelcount:4,increment:[21,3,6,13,26,27,12,7,30,29],estandard:[3,4],friend:2,misus:29,gettiredrivablesurfaceshap:4,ident:[20,22,3,4,5,7,29,11,12],extract:25,edetect_discrete_contact:[4,31],releasebatch:4,virtual:[0,18,2,22,4,6,7,26,9,10,27,29],fill:[20,26,5,8,12,29],requir:[19,20,16,1,2,22,4,5,28,7,26,27,11,13,29,30,15,8,31],bufferoverflowoccur:20,nx_default_sleep_lin_vel_squar:27,fetchqueri:26,pxoverlapcallback:[4,26],radian:[3,28,22],file:[30,25,7],writebuff:5,surface_type_tarmac:3,hidden:25,must:[2,3,4,5,6,13,8,10,15,26,28,30,31,0,7,11,12,16,18,20,22,29,27,32],etransmit_constraint:[0,4],face:[30,15,25],howstuffwork:3,suspspringforc:4,mswitchtim:3,carsim:3,stopsimul:6,pxvehicledrivegraphchannel:4,emul:[20,26,13,8,27,16],four:[3,6,20,1],mmass:3,decompos:[4,28,22],fact:[3,11,21,31,13],elaps:[9,27,30,8],devis:3,binari:[25,1,7],unvalid:5,"4x4":27,tabl:[5,17],setmassspaceinertiatensor:[3,29],barycentr:20,gettwistangl:16,interfer:3,actora:[27,29],selfcollis:11,settireforceshaderdata:3,inconveni:29,mmaxfilterednormalisedload:3,generatemetadata:2,perman:3,userdata:[3,4,7,8,12,29,31,26],bin:[16,1],subsect:29,origin:[20,22,5,25,7,8,27,28,31,17],ceas:29,keyhol:31,distinct:[3,18,22],nxcooktrianglemesh:27,stream:31,disabl:[31,7],pxcontrollermanag:[4,8],settwistlimit:28,setgroup:27,pxcollectionext:[4,12,13],indirect:[3,32],centric:26,samplebas:[11,6],poli:29,irrespect:[15,29],setgearsdata:3,computegraphchannel:3,pxspatialindex:27,bvh33:[16,13],newbound:5,callbackfinishtask:18,drivable_surfac:3,pole:29,tangentialstiff:4,concret:[12,22,7,26],stepoffset:8,nbpositioniter:29,pxvehiclewheelgraphchannel:[3,4],pxjointlinearlimit:[4,22],preserv:[22,3,5,8,23,12],playback:12,internalfaceindex0:31,had:[3,27,16,13,8],disadvantag:3,pxfilterdata:[27,4,11,17,31],chassisconvexmesh:3,isrigidactor:13,mutual:22,eenable_ccd_frict:31,usual:[0,20,22,3,6,8,10,28,12,29,30,31,26,24],remap:5,workaround:[27,30,31],releaseobject:[12,13],certainli:3,actor0:[4,22],actor1:[4,22],px_define_typeinfo:2,walkabl:25,eforc:29,base:[20,16,2,22,3,4,5,6,7,26,27,11,21,13,29,15,8,31],emerg:[3,20],basi:[3,28,7,11,30,15],clock:3,sever:[18,20,2,3,4,28,8,11,29,30,15,26],wheelmass:3,reach:[18,22,3,5,6,28,7,8,11,30,31,29],nxustream:27,spheregeom:20,lesser:5,send:[0,18,4,13,29,30],overflow:[4,26],tirecontactnorm:4,cuboid:3,"throw":29,pxcudacontextmanag:16,getcurrentgear:3,pxmath:8,parameter:3,runclang_window:2,downcast:27,least:[22,3,5,13,29,9,30,31],decreas:[3,30,31,29],gear:17,shown:[3,4,8],mfrictionvsslipgraph:3,mstiff:3,ediff_type_open_frontwd:3,postsolverveloc:31,diff:3,emax_nb_drivetank_analog_input:4,translateptr:2,vanish:0,etank_wheel_4th_from_front_right:4,nxcookinginterfac:27,ebvh34:[30,5],impract:3,partit:[22,26],surfacetyp:3,albeit:3,caveat:[3,7,29,28,12,15],ediff_type_ls_rearwd:3,restitutionmodifi:22,onconstraintbreak:[18,22],eswing1:22,reflect:[18,3,5,6,8,9,23,31,29],pxtrianglemeshgeometri:[11,5],eswing2:22,quickli:[30,11,17,26],eestim:3,downward:[3,29,8],therebi:[3,18],etrigger_shap:[11,31],pxconvexmesh:[3,30,5,12,7],"0xffffffff":[27,20],tunabl:3,loos:[30,5],threshold:[3,28,31,22,29],pheightfield:5,develop:[0,1,3,5,11,30],contextid:0,aid:[3,4],subgraph:22,simd:[29,5,21],ascii:12,convinc:[31,8],compil:[19,1,4,13,11,12,7,30,16],tend:[12,8],createvehicleactornw:3,subdivid:[27,11],upon:[0,3,11,18,22],contactdesc:29,pxtype:2,wide:[3,27,28,22,29],compens:3,outsid:[21,3,6,13,8,11,12,31],telemetrydata:3,getforc:22,capabl:[18,21,28,7,27,11,30,15],"\u215e":16,registerrefer:2,toward:[22,3,5,8,23,28],pxjointlimitcon:22,sweepresult:3,suppos:29,programm:6,backgroundcolor:3,processcallback:18,exampl:[2,3,4,5,6,8,10,14,15,26,28,30,31,0,1,7,11,12,16,24,18,20,21,22,29,27],createvehicleactor6w:3,ascen:5,tirecontactshap:4,princip:29,myscen:3,repxcollect:4,helloworld:25,repres:[0,18,21,1,22,3,5,28,7,26,11,12,29,30,8],pxclientbehaviorbit:4,etank_wheel_8th_from_front_left:4,enormalized_tireload:4,pxheightfielddesc:[30,5,13],dru:26,gvehicleinputdata:3,addion:5,tirecontactpoint:4,anchor:[4,22],veri:[28,31],pxtransformfromplaneequ:5,iscloth:13,hard:[30,28,21],e6th_from_front_right:4,cookclothfabr:4,pxraycasthitbuff:4,subordin:2,guidanc:13,setupdrivablesurfac:3,setdriveposit:22,pxclothfabriccook:4,pxfoundationvers:16,trajectori:30,problem:[18,22,3,6,8,23,30,31],round:[3,30,20,8],got:[24,16,6],recreat:[30,28,12],modular:3,settessflag:30,none:3,pxscenequeryfiltercallback:[4,13],complic:3,replac:[16,3,4,13,8,27,15,26],registerrepxseri:2,eanalog_input_brake_right:4,topic:[22,29],mscene:[0,9,27,6,29],evehicle_type_drivetank:4,gdefaultallocatorcallback:10,impress:31,eexclude_kinematics_from_active_actor:29,strictli:[3,8],wheelwidth:3,setlinearlimit:[16,22],icon:19,eanalog_input_thrust_right:4,portabl:21,pxprocesspxbasecallback:[27,2],suitabl:[28,21,12],setbreakforc:22,liberti:3,boilerpl:22,px_def_bin_metadata_vclass:2,addforc:[18,4,29],mesh:[25,7],deactiv:22,pxconvexmeshdesc:5,everyth:[20,5,17,8],fourth:29,velocitythreshold:22,nbsprungmass:3,progress:26,mphysic:[4,29,6,10],edisable_clean_mesh:[5,13],pointer:[2,22,3,4,5,7,26,27,11,13,31,16],thei:[18,27,20,22,3,4,6,28,7,8,9,11,12,13,29,30,31,26],diff4w:3,them:[0,18,20,21,1,22,3,4,5,6,7,26,27,11,12,29,30,31,8],eanalog_input__handbrak:3,setupfilt:11,focu:3,contrari:[31,8],setdistancelimit:[16,22],call:[2,3,4,5,6,13,8,9,10,26,28,30,31,0,1,7,11,12,16,24,18,20,21,22,29,27],echannel_norm_tire_aligning_mo:[3,4],pxvehicledrivetank:4,appear:[19,22,3,29,27,30,31,8],worker:[18,30,6,7,29],instruct:[30,7],inertiascal:31,seri:[3,2],improv:[22,3,4,5,6,26,30,31,29],contactrecord:29,nx_physics_sdk_vers:27,except:[20,1,22,3,6,8,12],fight:22,resourc:[18,10,30,6,29],pxvehicleackermanngeometrydata:17,abcdabcdabcdabcd:30,deliv:3,setmaxdepenetrationveloc:30,overrul:31,most:[0,19,20,21,1,22,3,4,5,28,7,26,11,12,29,30,31,8],inher:3,explain:[3,5,6],pxbroadphaseregion:11,getpvdconnectionfactori:4,word1:[3,11,26],pxfrictiontyp:4,welcom:[25,17],edit:3,halfforwardext:8,landscap:3,getslopelimit:8,word3:[3,26],diagram:[3,9,20,30],delta:[29,8],partner:18,substep:[27,25,31,17,30],all:[2,3,4,5,6,13,8,9,10,14,15,26,28,30,31,0,1,7,11,12,16,18,20,21,22,23,29,27,32],main:[18,20,21,3,4,6,28,13,8,9,11,12,26],snippetnodr:3,demonst:3,behavior:[25,7],profilerdata:0,clear:[3,4,13],ruler:3,occup:26,getinboundjoint:28,nativ:[15,28],around:[20,21,22,3,5,28,8,11,12,30,29],linear0:22,higher:[30,28,1],getnbshap:11,setupnondrivablesurfac:3,unitdir:[20,26],anoth:[18,20,2,22,3,4,5,6,7,8,27,11,12,30,29],npc:8,entri:[3,11,28,7],drivesimdata4w:3,dead:22,"75f":[3,4],savecel:5,pxfilterobjectattribut:[3,4,11,27,31],cosf:8,clean:[10,5,13]},filenames:["Manual/VisualDebugger","Manual/BuildingWithPhysX","Manual/ExtendingSerialization","Manual/Vehicles","Manual/MigrationTo33","Manual/Geometry","Manual/Threading","Manual/API","Manual/CharacterControllers","Manual/RigidBodyOverview","Manual/Startup","Manual/RigidBodyCollision","Manual/Serialization","Manual/MigrationTo34","Manual/DebugVisualization","Manual/GPURigidBodies","Manual/MigrationTo40","Manual/Index","Manual/Simulation","Manual/HelloWorld","Manual/GeometryQueries","Manual/Introduction","Manual/Joints","Manual/OriginShift","Manual/Statistics","Index","Manual/SceneQueries","Manual/MigrationFrom28","Manual/Articulations","Manual/RigidBodyDynamics","Manual/BestPractices","Manual/AdvancedCollisionDetection","Manual/License"],objnames:{},titles:["PhysX Visual Debugger (PVD)","Building with PhysX","Extending Serialization","Vehicles","Migrating From PhysX SDK 3.2 to 3.3","Geometry","Threading","The PhysX API","Character Controllers","Rigid Body Overview","Startup and Shutdown","Rigid Body Collision","Serialization","Migrating From PhysX SDK 3.3 to 3.4","Debug Visualization","GPU Rigid Bodies","Migrating From PhysX SDK 3.4 to 4.0","User's Guide","Simulation","Snippets","Geometry Queries","Welcome to PhysX","Joints","Scene Origin","Simulation Statistics","NVIDIA PhysX SDK 4.0 Documentation","Scene Queries","Migrating From PhysX SDK 2.x to 3.x","Articulations","Rigid Body Dynamics","Best Practices Guide","Advanced Collision Detection","PhysX License"],titleterms:{"final":30,introduct:[20,2,3,5,6,7,8,9,10,11,12,14,30,15,26],tree:30,gpu:[30,15],tessel:8,applic:[12,7],compon:10,system:[30,2],graphic:8,rigid:[9,30,11,15,29],trigger:[11,31],down:[3,10],serial:[18,4,27,12,2],kinemat:[30,11,29,8],activ:29,user:[21,17,7,26],heightfield:[27,30,20,5],creation:[3,27],featur:27,telemetri:3,releas:[30,11],domin:29,pxscene:30,setup:0,setupdrivesimdata:3,load:10,convex:[30,20,5],pose:[3,27],result:26,from:[27,4,16,6,13],enon:26,paramet:[27,30],forc:[22,29],pxpruningstructur:26,pxvehicledifferential4wdata:3,memori:[18,6,7,27,12,30],flush:30,pxvehicleclutchdata:3,snippetvehiclemultithread:3,explod:30,cmake:[16,1],mass:[22,29],seidel:29,refus:3,descript:27,invis:8,header:[27,1],pitfal:20,detect:[27,31],pxvehiclegearsdata:3,dynam:[30,28,29,8],sweep:[27,20,26],shutdown:[27,10],shift:23,foundat:10,runtim:[30,8],actor:[27,29,8],overview:[9,21,2],pxvehiclenodr:3,pxgeometryqueri:20,broad:[30,11],charact:[30,4,8],pxarticulationcach:28,project:[28,1,22],quickhul:5,drive:[28,22],iter:29,than:8,anisotrop:27,hidden:8,identifi:30,correct:3,other:[18,12],redistribut:[27,1],estatic_aabb_tre:26,obstacl:8,taskmanag:[4,6],licens:32,involv:6,pxqueryflag:26,phase:[30,11],tune:[3,15,31],task:[18,6],bottleneck:30,implement:6,eany_hit:26,notif:31,pxpruningstructuretyp:26,particl:[16,13],heighfield:11,indic:5,flag:[0,30],under:3,precis:20,distribut:16,share:30,snippethelloworld:19,pxvehicletiredata:3,"function":[20,22,26],skin:27,deseri:12,"class":[27,4,2,7],object:[21,6,7,8,9,12,30,26],capsul:[20,5,8],gener:[3,5,1,13,26],triangl:[11,5,20],slow:[3,30],first:[3,12],width:27,vertex:5,world:21,persist:31,snippetvehicle4w:3,updat:[3,8],overlap:[27,30,20,8,26],pxvehicleautoboxdata:3,pvd:[0,18,12],invalid:8,spu:4,best:[30,6,12,22],document:25,vector:8,pxvehiclewheelssimdata:3,prealloc:30,tire:3,provid:5,appli:29,auto:8,when:[3,30],physxfound:10,filter:[3,27,11,26],tabl:22,curv:3,field:[20,5],unnatur:3,convent:27,determin:[30,7,29],realist:30,perform:[30,15,31,8],actuat:22,loop:9,turn:3,epostfilt:26,fetchresult:18,algorithm:[3,30,11,5],helloworld:19,serializ:12,connect:[0,7],pxgeometryhold:5,rare:8,core:[4,16,13],interact:[11,8],preset:1,valu:8,acceler:[3,15],thread:[3,18,6,30],revolut:22,custom:[0,1,2,22],shape:[27,30,11,31],assert:7,debug:[27,30,14],setupwheelssimulationdata:3,state:[3,29],onli:5,repx:[4,12,2],multipl:[3,6,26],nxutillib:27,buffer:[18,30,6],region:11,simul:[18,27,6,9,11,30,24],feel:3,integr:18,drivabl:3,prismat:22,alloc:30,rotat:3,pxvehicleantirollbar:3,tight:30,distanc:22,restitut:29,properti:29,estim:3,proper:30,materi:27,lightcputask:6,manifold:31,refer:[3,12,13,7],stream:[0,30],advanc:[0,3,31],troubleshoot:[3,12,8],cach:[8,12,26],report:[27,31,22,7],tempor:29,control:[3,4,30,8],inform:31,pxshare:16,height:[20,5],environ:30,goe:[3,8],access:6,narrow:30,count:[12,13,7],interest:11,pxvehicledrivenw:3,cast:[27,7],point:5,dll:10,hit:[8,26],popul:11,"case":[12,8],stack:30,bodi:[9,30,11,15,29],consider:[15,6],sim:18,retarget:12,merg:26,prepar:22,registr:5,hierarchi:27,recoveri:8,immedi:29,semant:6,metadata:2,steer:3,snippetvehiclenodr:3,intern:8,design:27,contact:[3,31,5,13],incomplet:12,brief:21,level:[3,12],edynamic_aabb_tre:26,common:12,chang:[27,29],snippetseri:12,extract:31,joint:[27,30,28,22],geometri:[20,5,12,8],polygon:5,requir:12,pxdelayloadhook:10,unbound:26,nvidia:25,remov:[27,6],file:[0,1],concept:3,pxvehiclesuspensiondata:3,move:[3,8],insert:[30,11],face:5,math:[4,7],defin:[22,26],sleep:[30,28,29],corner:3,engin:3,profil:[0,30],pxsceneflag:13,swap:3,offset:8,unit:[3,7],articul:28,pair:30,cook:[5,13,27,10,12,30],part:8,snippetconvert:12,tank:3,collect:12,slope:3,torqu:[3,29],secur:10,reconnect:12,ani:26,shader:[3,22],error:[0,27,7,30],interfac:24,practic:[30,6,12],mid:30,jitter:30,place:18,disabl:[3,30],fix:[22,26],veloc:[28,29],scale:[5,22],extrapol:3,more:3,eno_block:26,creat:[11,8,28,26],despair:30,depth:[30,20,12],pxvehicledrive4w:3,consid:30,shut:10,vehicl:[3,4],issu:[30,8],extern:0,beyond:3,deprec:[16,13],against:20,pxvehicleenginedata:3,what:[19,15],snippetloadcollect:12,breakag:22,ccd:[4,31],api:[2,4,13,27,12,7,16],multithread:[27,6],pxvehiclewheeldata:3,physxcommon:10,detail:3,pxrigidactor:13,modif:[31,5],how:26,doubl:6,solver:[22,29],specul:31,cct:8,pcm:31,walkabl:8,doe:3,data:[0,18,22,5,6,12,30,31],scene:[18,23,4,6,13,26,27,11,30],order:3,configur:[1,22],cloth:[4,16],name:12,synchron:18,ackermann:3,closest:26,getactor:30,side:0,unlock:22,reduc:[30,28],eprefilt:26,volum:8,differ:7,pruner:30,continu:[27,31],behavior:[30,31,22,8],well:30,maxim:28,quantiz:30,amort:11,batch:[4,26],aggreg:11,rate:30,pxextens:13,kill:30,too:3,discuss:12,mode:[3,29,8,26],origin:23,welcom:21,bewar:30,edeprecated_trigger_trigger_report:13,constraint:22,axi:29,differenti:3,asynchron:6,much:3,deform:[27,5],singl:26,never:3,convert:12,render:[27,21],anti:3,complet:[18,12],pxgpuloadhook:10,tweak:30,lock:[6,22,29],pxmeshqueri:20,compart:27,build:[30,16,1],cpudispatch:6,step:[27,8],clutch:3,"16k":18,about:21,check:30,pxfilterdata:26,plane:[11,5,20],guid:[3,30,21,17],spheric:22,collis:[27,11,31],physx:[0,19,21,1,2,4,25,7,32,27,12,13,16],enhanc:29,split:18,bar:3,quickli:3,over:[3,8],model:[3,9],raycast:[3,27,20,31,26],time:[27,8],debugg:[0,4,13,19],pxbvhstructur:26,estat:26,surfac:[3,29],snippetvehiclescal:3,process:26,extens:[27,4,16,10],statist:24,coordin:[30,28],callback:[18,8,27,11,30,26],wall:8,extend:[0,2],rebuild:30,earli:30,pile:30,bound:30,penetr:[30,20],addit:20,delai:10,manag:[27,12,22,7,8],limit:[18,30,31,5,22],graviti:29,ground:30,roll:3,jitteri:3,gear:3,block:[18,26],option:10,specif:12,sluggish:3,forward:3,slowli:3,snippet:[3,19,12],pxrigiddynam:3,type:[3,27,5,7],preprocess:22,wheel:3,soft:22,basic:[0,19,27,22,26],event:[6,29],invers:28,index:27,visual:[0,19,21,22,4,13,14,30],comput:30,guidelin:6,scenequeri:0,basetask:6,edynam:26,subordin:12,startup:[27,10],usag:[30,24,14],pxvehicledrivetank:3,sdk:[0,3,4,25,13,27,10,16],network:0,extra:31,createshap:13,snippetvehicletank:3,multi:[3,30],posit:28,friction:[3,27,29],pxscenequeryupdatemod:26,sequenc:18,sphere:[20,5],everyth:12,modul:8,pvdclient:0,steep:3,initi:[3,20],scratch:[18,30],version:[3,12],touch:26,implic:[30,31],migrat:[27,4,16,13],pxvehicleackermanngeometrydata:3,upgrad:12,physic:[30,10],setupvehicleactor:3,climb:[3,8],substep:18,queri:[20,3,4,13,26,27,11,30],tool:12,pxvehicletireloadfilterdata:3,veri:30,hard:22,test:[27,30],sampl:30,through:8,gauss:29,higher:8,box:[20,5],spin:[3,30],car:3,mesh:[30,11,5,20],binari:[4,12,2],code:[3,12],unstabl:30,thi:21,platform:12},objtypes:{},envversion:43,objects:{}})
\ No newline at end of file
diff --git a/physx/generate_projects.sh b/physx/generate_projects.sh
old mode 100644
new mode 100755
diff --git a/physx/include/PxArticulationJoint.h b/physx/include/PxArticulationJoint.h
index c20699ba0..8a423662c 100644
--- a/physx/include/PxArticulationJoint.h
+++ b/physx/include/PxArticulationJoint.h
@@ -60,10 +60,8 @@ struct PxArticulationJointDriveType
 {
 	enum Enum
 	{
-		eNONE 	= 0,
-		eTARGET = 1,			// use the quaternion as the drive target
-		eERROR 	= 2,				// use the vector part of the quaternion as the drive error.
-		eTARGETVELOCITY = 3
+		eTARGET = 0,			// use the quaternion as the drive target
+		eERROR 	= 1				// use the vector part of the quaternion as the drive error.
 	};
 };
 
diff --git a/physx/include/PxArticulationReducedCoordinate.h b/physx/include/PxArticulationReducedCoordinate.h
index 0d77e7cfc..60abdaab3 100644
--- a/physx/include/PxArticulationReducedCoordinate.h
+++ b/physx/include/PxArticulationReducedCoordinate.h
@@ -149,14 +149,22 @@ namespace physx
 
 
 		/**
-		\brief copy the internal data of the articulation to the cache
+		\brief Sets flags on the articulation
+
+		\param[in] flags Articulation flags
 
-		\param[in] flags this indicate whether the root link is fixed
-		
-		@see createCache applyCache
 		*/
 		virtual		void							setArticulationFlags(PxArticulationFlags flags) = 0;
 
+		/**
+		\brief Raises or clears a flag on the articulation
+
+		\param[in] flag The articulation flag
+		\param[in] value true/false indicating whether to raise or clear the flag
+
+		*/
+		virtual		void							setArticulationFlag(PxArticulationFlag::Enum flag, bool value) = 0;
+
 		/**
 		\brief return PxArticulationFlags
 		*/
@@ -174,6 +182,11 @@ namespace physx
 		*/
 		virtual		PxArticulationCache*			createCache() const = 0;
 
+		/**
+		\brief Get the size of the articulation cache
+
+		\note this call may only be made on articulations that are in a scene, and may not be made during simulation
+		*/
 		virtual		PxU32							getCacheDataSize() const = 0;
 
 		/**
diff --git a/physx/include/PxRigidDynamic.h b/physx/include/PxRigidDynamic.h
index e44fb4990..de0027f66 100644
--- a/physx/include/PxRigidDynamic.h
+++ b/physx/include/PxRigidDynamic.h
@@ -126,25 +126,6 @@ class PxRigidDynamic : public PxRigidBody
 	*/
 	virtual		bool				getKinematicTarget(PxTransform& target)	const	= 0;
 
-	/**
-	\brief Sets a surface velocity on a kinematic actor. This velocity is responded to through the solver but the kinematic actor will not move.
-
-	Only has effect on kinematic actors. Will raise a warning if it is called on non-kinematic actors.
-
-	\param[in] linearVelocity The target linear velocity
-	\param[in] angularVelocity The target angular velocity
-
-	These velocities will be retained until either a kinematic target is set of clearKinematicSurfaceVelocity is set.
-	*/
-	virtual		void				setKinematicSurfaceVelocity(const PxVec3& linearVelocity, const PxVec3& angularVelocity) = 0;
-
-	/**
-	\brief Clears kinematic surface velocity.
-
-	Only has effect on kinematic actors. Will raise a warning if it is called on non-kinematic actors.
-	*/
-	virtual		void				clearKinematicSurfaceVelocity() = 0;
-
 
 /************************************************************************************************/
 /** @name Sleeping
diff --git a/physx/include/geometry/PxMeshQuery.h b/physx/include/geometry/PxMeshQuery.h
index 20681e623..f31319df2 100644
--- a/physx/include/geometry/PxMeshQuery.h
+++ b/physx/include/geometry/PxMeshQuery.h
@@ -86,7 +86,19 @@ class PxMeshQuery
 	\param[out] adjacencyIndices Returned 3 triangle adjacency triangle indices (0xFFFFFFFF if no adjacency).
 
 	\note This function will flip the triangle normal whenever triGeom.scale.hasNegativeDeterminant() is true.
-
+	\note TriangleIndex is an index used in internal format, which does have an index out of the bounds in last row.
+			To traverse all tri indices in the HF, the following code can be applied:
+			for (PxU32 row = 0; row < (nbRows - 1); row++)
+			{
+				for (PxU32 col = 0; col < (nbCols - 1); col++)
+				{
+					for (PxU32 k = 0; k < 2; k++)
+					{ 
+						const PxU32 triIndex = 2 * (row*nbCols + col) + k; 
+						....
+					}
+				}
+			}
 	@see PxTriangle PxTriangleFlags PxTriangleID findOverlapHeightField()
 	*/
 	PX_PHYSX_COMMON_API static void getTriangle(const PxHeightFieldGeometry& hfGeom, const PxTransform& transform, PxTriangleID triangleIndex, PxTriangle& triangle, PxU32* vertexIndices=NULL, PxU32* adjacencyIndices=NULL);
diff --git a/physx/include/solver/PxSolverDefs.h b/physx/include/solver/PxSolverDefs.h
index 0b676f4ab..a64edad9e 100644
--- a/physx/include/solver/PxSolverDefs.h
+++ b/physx/include/solver/PxSolverDefs.h
@@ -88,7 +88,7 @@ struct PxSolverBodyData
 	PxReal			maxContactImpulse;		//!< 80 the max contact impulse
 	PxTransform		body2World;				//!< 108 the body's transform
 	PxU16			lockFlags;				//!< 110 lock flags
-	PxU16			hasSurfaceVelocity;		//!< 112 surfaceVelocity
+	PxU16			pad;					//!< 112 pad
 
 	PX_FORCE_INLINE PxReal projectVelocity(const PxVec3& lin, const PxVec3& ang)	const
 	{
diff --git a/physx/release_notes.html b/physx/release_notes.html
index 239a389d0..05cc418f5 100644
--- a/physx/release_notes.html
+++ b/physx/release_notes.html
@@ -1,92 +1,105 @@
 <html>
 
 <head>
-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
-<title>Release Notes - NVIDIA PhysX SDK 4.0</title>
-<style type="text/css">
-	body
-	{
-		font-family: 'Trebuchet MS', 'DIN Pro', sans-serif;
-		font-size:90%;
-	}
-	h1
-	{
-		color: #000000;
-
-	}
-	h2, h3, h4, h5
-	{
-		color: #76b900;
-		margin-bottom: 0px;
-	}
-	ul
-	{
-		margin-top: 5px;
-	}
-</style>
+	<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+	<title>Release Notes - NVIDIA PhysX SDK 4.0</title>
+	<style type="text/css">
+		body {
+			font-family: 'Trebuchet MS', 'DIN Pro', sans-serif;
+			font-size: 90%;
+		}
+
+		h1 {
+			color: #000000;
+		}
+
+		h2, h3, h4, h5 {
+			color: #76b900;
+			margin-bottom: 0px;
+		}
+
+		ul {
+			margin-top: 5px;
+		}
+	</style>
 </head>
 
 <body>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 4.0</h1>
-<h2 style="TEXT-ALIGN: center">December 2018</h2>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 4.0</h1>
+	<h2 style="TEXT-ALIGN: center">December 2018</h2>
+
+	<h2>Supported Platforms</h2>
 
-    <h2>Supported Platforms</h2>
-	
 	<table cellspacing=3>
 		<tr>
 			<th width="24" />
 			<th align="left">Runtime</th>
 			<th width="18" />
-			<th align="left">Development</th> 
+			<th align="left">Development</th>
 		</tr>
 		<tr>
-			<td /><td>Apple iOS (tested on 12.1)</td>			
-			<td /><td>Xcode (tested with 10.1)</td> 
+			<td />
+			<td>Apple iOS (tested on 12.1)</td>
+			<td />
+			<td>Xcode (tested with 10.1)</td>
 		</tr>
 		<tr>
-			<td /><td>Apple macOS (tested on 10.13)</td>			
-			<td /><td>Xcode (tested with 10.1)</td> 
+			<td />
+			<td>Apple macOS (tested on 10.13)</td>
+			<td />
+			<td>Xcode (tested with 10.1)</td>
 		</tr>
 		<tr>
-			<td /><td>Google Android ARM (tested with API Level 19 - KITKAT)</td>
-			<td /><td>NDK r13b</td> 
+			<td />
+			<td>Google Android ARM (tested with API Level 19 - KITKAT)</td>
+			<td />
+			<td>NDK r13b</td>
 		</tr>
 		<tr>
-			<td /><td>Linux (tested on Ubuntu 16.04), GPU acceleration: display driver and GPU supporting CUDA 10 / CUDA ARCH 3.0</td>
-			<td /><td>Clang (tested with 3.8)</td> 
+			<td />
+			<td>Linux (tested on Ubuntu 16.04), GPU acceleration: display driver and GPU supporting CUDA 10 / CUDA ARCH 3.0</td>
+			<td />
+			<td>Clang (tested with 3.8)</td>
 		</tr>
 		<tr>
-			<td /><td>Microsoft Windows, GPU acceleration: display driver and GPU supporting CUDA 10 / CUDA ARCH 3.0</td>
-			<td /><td>Microsoft Visual Studio 2013, 2015, 2017</td> 
+			<td />
+			<td>Microsoft Windows, GPU acceleration: display driver and GPU supporting CUDA 10 / CUDA ARCH 3.0</td>
+			<td />
+			<td>Microsoft Visual Studio 2013, 2015, 2017</td>
 		</tr>
 		<tr>
-			<td /><td>Microsoft XBox One*</td>
-			<td /><td></td> 
+			<td />
+			<td>Microsoft XBox One*</td>
+			<td />
+			<td></td>
 		</tr>
 		<tr>
-			<td /><td>Nintendo Switch*</td>
-			<td /><td></td> 
+			<td />
+			<td>Nintendo Switch*</td>
+			<td />
+			<td></td>
 		</tr>
 		<tr>
-			<td /><td>Sony Playstation 4*</td>
-			<td /><td></td> 
+			<td />
+			<td>Sony Playstation 4*</td>
+			<td />
+			<td></td>
 		</tr>
 	</table>
-	
-	* Console code subject to platform NDA not available on GitHub.  Developers licensed by respective platform owners please contact NVIDIA for access. 
+
+	* Console code subject to platform NDA not available on GitHub.  Developers licensed by respective platform owners please contact NVIDIA for access.
 
 	<h2>Known Issues</h2>
 	<blockquote>
-	<ul>
-	</ul>
+		<ul></ul>
 	</blockquote>
 	<h2>Changes and Resolved Issues</h2>
 	<blockquote>
 
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
 
 	</blockquote>
 
@@ -94,106 +107,111 @@ <h3>General</h3>
 	<ul>
         <li>Added:</li>
         <ul>
-            <li>A new substepping rigid body solver, which can produce improved convergence compared to the default rigid body solver.</li>
-            <li>Optional torsional friction model to simulate rotational friction when there is just a single point of contact.</li>
 			<li>A new broadphase implementation has been added. See PxBroadPhaseType::eABP for details. This is now the default broadphase implementation.</li>
-        </ul>
+		</ul>
 		<li>Removed:</li>
 		<ul>
 			<li>PhysX Particle feature.</li>
 			<li>PhysX Cloth feature.</li>
-            <li>The deprecated active transforms feature has been removed. Please use active actors instead.</li>
+			<li>The deprecated active transforms feature has been removed. Please use active actors instead.</li>
 			<li>The deprecated multi client behavior feature has been removed.</li>
 			<li>The deprecated legacy heightfields have been removed.</li>
 		</ul>
 		<li>Changed:</li>
 		<ul>
-			<li>The PhysX SDK build system is now based on CMake generated build configuration files. For more details, please refer to the PhysX SDK 4.0 Migration Guide.</li> 
+			<li>The PhysX SDK build system is now based on CMake generated build configuration files. For more details, please refer to the PhysX SDK 4.0 Migration Guide.</li>
 			<li>The Linux build has been changed to produce static as opposed to shared libraries. The compiler was switched from GCC to Clang.</li>
 			<li>The PxShared library contains functionality shared beyond the PhysX SDK. It has been streamlined to a minimal set of headers. The PxFoundation singleton has been moved back to the PhysX SDK, as well as the task manager, CUDA context manager and PhysX Visual Debugger (PVD) functionality.</li>
 			<li>PhysXDelayLoadHook and PhysXGpuLoadHook have been simplified, and PxFoundationDelayLoadHook has been removed.</li>
 		</ul>
 	</ul>
 
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Added:</li>
-        <ul>
- 			<li>A missing PxShape::getReferenceCount() function has been added.</li>
-        </ul>
-        <li>Removed:</li>
-        <ul>
-            <li>PxVisualizationParameter::eDEPRECATED_BODY_JOINT_GROUPS has been removed.</li>
-            <li>PxSceneDesc::maxNbObjectsPerRegion has been removed.</li>
-            <li>PxRigidActor::createShape() has been removed. Please use PxPhysics::createShape() or PxRigidActorExt::createExclusiveShape() instead</li>
+	<h3>Rigid Bodies</h3>
+	<ul>
+		<li>Added:</li>
+		<ul>
+			<li>A missing PxShape::getReferenceCount() function has been added.</li>
+			<li>TGS: A new rigid body solver, which can produce improved convergence compared to the default rigid body solver.</li>
+            <li>Optional torsional friction model to simulate rotational friction when there is just a single point of contact.</li>
+            <li>A flag to enable friction constraints to be processed every frame has been added</li>
+            <li>PxContactJoint added to represent contacts in inverse dynamics. Not intended for use in simulation.</li>
+		</ul>
+		<li>Removed:</li>
+		<ul>
+			<li>PxVisualizationParameter::eDEPRECATED_BODY_JOINT_GROUPS has been removed.</li>
+			<li>PxSceneDesc::maxNbObjectsPerRegion has been removed.</li>
+			<li>PxRigidActor::createShape() has been removed. Please use PxPhysics::createShape() or PxRigidActorExt::createExclusiveShape() instead</li>
 			<li>The deprecated mass parameter in PxTolerancesScale has been removed.</li>
 			<li>PxSceneFlag::eDEPRECATED_TRIGGER_TRIGGER_REPORTS has been removed.</li>
-        </ul>			
-        <li>Changed:</li>
-        <ul>
-            <li>Aggregates can now contain more than 128 actors.</li>
-            <li>Switching a kinematic object to dynamic does not automatically wake up the object anymore. Explicit calls to PxRigidDynamic::wakeUp() are now needed.</li>
-            <li>Switching a kinematic object to dynamic re-inserts the object into the broadphase, producing PxPairFlag::eNOTIFY_TOUCH_FOUND events instead of PxPairFlag::eNOTIFY_TOUCH_PERSISTS events.</li>
-            <li>PxConvexMeshGeometryFlag::eTIGHT_BOUNDS is now enabled by default for PxConvexMeshGeometry.</li>
-            <li>The default max angular velocity for rigid bodies has been changed, from 7 to 100.</li>
-			
-        </ul>
+		</ul>
+		<li>Changed:</li>
+		<ul>
+			<li>Aggregates can now contain more than 128 actors.</li>
+			<li>Switching a kinematic object to dynamic does not automatically wake up the object anymore. Explicit calls to PxRigidDynamic::wakeUp() are now needed.</li>
+			<li>Switching a kinematic object to dynamic re-inserts the object into the broadphase, producing PxPairFlag::eNOTIFY_TOUCH_FOUND events instead of PxPairFlag::eNOTIFY_TOUCH_PERSISTS events.</li>
+			<li>PxConvexMeshGeometryFlag::eTIGHT_BOUNDS is now enabled by default for PxConvexMeshGeometry.</li>
+			<li>The default max angular velocity for rigid bodies has been changed, from 7 to 100.</li>
+
+		</ul>
 	</ul>
-	
-	
-    <h3>Extensions</h3>
-    <ul>
-        <li>Added:</li>
-        <ul>
-            <li>PxD6Joint now supports per-axis linear limit pairs.</li>
-            <li>Added PxSphericalJoint::getSwingYAngle and PxSphericalJoint::getSwingZAngle.</li>
-            <li>Added PxD6Joint distance limit debug visualization.</li>
-            <li>Added PxD6JointCreate.h file with helper functions to setup the D6 joint in various common configurations.</li>
-            <li>Added pyramidal swing limits to the D6 joint.</li>
-        </ul>
-		
+
+
+	<h3>Extensions</h3>
+	<ul>
+		<li>Added:</li>
+		<ul>
+			<li>PxD6Joint now supports per-axis linear limit pairs.</li>
+			<li>Added PxSphericalJoint::getSwingYAngle and PxSphericalJoint::getSwingZAngle.</li>
+			<li>Added PxD6Joint distance limit debug visualization.</li>
+			<li>Added PxD6JointCreate.h file with helper functions to setup the D6 joint in various common configurations.</li>
+			<li>Added pyramidal swing limits to the D6 joint.</li>
+		</ul>
+
 		<li>Removed:</li>
 		<ul>
-            <li>PxComputeHeightFieldPenetration has a new signature.</li>
-            <li>PxComputeMeshPenetration has been removed. Use PxComputeTriangleMeshPenetration instead.</li>
-        </ul>
+			<li>PxComputeHeightFieldPenetration has a new signature.</li>
+			<li>PxComputeMeshPenetration has been removed. Use PxComputeTriangleMeshPenetration instead.</li>
+		</ul>
 
-        <li>Changed:</li>
-        <ul>
-            <li>PxRevoluteJoint now properly supports a -PI*2 to +PI*2 range for its limits, and the accuracy of limits has been improved.</li>
-            <li>PxD6Joint now properly supports a -PI*2 to +PI*2 range for its twist limit, and the accuracy of the twist limit has been improved.</li>
-            <li>The accuracy of the D6 joint swing limits has been improved.</li>
-            <li>PxDistanceJoint does now always insert constraint row, this change does increase the limit precision.</li>
-            <li>PxDistanceJoint::getDistance does not anymore return squared distance.</li>
-            <li>PxD6Joint::setDriveVelocity, PxD6Joint::setDrivePosition and PxRevoluteJoint::setDriveVelocity have now additional parameter autowake, which will wake the joint rigids up if true (default behavior).</li>
-        </ul>
-
-        <li>Deprecated:</li>
-        <ul>
-            <li>PxD6Joint::getTwist() has been deprecated. Please use PxD6Joint::getTwistAngle() now.</li>
-            <li>The previous PxD6Joint::setLinearLimit() and PxD6Joint::getLinearLimit() functions (supporting a single linear limit value) have been deprecated. Please use PxD6Joint::setDistanceLimit() and PxD6Joint::getDistanceLimit() instead. Or you can also use the new PxD6Joint::setLinearLimit() and PxD6Joint::getLinearLimit() functions, which now support pairs of linear limit values.</li>
-        </ul>
-    </ul>
+		<li>Changed:</li>
+		<ul>
+			<li>PxRevoluteJoint now properly supports a -PI*2 to +PI*2 range for its limits, and the accuracy of limits has been improved. In order to use extended limit ranges, PxConstraintFlag::eENABLE_EXTENDED_LIMITS must be raised on the constraint.</li>
+			<li>PxD6Joint now properly supports a -PI*2 to +PI*2 range for its twist limit, and the accuracy of the twist limit has been improved. In order to use extended limit ranges, PxConstraintFlag::eENABLE_EXTENDED_LIMITS must be raised on the constraint.</li>
+			<li>The accuracy of the D6 joint swing limits has been improved.</li>
+			<li>PxDistanceJoint does now always insert constraint row, this change does increase the limit precision.</li>
+			<li>PxDistanceJoint::getDistance does not anymore return squared distance.</li>
+			<li>PxD6Joint::setDriveVelocity, PxD6Joint::setDrivePosition and PxRevoluteJoint::setDriveVelocity have now additional parameter autowake, which will wake the joint rigids up if true (default behavior).</li>
+            <li>Joint shaders now take a bool to define whether to use extended joint limits or not.</li>
+            <li>Joint shaders must now provide the cA2w and cB2w vectors, defining the world-space location of the joint anchors for both bodies.</li>
+		</ul>
+
+		<li>Deprecated:</li>
+		<ul>
+			<li>PxD6Joint::getTwist() has been deprecated. Please use PxD6Joint::getTwistAngle() now.</li>
+			<li>The previous PxD6Joint::setLinearLimit() and PxD6Joint::getLinearLimit() functions (supporting a single linear limit value) have been deprecated. Please use PxD6Joint::setDistanceLimit() and PxD6Joint::getDistanceLimit() instead. Or you can also use the new PxD6Joint::setLinearLimit() and PxD6Joint::getLinearLimit() functions, which now support pairs of linear limit values.</li>
+		</ul>
+	</ul>
 
 	<h3>Articulations</h3>
 	<ul>
 		<li>Added:</li>
 		<ul>
-            <li>PxArticulationJoint::getParentArticulationLink and PxArticulationJoint::getChildArticulationLink has been added.</li>
-        </ul>
+			<li>New reduced coordinate articulation implementation, supporting a wider range of joint types, more accurate drive model, inverse dynamics and joint torque control.</li>
+			<li>PxArticulationJoint::getParentArticulationLink and PxArticulationJoint::getChildArticulationLink has been added.</li>
+		</ul>
 	</ul>
 
 	<h3>Scene queries</h3>
 	<ul>
 		<li>Removed:</li>
 		<ul>
-            <li>PxHitFlag::eDISTANCE has been removed.</li>
-            <li>The PxVolumeCache feature has been removed.</li>
-            <li>The PxSpatialIndex feature has been removed.</li>
+			<li>PxHitFlag::eDISTANCE has been removed.</li>
+			<li>The PxVolumeCache feature has been removed.</li>
+			<li>The PxSpatialIndex feature has been removed.</li>
 			<li>The deprecated PxSceneFlag::eSUPPRESS_EAGER_SCENE_QUERY_REFIT has been removed.</li>
-        </ul>
+		</ul>
 	</ul>
-	
+
 	<h3>Cooking</h3>
 	<ul>
 		<li>Added:</li>
@@ -202,1116 +220,580 @@ <h3>Cooking</h3>
 		</ul>
 		<li>Removed:</li>
 		<ul>
-            <li>PxPlatform enum has been removed. PhysX supported platforms all share the same endianness</li>
+			<li>PxPlatform enum has been removed. PhysX supported platforms all share the same endianness</li>
 			<li>The deprecated PxCookingParams::meshCookingHint and PxCookingParams::meshSizePerformanceTradeOff parameters have been removed.</li>
 			<li>The deprecated PxGetGaussMapVertexLimitForPlatform has been removed, use PxCookingParams::gaussMapLimit instead.</li>
 			<li>The deprecated PxConvexMeshCookingType::eINFLATION_INCREMENTAL_HULL and PxCookingParams::skinWidth have been removed.</li>
 			<li>PxBVH34MidphaseDesc::numTrisPerLeaf has been renamed to PxBVH34MidphaseDesc::numPrimsPerLeaf</li>
-        </ul>
+		</ul>
 	</ul>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-   <!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-    <h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.2.????????</h1>
-    <h2 style="TEXT-ALIGN: center">???????</h2>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-    <h3>General</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-         	<li>PxMeshQuery::getTriangle adjacency information for heightfield geometry fixed.</li>
-        </ul>
-    </ul>
-
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Fixed a divide by zero bug when gjk is trying to calculate the barycentric coordinate for two identical/nearly identical points. </li>
-        </ul>
-        <li>Added:</li>
-        <ul>
-        </ul>
-    </ul>
-
-    <h3>Cooking</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-        </ul>
-        <li>Added:</li>
-        <ul>
-        </ul>
-    </ul>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.2.25354359</h1>
+	<h2 style="TEXT-ALIGN: center">December 2018</h2>
 
-	<h3>Character controller</h3>
+	<h2>General</h2>
 	<ul>
-		<li>Fixed:</li>
+		<li>Changed:</li>
 		<ul>
+			<li>Changed GitHub distribution to BSD license.</li>
 		</ul>
 	</ul>
 
-    <h3>Scene query</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-       </ul>
-    </ul>
+	<h2>Supported Platforms</h2>
 
-   <!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<table cellspacing=3>
+		<tr>
+			<th width="24" />
+			<th align="left">Runtime</th>
+			<th width="18" />
+			<th align="left">Development</th>
+		</tr>
+		<tr>
+			<td />
+			<td>Apple iOS (tested on 12.1)</td>
+			<td />
+			<td>Xcode (tested with 10.1)</td>
+		</tr>
+		<tr>
+			<td />
+			<td>Apple macOS (tested on 10.13)</td>
+			<td />
+			<td>Xcode (tested with 10.1)</td>
+		</tr>
+		<tr>
+			<td />
+			<td>Google Android ARM (tested with API Level 16, Android 4.1 - JELLY_BEAN)</td>
+			<td />
+			<td>NDK r13b-win32</td>
+		</tr>
+		<tr>
+			<td />
+			<td>Linux (tested on Ubuntu 16.04, GPU acceleration: NVIDIA Driver version R361+ and CUDA ARCH 3.0)</td>
+			<td />
+			<td>GCC (tested with 4.8)</td>
+		</tr>
+		<tr>
+			<td />
+			<td>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</td>
+			<td />
+			<td>Microsoft Visual Studio 2012, 2013, 2015</td>
+		</tr>
+		<tr>
+			<td />
+			<td>Microsoft XBox One*</td>
+			<td />
+			<td></td>
+		</tr>
+		<tr>
+			<td />
+			<td>Nintendo Switch*</td>
+			<td />
+			<td></td>
+		</tr>
+		<tr>
+			<td />
+			<td>Sony Playstation 4*</td>
+			<td />
+			<td></td>
+		</tr>
+	</table>
 
-    <h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.2.24698370</h1>
-    <h2 style="TEXT-ALIGN: center">August 2018</h2>
+	* Console code subject to platform NDA not available on GitHub.  Developers licensed by respective platform owners please contact NVIDIA for access.
+	
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Fixed a crash bug when EPA's edge buffer overflow. </li>
-            <li>GPU rigid bodies fixed for Volta GPUs.</li>
-        </ul>
-        <li>Added:</li>
-        <ul>
-            <li>Aggregate broad phase now runs in parallel</li>
-        </ul>
-    </ul>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.2.25256367</h1>
+	<h2 style="TEXT-ALIGN: center">November 2018</h2>
 
-    <h3>Cooking</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-        </ul>
-        <li>Added:</li>
-        <ul>
-        	<li>PxHeightField::getSample has been added.</li>
-        </ul>
-    </ul>
+	<h3>General</h3>
+	<ul>
+		<li>Changed:</li>
+		<ul>
+			<li>A couple of serialization write functions have been optimized.</li>
+			<li>Rtree cooking has been slightly optimized.</li>
+			<li>Renamed PxSerializer::requires to PxSerializer::requiresObjects due to an erroneous clash with C++20 keyword with apple-clang.</li>
+		</ul>
+		<li>Fixed:</li>
+		<li>Moved external vector math includes out of PhysX namespaces.</li>
+	</ul>
+	</ul>
 
-	<h3>Character controller</h3>
+	<h3>Rigid Bodies</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Capsule controller with a very small height could degenerate to spheres (with a height of exactly zero) far away from the origin, which would then triggers errors in Debug/Checked builds. This has been fixed.</li>
-			<li>The triangle array growing strategy has been changed again to prevent performance issues when tessellation is used. Memory usage may increase in these cases.</li>
-			<li>Some internal debug visualization code has been disabled and a crash in it has been fixed.</li>
+			<li>Fixed an incorrect trigger behavior when a trigger was removed and inserted within the same frame.</li>
 		</ul>
 	</ul>
 
-    <h3>Scene query</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-        	<li>Fixed possible buffer overrun when PxPruningStructure was used.</li>
-        	<li>Raycasts against a heightfield may have missed if a large distance was used.</li>
-         	<li>Sq raycasts against heightfield or triangle mesh could return a mildly negative values, this has been fixed.</li>
-       </ul>
-    </ul>
-
-   <!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-    <h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.2.24214033</h1>
-    <h2 style="TEXT-ALIGN: center">May 2018</h2>
-
-    <h3>General</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-        	<li>Fixed clang 7 unionCast issues.</li>
-			<li>Fixed the binary meta data in PhysX_3.4/Tools/BinaryMetaData for conversion of vehicles.</li>
-        </ul>
-    </ul>
-
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Deprecated:</li>
-        <ul>
-            <li>PxSceneFlag::eENABLE_KINEMATIC_STATIC_PAIRS and PxSceneDesc::eENABLE_KINEMATIC_PAIRS have been deprecated. Use the new PxPairFilteringMode parameters in PxSceneDesc instead.</li>
-        </ul>
-        <li>Fixed:</li>
-		<ul>
-            <li>A sequence of shape->setFlag(PxShapeFlag::eSIMULATION_SHAPE, false) / shape->setFlag(PxShapeFlag::eSIMULATION_SHAPE, true) without simulation calls inbetween could produce errors in the broadphase. This has been fixed.</li>
-            <li>Fixed a bug in the broadphase (SAP) that got recently introduced in SDK 3.4.1.23933511. It should only have generated a few false positives (more overlaps than stricly necessary).</li>
-        	<li>Fixed a bug in PxShape::checkMaterialSetup.</li>
-            <li>Fixed intermittent crash with GPU rigid bodies when materials were destroyed.</li>
-            <li>Fixed bug where setting maxImpulse to 0 on CCD contact modification meant contact was not reported in the frame's contact reports.</li>
-        </ul>
-    </ul>
-
-    <h3>Cooking</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-        	<li>Big convex mesh serialization used together with insertion callback stored incorrect memory for big convex data. This has been fixed.</li>
-        </ul>
-    </ul>
-
-    <h3>Scene query</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-        	<li>Fixed a bug in extended bucket pruner, when a pruning structure was added and immediatelly released.</li>
-        </ul>
-    </ul>
-
-    <!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-    <h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23933511</h1>
-    <h2 style="TEXT-ALIGN: center">April 2018</h2>
-
-    <h3>General</h3>
-    <ul>
-        <li>Added:</li>
-        <ul>
-            <li>Added snippet for deformable meshes, added section in the guide for them.</li>
-        </ul>
-    </ul>
-
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>PxTriangleMesh::refitBVH() was crashing with PxMeshPreprocessingFlag::eDISABLE_ACTIVE_EDGES_PRECOMPUTE. This has been fixed.</li>
-            <li>SQ-only shapes contained in aggregates could result in crashes or memory corruption when removed from the aggregate. This has been fixed.</li>
-            <li>Assert no longer fired if a dynamic body contacting a static is converted to kinematic with kinematic-static pairs enabled.</li>
-            <li>Assert reporting broad phase as being "inconsistent" could fire when using speculative CCD when sleeping objects were activated.</li>
-        </ul>
-    </ul>
-
-    <h3>Cooking</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Convex hull cooking could have produced hulls with vertices too far from convex hull planes, this has been fixed.</li>
-        </ul>
-        <li>Changed:</li>
-        <ul>
-            <li>PxCooking::createTriangleMesh does now take additional output parameter PxTriangleMeshCookingResult::Enum.</li>
-            <li>PxCooking::createConvexMesh does now take additional output parameter PxConvexMeshCookingResult::Enum.</li>
-        </ul>
-    </ul>
-
-    <h3>Scene query</h3>
-    <ul>
-        <li>Changed:</li>
-        <ul>
-            <li>PxSceneFlag::eSUPPRESS_EAGER_SCENE_QUERY_REFIT has been marked as deprecated. Was replaced with PxSceneQueryUpdateMode enum, if this new enum is set the flag gets ignored.</li>
-        </ul>
-        <li>Added:</li>
-        <ul>
-            <li>PxSceneQueryUpdateMode was added to control work done during fetchResults.</li>
-            <li>PxScene::sceneQueriesUpdate, PxScene::checkQueries and PxScene::fetchQueries were added to run separate scene query update, see manual for more details.</li>
-        </ul>
-    </ul>
-
-    <!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-    <h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23584284</h1>
-    <h2 style="TEXT-ALIGN: center">February 2018</h2>
-
-    <h3>General</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>PhysX sometimes froze in a spinlock after certain sequences of read & write locks. This has been fixed.</li>
-        </ul>
-    </ul>
-
-    <h3>Scene queries</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Raycasts against heightfields were sometimes missing hits for vertical rays were located exactly at the heightfield's boundaries. This has been fixed.</li>
-        </ul>
-    </ul>
-
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Avoid edge-face collisions on boundary edges when eNO_BOUNDARY_EDGES flag is raised on heightfield when using PCM and unified HF collision.</li>
-        </ul>
-    </ul>
-
-    <!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-    <h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23472123</h1>
-    <h2 style="TEXT-ALIGN: center">January 2018</h2>
-
-    <h3>General</h3>
-    <ul>
-        <li>Added:</li>
-        <ul>
-            <li>Visual Studio 2017 15.5.1 and newer is now supported. Samples are currently not supported with Visual Studio 2017.</li>
-        </ul>
-
-		<li>Removed:</li>
-        <ul>
-            <li>Visual Studio 2012 support is discontinued.</li>
-        </ul>
-    </ul>
-
-    <h3>Cooking</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Cooked mesh structures contained a mix of little-endian and big-endian data (the midphase structures were always saved as big-endian). This made loading of cooked files slower than necessary. This has been fixed.</li>
-        </ul>
-    </ul>
-
-    <h3>Scene queries</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Buffered moves were sometimes not properly taken into account by scene queries, leading to invalid results for one frame. This has been fixed.</li>
-            <li>Pruning structure failed to build when actor had more shapes. This has been fixed.</li>
-        </ul>
-    </ul>
-
-    <!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-    <h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23173160</h1>
-    <h2 style="TEXT-ALIGN: center">November 2017</h2>
-
-    <h3>Extensions</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>An issue with CCD sweeps against meshes that could potentially lead to the earliest impact not being detected has been fixed.</li>
-        </ul>
-    </ul>
-
-    <!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-    <h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23131702</h1>
-    <h2 style="TEXT-ALIGN: center">November 2017</h2>
+	<h3>Scene query</h3>
+	<ul>
+		<li>Fixed:</li>
+		<ul>
+			<li>Fixed a bug in BVH34. Raycasts could fail on binary deserialized BVH34 triangle meshes.</li>
+		</ul>
+	</ul>
+	
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-    <h3>General</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>A bug in the management of internal interaction objects has been fixed.</li>
-        </ul>
-    </ul>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.2.24990349</h1>
+	<h2 style="TEXT-ALIGN: center">September 2018</h2>
 
-    <h3>Extensions</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>A regression in prismatic constraint stability introduced in PhysX 3.4.1 has been fixed.</li>
-            <li>PxRevoluteJoint::getAngle(), PxD6Joint::getTwist(), PxD6Joint::getSwingYAngle() and PxD6Joint::getSwingZAngle() did not always return the correct angle. This problem has been fixed.</li>
-            <li>The "double cone" case of the D6 joint had errors both in the debug visualization and the code dealing with limits. This has been fixed.</li>
-            <li>The debug visualization of the D6 joint in the twist case did not properly color-code the angular limits. This has been fixed.</li>
-            <li>The debug visualization of distance joints has been fixed.</li>
-            <li>The debug visualization of prismatic joints has been fixed.</li>
-            <li>The debug visualization of revolute joints has been fixed. It now renders active limits properly (in red when the limit is active, grey otherwise).</li>
-            <li>Proper visualization flags are now passed to the PxConstraintVisualize function. Previously all available flags were active, even if PxVisualizationParameter::eJOINT_LOCAL_FRAMES and/or PxVisualizationParameter::eJOINT_LIMITS were set to zero.</li>
-            <li>PxRevoluteJoint::getVelocity has been fixed.</li>
-        </ul>
-    </ul>
-
-    <h3>Scene queries</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Sweep queries using the eMTD flag could generate incorrect normals in sphere/sphere, sphere/capsule or capsule/capsule cases, when the objects were exactly overlapping each-other. This has been fixed.</li>
-            <li>Sweep convex mesh vs heightfield queries using the eMTD flag did not fill correctly returned faceIndex. This has been fixed.</li>
-        </ul>
-    </ul>
+	<h3>General</h3>
+	<ul>
+		<li>Fixed:</li>
+		<ul>
+			<li>PxMeshQuery::getTriangle adjacency information for heightfield geometry fixed.</li>
+			<li>Removed PxSetPhysXGpuDelayLoadHook from API. Delay loaded dynamically linked library names are now provided through PxSetPhysXDelayLoadHook.</li>
+			<li>Fixed a source of non-determinism with GPU rigid bodies.</li>
+		</ul>
+	</ul>
 
-    <!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<h3>Rigid Bodies</h3>
+	<ul>
+		<li>Fixed:</li>
+		<ul>
+			<li>Fixed a divide by zero bug when gjk is trying to calculate the barycentric coordinate for two identical/nearly identical points. </li>
+			<li>Fixed an incorrect mesh index reported in contact buffer when stabilization flag was used. </li>
+		</ul>
+	</ul>
 	
-	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1</h1>
-	<h2 style="TEXT-ALIGN: center">September 2017</h2>
-
-	<h2>Changes and Resolved Issues</h2>
-	<blockquote>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.2.24698370</h1>
+	<h2 style="TEXT-ALIGN: center">August 2018</h2>
 
-	</blockquote>
-
-	<h3>General</h3>
+	<h3>Rigid Bodies</h3>
 	<ul>
-		<li>Deprecated:</li>
+		<li>Fixed:</li>
 		<ul>
-			<li>The PhysX cloth feature has been deprecated.</li>
+			<li>Fixed a crash bug when EPA's edge buffer overflow. </li>
+			<li>GPU rigid bodies fixed for Volta GPUs.</li>
 		</ul>
-        <li>Changed:</li>
-        <ul>
-            <li>The meaning of PxVisualizationParameter::eCULL_BOX has changed. It is now used to visualize the current culling box, while it was previously used to enable or disable the feature. Please simply use PxScene::setVisualizationCullingBox() to enable the feature from now on.</li>
-            <li>PxVisualizationParameter::eBODY_JOINT_GROUPS has been deprecated.</li>
-            <li>Performance of setCMassLocalPose function has been improved.</li>
-        </ul>
 		<li>Added:</li>
 		<ul>
-			<li>PhysXGpu: Added warnings for the case when the dynamic gpu library fails to load.</li>
+			<li>Aggregate broad phase now runs in parallel</li>
 		</ul>
 	</ul>
 
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>A potential crash when calling detachShape has been fixed.</li>
-            <li>Fixed assert that fired if application switched a body from dynamic to kinematic, then queried kinematic target without setting one. This assert only fired if the modifications overlapped simulation so were buffered.</li>
-            <li>PxArticulation::getStabilizationThreshold() and PxArticulation::setStabilizationThreshold() were accessing the sleep threshold instead of the stabilization threshold. This has been fixed.</li>
-        </ul>
-        <li>Added:</li>
-        <ul>
-            <li>Support for modifying friction and restitution coefficients has been added to contact modification. </li>
-            <li>Added PxRigidBodyFlag::eENABLE_CCD_MAX_CONTACT_IMPULSE to enable maxContactImpulse member of PxRigidBody to be used in CCD. This is disabled by default. It is useful in some circumstances, e.g. shooting a small ball through a plate glass window and triggering it to break, but it can also result in behavioral artefacts so it is disabled by default.</li>
-            <li>Added PxSceneDesc::solverOffsetSlop. This is defaulted to a value of 0. A positive, non-zero value defines a tolerance used in the solver below which a contacts' offset from the COM of the body is negligible and therefore snapped to zero. This clamping occurs in a space tangential to the contact or friction direction. This is aimed at pool or golf simulations, where small numerical imprecision in either contact points or normals can lead to balls curving slightly when there are relatively high angular velocities involved.</li>
-        </ul>
-
+	<h3>Cooking</h3>
+	<ul>
+		<li>Fixed:</li>
+		<ul></ul>
+		<li>Added:</li>
+		<ul>
+			<li>PxHeightField::getSample has been added.</li>
+		</ul>
 	</ul>
 
-	<h3>Scene queries</h3>
+	<h3>Character controller</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>A rare invalid memory read that could lead to incorrect sweep results in case of an initial overlap has been fixed.</li>
+			<li>Capsule controller with a very small height could degenerate to spheres (with a height of exactly zero) far away from the origin, which would then triggers errors in Debug/Checked builds. This has been fixed.</li>
+			<li>The triangle array growing strategy has been changed again to prevent performance issues when tessellation is used. Memory usage may increase in these cases.</li>
+			<li>Some internal debug visualization code has been disabled and a crash in it has been fixed.</li>
 		</ul>
 	</ul>
 
-	<h3>Serialization</h3>
+	<h3>Scene query</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Binary serialization didn't preserve PxConstraintFlags, e.g. projection flags.</li>
-			<li>Xml serialization failed if a shape referencing a PxTriangleMesh was added to another (dependent) collection.</li>
+			<li>Fixed possible buffer overrun when PxPruningStructure was used.</li>
+			<li>Raycasts against a heightfield may have missed if a large distance was used.</li>
+			<li>Sq raycasts against heightfield or triangle mesh could return a mildly negative values, this has been fixed.</li>
 		</ul>
 	</ul>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.0.22387197</h1>
-<h2 style="TEXT-ALIGN: center">June 2017</h2>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.2.24214033</h1>
+	<h2 style="TEXT-ALIGN: center">May 2018</h2>
 
-	<h2>Changes and Resolved Issues</h2>
-	<blockquote>
-
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
-
-	</blockquote>
-
-	<h3>Cooking</h3>
+	<h3>General</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Fixed issue when PxConvexFlag::e16_BIT_INDICES was used together with PxConvexFlag::eCOMPUTE_CONVEX.</li>
-			<li>Fixed issue in convex hull cooking when postHullMerge was not executed.</li>
-            <li>Fixed crash in CCD when using bodies with 0 mass.</li>
+			<li>Fixed clang 7 unionCast issues.</li>
+			<li>Fixed the binary meta data in PhysX_3.4/Tools/BinaryMetaData for conversion of vehicles.</li>
 		</ul>
 	</ul>
 
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Fixed behavioral differences when comparing the results of a given scene to the results when simulating a subset of the islands in that scene. In order for this case to be deterministic, it is necessary to raise PxSceneFlag::eENABLE_ENHANCED_DETERMINISM. </li>
-        </ul>
-
-        <li>Added:</li>
+	<h3>Rigid Bodies</h3>
+	<ul>
+		<li>Deprecated:</li>
 		<ul>
-            <li>Introduced maxBiasCoefficient in PxSceneDesc to be able to limit the coefficient used to scale error to produce the bias used in the constraint solver to correct geometric error. The default value is PX_MAX_F32 and, therefore, a value of 1/dt will be used. This value can be useful to reduce/remove jitter in scenes with variable or very small time-steps.</i>
-        </ul>
-    </ul>
-
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.0.22121272</h1>
-<h2 style="TEXT-ALIGN: center">May</h2>
-
-	<h2>Changes and Resolved Issues</h2>
-	<blockquote>
-
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
-
-	</blockquote>
-
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Fixed a bug in convex vs convex PCM contact gen code. There were cases in which full contact gen should have been triggered but was not. </li>
-            <li>Fixed a jittering bug on both GPU/CPU codepath in PCM contact gen code. This is due to the contact recycling condition not considering the toleranceLength. </li>
-            <li>Fixed a bug causing simulation to behave incorrectly when greater than 65536 bodies were simulated.</li>
-        </ul>
-    </ul>
+			<li>PxSceneFlag::eENABLE_KINEMATIC_STATIC_PAIRS and PxSceneDesc::eENABLE_KINEMATIC_PAIRS have been deprecated. Use the new PxPairFilteringMode parameters in PxSceneDesc instead.</li>
+		</ul>
+		<li>Fixed:</li>
+		<ul>
+			<li>A sequence of shape->setFlag(PxShapeFlag::eSIMULATION_SHAPE, false) / shape->setFlag(PxShapeFlag::eSIMULATION_SHAPE, true) without simulation calls inbetween could produce errors in the broadphase. This has been fixed.</li>
+			<li>Fixed a bug in the broadphase (SAP) that got recently introduced in SDK 3.4.1.23933511. It should only have generated a few false positives (more overlaps than stricly necessary).</li>
+			<li>Fixed a bug in PxShape::checkMaterialSetup.</li>
+			<li>Fixed intermittent crash with GPU rigid bodies when materials were destroyed.</li>
+			<li>Fixed bug where setting maxImpulse to 0 on CCD contact modification meant contact was not reported in the frame's contact reports.</li>
+		</ul>
+	</ul>
 
-	<h3>Scene queries</h3>
+	<h3>Cooking</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>A rare crash that could happen with sphere overlap calls has been fixed.</li>
-            <li>Fixed a stack corruption in CCD contact modification callback.</li>
-            <li>Fixed a bug where external forces were not cleared correctly with PxArticulations.</li>
+			<li>Big convex mesh serialization used together with insertion callback stored incorrect memory for big convex data. This has been fixed.</li>
 		</ul>
 	</ul>
 
-	<h3>Cooking</h3>
+	<h3>Scene query</h3>
 	<ul>
-		<li>Changed:</li>
+		<li>Fixed:</li>
 		<ul>
-			<li>Convex hull cooking now reuses edge information, perf optimization.</li>
-			<li>PxCooking API is now const if possible.</li>
+			<li>Fixed a bug in extended bucket pruner, when a pruning structure was added and immediatelly released.</li>
 		</ul>
 	</ul>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.0.22017166</h1>
-<h2 style="TEXT-ALIGN: center">April 2017</h2>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23933511</h1>
+	<h2 style="TEXT-ALIGN: center">April 2018</h2>
 
-	<h2>Changes and Resolved Issues</h2>
-	<blockquote>
-
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
-
-	</blockquote>
-
-    <h3>General</h3>
-    <ul>
-        <li>Added:</li>
-        <ul>
-            <li>PxConvexMeshGeometry::maxMargin has been added. This will let the application limit how much the shape is shrunk in GJK when using by PCM contact gen</li>
-        </ul>
-    </ul>
-
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Fixed a bug with joint breaking where sometimes joints would not break as expected.</li>
-            <li>Fix a race condition between cloth/particle/trigger interactions and the parallel filtering of rigid body interaction.</li>
-            <li>GPU rigid body feature now issues a performance warning in checked build if feature is enabled but PCM contact gen is not.</li>
-            <li>Fixed a bug with applying external force/torque to a body in buffered insertion stage.</li>
-            <li>Fixed a bug with CCD involving rotated static mesh actors.</li>
-            <li>Fixed a memory leak in CCD.</li>
-        </ul>
- 		<li>Changed:</li>
-        <ul>
-            <li>Optimizations for GPU rigid body feature, including reduced memory footprint and improvements to performance spikes when many objects are woken in a single frame.</li>
-        </ul>
-    </ul>
-
-    <h3>Midphase</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Fix Crash in BV4 code (serialization bug).</li>
-        </ul>
-    </ul>
-
-    <h3>Cooking</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Fix endless loop in convex hull cooking.</li>
-        </ul>
-    </ul>
-
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.0.21821222</h1>
-<h2 style="TEXT-ALIGN: center">March 2017</h2>
-
-	<h2>Supported Platforms</h2>
-	<blockquote>
-		<h3>Runtime</h3>
-		<ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android ARM (version 2.2 or later required for SDK, 2.3 or later required for snippets)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
-		<li>Microsoft XBox One</li>
-		<li>Nintendo Switch</li>
-		<li>Sony Playstation 4</li>
-		</ul>
-		<h3>Development</h3>
+	<h3>General</h3>
+	<ul>
+		<li>Added:</li>
 		<ul>
-				<li>Microsoft Windows XP or later</li>
-				<li>Microsoft Visual Studio 2012, 2013, 2015</li>
-				<li>Xcode 8.2</li>
+			<li>Added snippet for deformable meshes, added section in the guide for them.</li>
 		</ul>
-	</blockquote>
-	<h2>Known Issues</h2>
-	<blockquote>
-	<ul>
 	</ul>
-	</blockquote>
-	<h2>Changes and Resolved Issues</h2>
-	<blockquote>
-
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
-
-	</blockquote>
 
-	<h3>General</h3>
+	<h3>Rigid Bodies</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>A rare crash happening in the debug visualization code has been fixed.</li>
+			<li>PxTriangleMesh::refitBVH() was crashing with PxMeshPreprocessingFlag::eDISABLE_ACTIVE_EDGES_PRECOMPUTE. This has been fixed.</li>
+			<li>SQ-only shapes contained in aggregates could result in crashes or memory corruption when removed from the aggregate. This has been fixed.</li>
+			<li>Assert no longer fired if a dynamic body contacting a static is converted to kinematic with kinematic-static pairs enabled.</li>
+			<li>Assert reporting broad phase as being "inconsistent" could fire when using speculative CCD when sleeping objects were activated.</li>
 		</ul>
 	</ul>
 
-    <h3>Rigid Bodies</h3>
-    <ul>
-        <li>Fixed:</li>
-        <ul>
-            <li>Fixed a bug in PCM capsule vs plane contact gen.</li>
-            <li>A crash happening with more than 64K interactions has been fixed.</li>
-            <li>Fixed an island management issue in CCD when using multiple CCD passes.</li>
-            <li>Fixed a bug with GPU rigid bodies with non-simulation scene query-only shapes.</li>
-            <li>Fixed a bug in convex vs convex PCM contact gen code. There were cases in which full contact gen should have been triggered but was not. </li>
-            <li>Fixed a jittering bug on both GPU/CPU codepath in PCM contact gen code. This is due to the contact recycling condition not considering the toleranceLength. </li>
-        </ul>
-        <li>Added:</li>
-		<ul>
-            <li>getFrozenActors has been added to allow application queries the frozen actors</li>
-            <li>PxRigidDynamic::setKinematicSurfaceVelocity has been added, permitting the user to set a persistent velocity on a kinematic actor which behaves like a conveyor belt</li>
-            <li>PxSceneDesc::solverOffsetSlop added. This defines a threshold distance from a body's COM under which a contact will be snapped to the COM of the body inside the solver along any principal component axis</li>
-        </ul>
-    </ul>
-
-	<h3>Scene queries</h3>
+	<h3>Cooking</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>A bug in the BVH34 overlap code sometimes made PxMeshOverlapUtil::findOverlap assert (reporting an incorrect buffer overflow). This has been fixed.</li>
-            <li>Fix a bug in the case of two primitives just touching in sweep with eMTD flag on. </li>
-        </ul>
-	</ul>
-
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4</h1>
-<h2 style="TEXT-ALIGN: center">February 2017</h2>
-
-	<h2>Supported Platforms</h2>
-	<blockquote>
-		<h3>Runtime</h3>
-		<ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android ARM (version 2.2 or later required for SDK, 2.3 or later required for snippets)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
-		<li>Microsoft XBox One</li>
-		<li>Nintendo Switch</li>
-		<li>Sony Playstation 4</li>
+			<li>Convex hull cooking could have produced hulls with vertices too far from convex hull planes, this has been fixed.</li>
 		</ul>
-		<h3>Development</h3>
+		<li>Changed:</li>
 		<ul>
-				<li>Microsoft Windows XP or later</li>
-				<li>Microsoft Visual Studio 2012, 2013, 2015</li>
-				<li>Xcode 8.2</li>
+			<li>PxCooking::createTriangleMesh does now take additional output parameter PxTriangleMeshCookingResult::Enum.</li>
+			<li>PxCooking::createConvexMesh does now take additional output parameter PxConvexMeshCookingResult::Enum.</li>
 		</ul>
-	</blockquote>
-	<h2>Known Issues</h2>
-	<blockquote>
-	<ul>
 	</ul>
-	</blockquote>
-	<h2>Changes and Resolved Issues</h2>
-	<blockquote>
 
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
-
-	<h3>General</h3>
+	<h3>Scene query</h3>
 	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>nbAggregates, nbArticulations, nbDiscreteContactPairsTotal, nbDiscreteContactPairsWithCacheHits and nbDiscreteContactPairsWithContacts have been added to PxSimulationStatistics.</li>
-			<li>PxSceneLimits::maxNbBroadPhaseOverlaps has been added.</li>
-			<li>A new midphase has been added. See PxMeshMidPhase enum for details.</li>
-			<li>Midphase descriptor has been added. See PxMidphaseDesc for details.</li>
-			<li>PxHeightField::getTimestamp() has been added</li>
-			<li>PxCloneShape has been added to enable cloning of shapes.</li>
-			<li>acquireReference() has been added to increment the reference count of shapes, materials, triangle meshes, convex meshes heightfields and cloth fabrics.</li>
-			<li>The global filter shader data block can now be set through PxScene::setFilterShaderData().</li>
-			<li>PxGpuLoadHook and PxSetPhysXGpuLoadHook has been added to support loading of GPU dll with a different name than the default.</li>
-			<li>A new profile zone has been added for debug visualization.</li>
-		</ul>
 		<li>Changed:</li>
 		<ul>
-			<li>PxMathUtils.h has moved from include/common to include/foundation</li>
-			<li>GPU support requires SM2.x (Fermi architecture, GeForce 400 series) or later hardware. SM1.x is no longer supported.</li>
-			<li>Windows XP 64-bit and Windows Vista no longer support GPU acceleration. Windows XP 32-bit still supports GPU acceleration.</li>
-			<li>PxTaskManager::createTaskManager requires error callback and does not accept SPU task manager.</li>
-			<li>PxCreatePhysics and  PxCreateBasePhysics now take an optional pointer to an  physx::PxPvd instance.  </li>
-			<li>PxConstraintConnector::updatePvdProperties now takes an optional pointer to an physx::pvdsdk::PvdDataStream instance.</li>
-			<li>PxInitExtensions now takes an optional pointer to an physx::PxPvd instance.</li>
-			<li>Shared objects (triangle mesh, convex mesh, heightfield, material, shape, cloth fabric) no longer issue an eUSER_RELEASE event when their release() method is called</li>
-			<li>PxBase::isReleasable() is now a property only of the type of an object, not the object state. In particular,
-			isReleasable() returns true for PxShape objects whose only counted reference belongs to their owning actor.</li>
-			<li>PxCollection::releaseObjects() now calls release() even on shapes whose only counted reference belongs to their owning actor. An optional parameter releaseExclusiveShapes, which defaults to true, has been added to this method to assist with common scenarios in which all shapes are created with the deprecated method PxRigidActor::createShape() or its replacement PxRigidActorExt::createExclusiveShape()</li>
-			<li>Negative mesh scale is now supported for PxTriangleMeshGeometry. Negative scale corresponds to reflection and scale along the corresponding axis. In addition to reflection PhysX will flip the triangle normals.</li>
-			<li>PxDelayLoadHook is now inherited from PxFoundationDelayLoadHook. PxFoundation dll and PxPvdSDK dll are now delay loaded inside the SDK, their names can be provided through the delay load hook.</li>
+			<li>PxSceneFlag::eSUPPRESS_EAGER_SCENE_QUERY_REFIT has been marked as deprecated. Was replaced with PxSceneQueryUpdateMode enum, if this new enum is set the flag gets ignored.</li>
 		</ul>
-		<li>Removed:</li>
+		<li>Added:</li>
 		<ul>
-			<li>Sony Playstation 3 is not supported any longer. Any related APIs have been removed.</li>
-			<li>Microsoft XBox 360 is not supported any longer. Any related APIs have been removed.</li>
-			<li>Nintendo Wii U is not supported any longer. Any related APIs have been removed.</li>
-			<li>Sony Playstation Vita is not supported any longer. Any related APIs have been removed.</li>
-			<li>Visual Studio 2010 is not supported any longer.</li>
-			<li>Microsoft Windows RT is not supported any longer.</li>
-			<li>Google Android X86 is not supported any longer.</li>
-			<li>PhysX Samples are not supported anymore except on Microsoft Windows.</li>
-			<li>Linux 32-bit no longer support GPU acceleration. Linux 64-bit still supports GPU acceleration.</li>
+			<li>PxSceneQueryUpdateMode was added to control work done during fetchResults.</li>
+			<li>PxScene::sceneQueriesUpdate, PxScene::checkQueries and PxScene::fetchQueries were added to run separate scene query update, see manual for more details.</li>
 		</ul>
+	</ul>
+
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23584284</h1>
+	<h2 style="TEXT-ALIGN: center">February 2018</h2>
+
+	<h3>General</h3>
+	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>PxScene::setFlag() does now properly send error messages in CHECKED builds if a non-mutable scene flag gets passed in.</li>
-            <li>Fixed a bug in force threshold based contact reports, which caused events to be lost.</li>
-			<li>Fixed a bug in aggregates that led to a crash when rigid bodies are added to an aggregate after removing all rigid bodies from an aggregate. This only occurred with aggregates that were added to the scene and with rigid bodies that had shapes attached.</li>
-            <li>Fixed a bug where non-breakable joints could break, leading to a crash. </li>
-            <li>Fixed RepX load of kinematic rigid bodies with mesh shapes.</li>
-            <li>Fixed a divide by zero bug in SIMD distanceSegmentTriangle function.</li>
-            <li>Debug visualization for compound bounds (PxVisualizationParameter::eCOLLISION_COMPOUNDS) now works even when all the compounds' shapes have their debug viz flag (PxShapeFlag::eVISUALIZATION) disabled.</li>
-			<li>Double-buffering now works properly for the debug visualization culling box.</li>
-			<li>The default debug visualization culling box now works correctly with heightfields.</li>
-			<li>Rare crashes in PxShape::setGeometry() and PxShape::getMaterials() have been fixed.</li>
-			<li>A crash happening when doing an origin shift on a scene containing an empty aggregate has been fixed.</li>
-		</ul>
-		<li>Deprecated:</li>
-		<ul>
-			<li>PxSceneLimits::maxNbObjectsPerRegion has been deprecated. It is currently not used.</li>
-			<li>PxComputeHeightFieldPenetration has a new signature, and the old one has been deprecated</li>
-			<li>PxComputeMeshPenetration has been deprecated. Use PxComputeTriangleMeshPenetration.</li>
-			<li>The PhysX particle feature has been deprecated.</li>
-			<li>PxTolerancesScale::mass has been deprecated.  It is currently not used.</li>
-			<li>PxActorClientBehaviorFlag has been marked as deprecated and will be removed in future releases.</li>
-		</ul>
-		<li>Removed deprecated API:</li>
-		<ul>
-			<li>PxPairFlag::eCCD_LINEAR removed. Use PxPairFlag::eDETECT_CCD_CONTACT | PxPairFlag::eSOLVE_CONTACT instead.</li>
-			<li>PxPairFlag::eRESOLVE_CONTACTS removed. Use PxPairFlag::eDETECT_DISCRETE_CONTACT | PxPairFlag::eSOLVE_CONTACT instead.</li>
-			<li>PxTriangleMeshFlag::eHAS_16BIT_TRIANGLE_INDICE removed. Use PxTriangleMeshFlag::e16_BIT_INDICES instead.</li>
-			<li>PxTriangleMeshFlag::eHAS_ADJACENCY_INFO removed. Use PxTriangleMeshFlag::eADJACENCY_INFO instead.</li>
-			<li>PxTriangleMeshDesc::convexEdgeThreshold removed.</li>
-			<li>PxSceneQueryFlag renamed to PxHitFlag.</li>
-			<li>PxHitFlag::eDIRECT_SWEEP renamed to PxHitFlag::ePRECISE_SWEEP.</li>
-			<li>PxHitFlag::eIMPACT renamed to PxHitFlag::ePOSITION.</li>
-			<li>PxSceneQueryHitType renamed to PxQueryHitType.</li>
-			<li>PxSceneQueryCache renamed to PxQueryCache.</li>
-			<li>PxSceneQueryFilterFlag renamed to PxQueryFlag.</li>
-			<li>PxSceneQueryFilterFlags renamed to PxQueryFlags.</li>
-			<li>PxSceneQueryFilterData renamed to PxQueryFilterData.</li>
-			<li>PxSceneQueryFilterCallback renamed to PxQueryFilterCallback.</li>
-			<li>PxScene::raycastAll,PxScene::raycastSingle,PxScene::raycastAny replaced by PxScene::raycast.</li>
-			<li>PxScene::overlapAll,PxScene::overlapAny replaced by PxScene::overlap.</li>
-			<li>PxScene::sweepAll,PxScene::sweepSingle,PxScene::sweepAny replaced by PxScene::sweep.</li>
-			<li>PxQuat, PxTranform, PxMat33, PxMat44 createIdentity and createZero removed. Use PxIdentity, PxZero in constructor.</li>
-			<li>PxJointType::Enum, PxJoint::getType() removed. Use PxJointConcreteType instead.</li>
-			<li>PxVisualDebugger removed. Use PxPvd instead.</li>
-			<li>PxControllerFlag renamed to PxControllerCollisionFlag.</li>
-			<li>PxCCTHit renamed to PxControllerHit.</li>
-			<li>PxCCTNonWalkableMode renamed to PxControllerNonWalkableMode.</li>
-			<li>PxControllerNonWalkableMode::eFORCE_SLIDING changed to PxControllerNonWalkableMode::ePREVENT_CLIMBING_AND_FORCE_SLIDING.</li>
-			<li>PxControllerDesc::interactionMode, groupsBitmask, callback removed.</li>
-			<li>PxController::setInteraction, getInteraction, setGroupsBitmask, getGroupsBitmask removed.</li>
-			<li>PxControllerManager::createController no longer needs PxPhysics and PxScene.</li>
-			<li>PxControllerFilters::mActiveGroups replaced with PxControllerFilters::mCCTFilterCallback.</li>
-			<li>PxSerialization::createBinaryConverter(PxSerializationRegistry&) changed to PxSerialization::createBinaryConverter().</li>
-			<li>PxConstraintDominance renamed to PxDominanceGroupPair.</li>
-			<li>PxScene::flush renamed to PxScene::flushSimulation.</li>
-			<li>PxClothFabric::getPhaseType removed.</li>
-			<li>PxCollection::addRequired removed.</li>
-			<li>PxRigidActor::createShape discontinued support for initial transform.</li>
-			<li>PxShape::resetFiltering removed.</li>
-			<li>PxParticleBase::resetFiltering removed.</li>
-			<li>PxSceneDesc::meshContactMargin removed.</li>
-			<li>PxSceneDesc::contactCorrelationDistance removed.</li>
-			<li>Indexing operators taking signed integers in PxVec3, PxVec4, PxMat33, PxMat44, PxStrideIterator have been removed.</li>
-			<li>PxHitFlag::ePOSITION, PxHitFlag::eDISTANCE and PxHitFlag::eNORMAL are now supported in PxMeshQuery::sweep function.</li>
-			<li>PxClothFlag::eGPU renamed to PxClothFlag::eCUDA.</li>
-			<li>PxActorTypeSelectionFlag/PxActorTypeSelectionFlags. Use PxActorTypeFlag/PxActorTypeFlags instead.</li>
-			<li>PxConstraintFlag::eDEPRECATED_32_COMPATIBILITY flag removed.</li>
+			<li>PhysX sometimes froze in a spinlock after certain sequences of read & write locks. This has been fixed.</li>
 		</ul>
 	</ul>
 
-	<h3>PxShared</h3>
+	<h3>Scene queries</h3>
 	<ul>
-		APEX 1.4 can now be used independently of PhysX. In order to achieve that a new shared code base was created called "PxShared". PhysX functionality such as common types, PxFoundation, the task infrastructure are now part of PxShared.
+		<li>Fixed:</li>
+		<ul>
+			<li>Raycasts against heightfields were sometimes missing hits for vertical rays were located exactly at the heightfield's boundaries. This has been fixed.</li>
+		</ul>
 	</ul>
 
 	<h3>Rigid Bodies</h3>
 	<ul>
-		<li>Added:</li>
+		<li>Fixed:</li>
 		<ul>
-			<li>An alternative simulation API has been introduced. This makes use of the following new functions: PsScene:collide(), PxScene::fetchCollision() and PxScene::advance(). Expected usage of these functions is illustrated in a new snippet SnippetSplitSim. This feature is also described in the manual in Section Simulation->SplitSim.</li>
-			<li>PxSceneFlag::eDEPRECATED_TRIGGER_TRIGGER_REPORTS has been introduced to re-enable the legacy behavior of trigger shape pairs sending reports. This flag and the corresponding legacy behavior will be removed in version 4.</li>
-			<li>The active actors feature has been added. See PxSceneFlag::eENABLE_ACTIVE_ACTORS.</li>
-			<li>Functionality to compute and manipulate mass, inertia tensor and center of mass of objects has been exposed in the new class PxMassProperties.</li>
-			<li>The option to exclude kinematics from the active actors/transforms list has been added. See PxSceneFlag::eEXCLUDE_KINEMATICS_FROM_ACTIVE_ACTORS for details.</li>
-			<li>The integrated pose of dynamic rigid bodies can be accessed earlier in the pipeline through a new callback (see the API documentation for PxSimulationEventCallback::onAdvance() and PxRigidBodyFlag::eENABLE_POSE_INTEGRATION_PREVIEW for details).</li>
-			<li>PxConvexMeshGeometryFlag::eTIGHT_BOUNDS has been added. See the user manual for details.</li>
-            <li>New split fetchResults methods introduced, see PxScene::fetchResultsBegin(), PxScene::fetchResultsFinish() and PxScene::processCallbacks(). This is intended to permit the application to parallelize the event notification callbacks.</li>
-            <li>New flag introduced to suppress updating scene query pruner trees inside fetchResults, see PxSceneFlag::eSUPPRESS_EAGER_SCENE_QUERY_REFIT. Instead, pruners will be updated during the next query.</li>
-            <li>Introduced GPU rigid body simulation support, see PxSceneFlag::eENABLE_GPU_DYNAMICS. GPU rigid body support requires SM3.0 or later.</li>
-            <li>Introduced a new GPU-accelerated broad phase. See PxBroadPhaseType::eGPU. GPU broad phase support requires SM3.0 or later.</li>
-            <li>Introduced a new enhanced determinism mode. See PxSceneFlag::eENABLE_ENHANCED_DETERMINISM. This provides additional levels of rigid body simulation determinism at the cost of some performance.</li>
-            <li>Introduced a new speculative contacts CCD approach. See PxRigidBodyFlag::eENABLE_SPECULATIVE_CCD. This is a slightly cheaper, less robust solution to PxRigidBodyFlag::eENABLE_CCD. There is no need to turn CCD on the scene using PxSceneFlag::eENABLE_CCD or enable PxPairFlag::eDETECT_CCD_CONTACT with this CCD mode as it functions as an extension to the discrete time-stepping scheme. This form of CCD can be enabled on kinematic actors. </li>
-            <li>New "immediate mode" API has been added. This exposes access to the low-level contact generation and constraint solver, which allows the application to use these PhysX low-level components to perform its own simulations without needing to populate and simulate a PxScene.</li>
-            <li>RigidDynamic lock flags added which permit the application to disallow rotation/translation of PxRigidDynamics around specific axes.</li>
+			<li>Avoid edge-face collisions on boundary edges when eNO_BOUNDARY_EDGES flag is raised on heightfield when using PCM and unified HF collision.</li>
 		</ul>
-		<li>Changed:</li>
+	</ul>
+
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23472123</h1>
+	<h2 style="TEXT-ALIGN: center">January 2018</h2>
+
+	<h3>General</h3>
+	<ul>
+		<li>Added:</li>
 		<ul>
-			<li>PxRigidStatic objects can now have zero shapes while being part of a scene.</li>
-			<li>PxContactPairFlag::eINTERNAL_HAS_FACE_INDICES is obsolete and has been removed.</li>
-			<li>PxConstraintFlag::eDEPRECATED_32_COMPATIBILITY was previously only implemented for spring constraints. It is now correctly implemented for equality constraints.</li>
-			<li>PxSceneFlag::eENABLE_PCM is enabled by default. This means PhysX uses PCM distance-based collision detection by default. </li>
-			<li>Calls to PxRigidDynamic::setWakeCounter() following PxScene::collide() do now explicitly get taken into account in the subsequent call to PxScene::advance().</li>
-            <li>Calls to contact modification callbacks can be made from multiple threads simultaneously. Therefore, modification callbacks should must be thread-safe. </li>
-            <li>Unified heightfield contact generation is now the default heightfield contact generation approach. This approach offers similar performance and behavior to contact generation with triangle meshes. Unified heightfields have no thickness because contact genreation operates on triangles so objects may tunnel if CCD is not enabled.</li>
-            <li>When unified heightfield contact generation is in use, the bounds of heightfield shapes are no longer extruded by "thickness".</li>
-			<li>PxArticulationJoint::setTwistLimit and PxArticulationJoint::getTwistLimit were incorrectly documented with zLimit and yLimit in the wrong order.  The behavior of both functions remains unchanged but now they are correctly documented with zLimit and yLimit in the correct order. This is simply a clarification of the existing function behavior.</li>
+			<li>Visual Studio 2017 15.5.1 and newer is now supported. Samples are currently not supported with Visual Studio 2017.</li>
 		</ul>
+
 		<li>Removed:</li>
 		<ul>
-			<li>The deprecated class PxFindOverlapTriangleMeshUtil has been removed. Please use PxMeshOverlapUtil instead.</li>
-			<li>The deprecated flag PxConstraintFlag::eREPORTING has been removed. Force reports are now always generated.</li>
-			<li>The following deprecated simulation event flags have been removed: PxContactPairHeaderFlag::eDELETED_ACTOR_0, ::eDELETED_ACTOR_1, PxContactPairFlag::eDELETED_SHAPE_0, ::eDELETED_SHAPE_1, PxTriggerPairFlag::eDELETED_SHAPE_TRIGGER, ::eDELETED_SHAPE_OTHER. Please use the following flags instead: PxContactPairHeaderFlag::eREMOVED_ACTOR_0, ::eREMOVED_ACTOR_1, PxContactPairFlag::eREMOVED_SHAPE_0, ::eREMOVED_SHAPE_1, PxTriggerPairFlag::eREMOVED_SHAPE_TRIGGER, ::REMOVED_SHAPE_OTHER.</li>
-			<li>The deprecated method PxPhysics::createHeightField(const PxHeightFieldDesc&) has been removed. Please use PxCooking::createHeightField(const PxHeightFieldDesc&, PxPhysicsInsertionCallback&) instead. The insertion callback can be obtained through PxPhysics::getPhysicsInsertionCallback().</li>
-		</ul>
-		<li>Deprecated:</li>
-		<ul>
-			<li>PxRigidActor::createShape() has been deprecated in favor of PxRigidActorExt::createExclusiveShape()</li>
-			<li>Trigger notification events for trigger-trigger pairs have been deprecated and will be omitted by default. See the 3.4 migration guide for more information.</li>
-			<li>The active transforms feature (PxSceneFlag::eENABLE_ACTIVETRANSFORMS) has been deprecated. Please use PxSceneFlag::eENABLE_ACTIVE_ACTORS instead. </li>
-			<li>PxRegisterHeightFields has been modified to register unified heightfields, which are now the default implementation. PxRegisterLegacyHeightFields() has been added to register the legacy (deprecated) heightfield contact gen approach.</li>
-			<li>PxHeightFieldDesc::thickness has been deprecated, as the new unified height field (see PxRegisterUnifiedHeightFields()) does not support thickness any longer.</li>
+			<li>Visual Studio 2012 support is discontinued.</li>
 		</ul>
+	</ul>
+
+	<h3>Cooking</h3>
+	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>The capsule-vs-heightfield contact generation had a rare bug where a vertical capsule standing exactly on a shared edge could fall through a mesh. This has been fixed.</li>
-			<li>The bounding box of a shape was not always properly updated when the contact offset changed.</li>
-            <li>Fixed a bug in the GJK sweep caused by lost precision in the math</li>
-			<li>Calls to PxScene::shiftOrigin() can crash when PxRigidDynamic actors with PxActorFlag::eDISABLE_SIMULATION are present.</li>
-			<li>Fixed a bug when PxShape::setMaterials was called with less materials than shape had before.</li>
-            <li>Fixed a bug in CCD that could lead to a hang in the simulation.</li>
-            <li>Fixed a bug in PCM mesh edge-edge check for the parallel case. </li>
-            <li>Fixed a bug in CCD where contact modify callbacks could be called when the CCD did not detect a contact.</li>
-            <li>Fixed a bug with applying external force/torque to a body in buffered insertion stage.</li>
+			<li>Cooked mesh structures contained a mix of little-endian and big-endian data (the midphase structures were always saved as big-endian). This made loading of cooked files slower than necessary. This has been fixed.</li>
 		</ul>
 	</ul>
 
-	<h3>Particles</h3>
+	<h3>Scene queries</h3>
 	<ul>
-		<li>Gpu: Maxwell Optimizations</li>
-		<li>The PhysX particle feature has been deprecated.</li>
+		<li>Fixed:</li>
+		<ul>
+			<li>Buffered moves were sometimes not properly taken into account by scene queries, leading to invalid results for one frame. This has been fixed.</li>
+			<li>Pruning structure failed to build when actor had more shapes. This has been fixed.</li>
+		</ul>
 	</ul>
 
-	<h3>Cloth</h3>
-	<ul>
-		<li>Continuous collision (PxClothFlag::eSWEPT_CONTACT) behavior has been optimized to reduce cloth sticking to collision shape.</li>
-		<li>Added air resistance feature (see PxCloth::setWindVelocity(PxVec3), PxCloth::setWindDrag(PxReal), PxCloth::setWindLift(PxReal), PxClothFabricDesc::nbTriangles, PxClothFabricDesc::triangles.</li>
-	</ul>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-	<h3>Serialization</h3>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23173160</h1>
+	<h2 style="TEXT-ALIGN: center">November 2017</h2>
+
+	<h3>Extensions</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>PxTriangleMesh instances with adjacency information were not correctly initialized when created with cooking.createTriangleMesh. This caused a crash when converting the binary serialized triangle mesh data. </li>
+			<li>An issue with CCD sweeps against meshes that could potentially lead to the earliest impact not being detected has been fixed.</li>
 		</ul>
 	</ul>
 
-	<h3>Character controller</h3>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1.23131702</h1>
+	<h2 style="TEXT-ALIGN: center">November 2017</h2>
+
+	<h3>General</h3>
 	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>Profile zones have been added for the character controller.</li>
-		</ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Character controllers cannot stand on dynamic triggers anymore.</li>
-			<li>Fixed: the capsule-vs-sphere sweep now returns a normal in the correct direction.</li>
+			<li>A bug in the management of internal interaction objects has been fixed.</li>
 		</ul>
 	</ul>
 
-	<h3>Vehicles</h3>
+	<h3>Extensions</h3>
 	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>Anti-roll suspension has been added.	The class PxVehicleAntiRollBar and the functions PxVehicleWheelsSimData::setAntiRollBar,
-			PxVehicleWheelsSimData::getAntiRollBar, PxVehicleWheelsSimData::getNbAntiRollBars allow anti-roll bars to be configured and queried.
-			</li>
-			<li>A new function PxVehicleSuspensionSweeps has been introduced.  This sweeps the PxShape that represents the wheel along the suspension direction. The hit planes resulting
-			from the sweep are used as driving surfaces similar to those found by PxVehicleSuspensionRaycasts.</li>
-			<li>A new snippet SnippetVehicleContactMod has been added.  This snippet demonstrates how to use sweeps and contact modification to allow the wheel's volume to fully interact
-			with the environment.</li>
-			<li>A new function PxVehicleModifyWheelContacts has been introduced.  This function analyses contact in the contact modification callback and rejects contact points that represent drivable surfaces.</li>
-			<li>A new function PxVehicleSetMaxHitActorAcceleration has been introduced.  This function sets the maximum acceleration experienced by a PxRigidDynamic that finds itself under the wheel of a vehicle.</li>
-		</ul>
-		<li>Changed:</li>
-		<ul>
-			<li>In checked build the functions PxVehicleDrive4W::allocate, PxVehicleDriveNW::allocate, PxVehicleDriveTank::allocate, PxVehicleNoDrive::allocate all return NULL and issue a warning if called before
-			PxInitVehicleSDK.</li>
-			<li>Tire width is no longer accounted for when computing the suspension compression from raycasts (PxVehicleSuspensionRaycasts).  Instead, tire width is incorporated into the suspension compression arising from swept wheels (PxVehicleSuspensionSweeps).  It is recommended to use PxVehicleSuspensionSweeps if there is a strict requirement that the inside and outside of the wheel don't visibly penetrate geometry.</li>
-		</ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Suspension force calculation now applies an extra force perpendicular to the spring travel direction.	This force is calculated to satisfy the constraint that the sprung mass only has motion along the spring travel direction.	This change mostly affects vehicles with suspension travel directions that are not vertical.</li>
-			<li>PxVehicleWheelsSimData::mThresholdLongitudinalSpeed and PxVehicleWheelsSimData::mMinLongSlipDenominator are now given default values that reflect the length scale set in PxTolerancesScale.</li>
-			<li>Unphysically large constraint forces were generated to resolve the suspension compression beyond its limit when the suspension direction and the hit normal under the wheel approach perpendicularity.  This has been fixed so that the constraint force approaches zero as the angle between the hit normal and suspension direction approaches a right angle.</li>
+			<li>A regression in prismatic constraint stability introduced in PhysX 3.4.1 has been fixed.</li>
+			<li>PxRevoluteJoint::getAngle(), PxD6Joint::getTwist(), PxD6Joint::getSwingYAngle() and PxD6Joint::getSwingZAngle() did not always return the correct angle. This problem has been fixed.</li>
+			<li>The "double cone" case of the D6 joint had errors both in the debug visualization and the code dealing with limits. This has been fixed.</li>
+			<li>The debug visualization of the D6 joint in the twist case did not properly color-code the angular limits. This has been fixed.</li>
+			<li>The debug visualization of distance joints has been fixed.</li>
+			<li>The debug visualization of prismatic joints has been fixed.</li>
+			<li>The debug visualization of revolute joints has been fixed. It now renders active limits properly (in red when the limit is active, grey otherwise).</li>
+			<li>Proper visualization flags are now passed to the PxConstraintVisualize function. Previously all available flags were active, even if PxVisualizationParameter::eJOINT_LOCAL_FRAMES and/or PxVisualizationParameter::eJOINT_LIMITS were set to zero.</li>
+			<li>PxRevoluteJoint::getVelocity has been fixed.</li>
 		</ul>
 	</ul>
 
 	<h3>Scene queries</h3>
 	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>PxPruningStructure was introduced as an optimization structure to accelerate scene queries against large sets of newly added actors.</li>
-			<li>PxScene::addActors(PxPruningStructure& ) has been added.</li>
-			<li>PxMeshQuery::sweep now supports PxHitFlag::eMESH_ANY.</li>
-			<li>PxHitFlag::eFACE_INDEX was introduced to reduce the perf cost for convex hull face index computation. In order to receive face index for sweeps against a convex hull, the flag PxHitFlag::eFACE_INDEX has to be set. Note that the face index can also be computed externally using the newly introduced method PxFindFaceIndex from the extensions library.</li>
-			<li>PxGeometryQuery::isValid was added to check provided geometry validity.</li>
-		</ul>
-		<li>Changed:</li>
-		<ul>
-			<li>Raycasts against triangle meshes with PxHitFlag::eMESH_MULTIPLE flag now return all hits, code for discarding hits close to each other has been removed.</li>
-			<li>PxPruningStructure enum has been renamed to PxPruningStructureType</li>
-		</ul>
-		<li>Deprecated:</li>
-		<ul>
-			<li>PxHitFlag::eDISTANCE has been deprecated.</li>
-			<li>The batched query feature has been deprecated.</li>
-			<li>Volume cache feature has been deprecated.</li>
-			<li>Spatial index feature has been deprecated.</li>
-		</ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>PxScene::sweep now properly implements PxHitFlag::eMESH_BOTH_SIDES (returned normal follows the same convention as for raycasts).</li>
-			<li>Raycasts against heightfields now correctly return multiple hits when PxHitFlag::eMESH_MULTIPLE flag is used.</li>
-			<li>PxSweepHit.faceIndex was computed incorrectly for sweep tests initially overlapping convex objects. The face index is now set to 0xffffffff in these cases.</li>
-			<li>Convex vs convex sweeps in PxGeometryQuery::sweep() do now correctly return the face index of the convex mesh that gets passed in as parameter geom1 (and not the one from geom0).</li>
-			<li>PxMeshQuery::sweep now supports PxHitFlag::eMESH_ANY.</li>
-			<li>Deprecated definition PxSceneQueryHit has been removed. Please use PxQueryHit instead.</li>
+			<li>Sweep queries using the eMTD flag could generate incorrect normals in sphere/sphere, sphere/capsule or capsule/capsule cases, when the objects were exactly overlapping each-other. This has been fixed.</li>
+			<li>Sweep convex mesh vs heightfield queries using the eMTD flag did not fill correctly returned faceIndex. This has been fixed.</li>
 		</ul>
 	</ul>
 
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-	<h3>Cooking</h3>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.1</h1>
+	<h2 style="TEXT-ALIGN: center">September 2017</h2>
+
+	<h2>Changes and Resolved Issues</h2>
+	<blockquote>
+
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+
+	</blockquote>
+
+	<h3>General</h3>
 	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>PxTriangleMeshCookingResult added, cookTriangleMesh now does return additional PxTriangleMeshCookingResult. Please see the manual for more information.</li>
-			<li>New convex hull generator added. It is now possible to switch between a new quickhull implementation and the legacy inflation based hull. Quickhull is the default algorithm.</li>
-			<li>Convex hulls can now be directly inserted in PxPhysics as triangle meshes and height fields.</li>
-			<li>A separate convex hull validation function has been added, it is now possible to create hulls without validation.</li>
-			<li>Convex hull generator vertex limit has two different algorithms - plane shifting and OBB slicing.</li>
-			<li>PxConvexFlag::eFAST_INERTIA_COMPUTATION added. When enabled, the inertia tensor is computed faster but with less precision.</li>
-			<li>PxConvexFlag::eGPU_COMPATIBLE added. When enabled convex hulls are created with vertex limit set to 64 and vertex limit per face is 32.</li>
-			<li>PxConvexFlag::eSHIFT_VERTICES added. When enabled input points are shifted to be around origin to improve computation stability.</li>
-			<li>PxCookingParams::gaussMapLimit has been added. The limit can now be fully user-defined. Please refer to the migration guide and best practices sections of the manual.</li>
-		</ul>
-		<li>Changed:</li>
-		<ul>
-			<li>The performance of convex creation from polygons has been improved.</li>
-		</ul>
 		<li>Deprecated:</li>
 		<ul>
-			<li>The PxPlatform enum and the PxGetGaussMapVertexLimitForPlatform function have been deprecated.</li>
+			<li>The PhysX cloth feature has been deprecated.</li>
 		</ul>
-		<li>Removed:</li>
+		<li>Changed:</li>
 		<ul>
-			<li>The deprecated flags PxMeshPreprocessingFlag::eREMOVE_UNREFERENCED_VERTICES and ::eREMOVE_DUPLICATED_TRIANGLES have been removed. Meshes get cleaned up by default unless PxMeshPreprocessingFlag::eDISABLE_CLEAN_MESH is set.</li>
+			<li>The meaning of PxVisualizationParameter::eCULL_BOX has changed. It is now used to visualize the current culling box, while it was previously used to enable or disable the feature. Please simply use PxScene::setVisualizationCullingBox() to enable the feature from now on.</li>
+			<li>PxVisualizationParameter::eBODY_JOINT_GROUPS has been deprecated.</li>
+			<li>Performance of setCMassLocalPose function has been improved.</li>
 		</ul>
-		<li>Fixed:</li>
+		<li>Added:</li>
 		<ul>
-			<li>Mesh cooking was sometimes crashing for meshes with less than 4 triangles. This has been fixed.</li>
+			<li>PhysXGpu: Added warnings for the case when the dynamic gpu library fails to load.</li>
 		</ul>
 	</ul>
 
-	<h3>Extensions</h3>
+	<h3>Rigid Bodies</h3>
 	<ul>
-		<li>Added:</li>
+		<li>Fixed:</li>
 		<ul>
-			<li>PxRaycastCCD has been added, to improve the visibility of the raycast-based CCD solution, which was previously only available in the Sample framework. This is a simpler and potentially cheaper alternative to the SDK's built-in continuous collision detection algorithm.</li>
-			<li>PxFindFaceIndex has been added. The function computes the closest polygon on a convex mesh from a given hit point and direction.</li>
+			<li>A potential crash when calling detachShape has been fixed.</li>
+			<li>Fixed assert that fired if application switched a body from dynamic to kinematic, then queried kinematic target without setting one. This assert only fired if the modifications overlapped simulation so were buffered.</li>
+			<li>PxArticulation::getStabilizationThreshold() and PxArticulation::setStabilizationThreshold() were accessing the sleep threshold instead of the stabilization threshold. This has been fixed.</li>
+			<li>Fixed an internal edge bug in PCM sphere vs mesh code</li>
+			<li>Make sure sphere vs sphere and sphere vs box in PCM contact gen generate contacts consistently on the second body when the sphere center is contained in the other shape</li>
 		</ul>
 		<li>Changed:</li>
 		<ul>
-			<li>Memory churn of PxDefaultMemoryOutputStream has been reduced.</li>
-			<li>The signatures for the PxComputeMeshPenetration and PxComputeHeightFieldPenetration functions have changed.</li>
+			<li>Improved convex vs mesh contact generation when using GPU rigid bodies. Requires the mesh to be recooked.</li>
+			<li>Improved convex vs convex contact generation when using GPU rigid bodies.</li>
+			<li>Reduced memory footprint of GPU triangle meshes significantly. Requires the mesh to be recooked.</li>
 		</ul>
-	</ul>
-
-	<h3>Profiling</h3>
-	<ul>
-		<li>Changed:</li>
+		<li>Added:</li>
 		<ul>
-			<li>Profiling information is now available only in debug, checked and profile configuration.</li>
-			<li>PxProfileZoneManager::createProfileZoneManager now takes PxAllocatorCallback as input parameter instead of PxFoundation.</li>
+			<li>Support for modifying friction and restitution coefficients has been added to contact modification. </li>
+			<li>Added PxRigidBodyFlag::eENABLE_CCD_MAX_CONTACT_IMPULSE to enable maxContactImpulse member of PxRigidBody to be used in CCD. This is disabled by default. It is useful in some circumstances, e.g. shooting a small ball through a plate glass window and triggering it to break, but it can also result in behavioral artefacts so it is disabled by default.</li>
+			<li>Added PxSceneDesc::solverOffsetSlop. This is defaulted to a value of 0. A positive, non-zero value defines a tolerance used in the solver below which a contacts' offset from the COM of the body is negligible and therefore snapped to zero. This clamping occurs in a space tangential to the contact or friction direction. This is aimed at pool or golf simulations, where small numerical imprecision in either contact points or normals can lead to balls curving slightly when there are relatively high angular velocities involved.</li>
+			<li>Added PxConvexMeshGeometry::maxMargin. This allows the application to tune the maximum amount by which PCM collision detection will shrink convex shapes in contact generation. This shrinking approach leads to some noticeable clipping around edges and vertices, but should not lead to clipping with face collisions. By default, the mesh is shrunk by an amount that is automatically computed based on the shape's properties. This allows you to limit by how much the shape will be shrunk. If the maxMargin is set to 0, then the original shape will be used for collision detection. </li>
 		</ul>
-	</ul>
 
-	<h3>Physx Visual Debugger</h3>
-	<ul>
-		<li>PhysXVisualDebuggerSDK, PvdRuntime projects replaced with PxPvdSDK.</li>
-		<li>PxPvdSceneClient::drawPoints now takes physx::pvdsdk::PvdDebugPoint as input parameter instead of PxDebugPoint. drawLines, drawTriangles, drawText and so on.</li>
-		<li>The SDK's Debug visualization data is not sent to PVD anymore in ePROFILE mode.</li>
-		<li>PxPvdSceneFlag::eTRANSMIT_CONTACTS (instead of PxPvdSceneFlag::eTRANSMIT_CONSTRAINTS) was sometimes incorrectly used to control the transmission of constraint-related data. This has been fixed. In addition, the PxPvdSceneFlag flags are now consistently ignored when PxPvdInstrumentationFlag::eDEBUG is not set.</li>
 	</ul>
 
-	<h3>Aggregates</h3>
+	<h3>Scene queries</h3>
 	<ul>
+		<li>Fixed:</li>
+		<ul>
+			<li>A rare invalid memory read that could lead to incorrect sweep results in case of an initial overlap has been fixed.</li>
+		</ul>
 	</ul>
 
-	<h3>Snippets</h3>
+	<h3>Serialization</h3>
 	<ul>
-		<li>Snippet profile zone has been removed.</li>
+		<li>Fixed:</li>
+		<ul>
+			<li>Binary serialization didn't preserve PxConstraintFlags, e.g. projection flags.</li>
+			<li>Xml serialization failed if a shape referencing a PxTriangleMesh was added to another (dependent) collection.</li>
+		</ul>
 	</ul>
-	</blockquote>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3.5</h1>
-<h2 style="TEXT-ALIGN: center">??????</h2>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.0.22387197</h1>
+	<h2 style="TEXT-ALIGN: center">June 2017</h2>
 
-	<h2>Supported Platforms</h2>
-	<blockquote>
-    <h3>Runtime</h3>
-    <ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android ARM & x86 (version 2.2 or later required for SDK, 2.3 or later required for samples)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
-		<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
-		<li>Microsoft XBox One (SDK only, no samples)</li>
-        <li>Microsoft XBox 360</li>
-		<li>Nintendo Wii U</li>
-        <li>Sony Playstation 3</li>
-		<li>Sony Playstation 4 (SDK only, no samples)</li>
-		<li>Sony Playstation Vita</li>
-    </ul>
-    <h3>Development</h3>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2012, 2013 and 2015</li>
-        <li>Xcode 6.3</li>
-    </ul>
-	</blockquote>
 	<h2>Changes and Resolved Issues</h2>
 	<blockquote>
 
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
 
-	<h3>Rigid Bodies</h3>
+	</blockquote>
+
+	<h3>Cooking</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>A rare capsule-vs-mesh contact generation bug has been fixed.</li>
-			<li>Calls to PxScene::shiftOrigin() can crash when PxRigidDynamic actors with PxActorFlag::eDISABLE_SIMULATION are present.</li>
-			<li>A rare crash due to an invalid assert in the MBP broad-phase has been fixed. This error only affected debug & checked builds; release & profile builds were unaffected.</li>
+			<li>Fixed issue when PxConvexFlag::e16_BIT_INDICES was used together with PxConvexFlag::eCOMPUTE_CONVEX.</li>
+			<li>Fixed issue in convex hull cooking when postHullMerge was not executed.</li>
+			<li>Fixed crash in CCD when using bodies with 0 mass.</li>
 		</ul>
 	</ul>
 
-	<h3>Particles</h3>
+	<h3>Rigid Bodies</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Fixed particle collision issue with PxParticleBaseFlag::ePER_PARTICLE_COLLISION_CACHE_HINT (on by default). When particles collided against very dense triangle mesh areas an assert would be triggered or particles would leak through the triangle mesh. A workaround was to disable PxParticleBaseFlag::ePER_PARTICLE_COLLISION_CACHE_HINT.</li>
+			<li>Fixed behavioral differences when comparing the results of a given scene to the results when simulating a subset of the islands in that scene. In order for this case to be deterministic, it is necessary to raise PxSceneFlag::eENABLE_ENHANCED_DETERMINISM. </li>
 		</ul>
-	</ul>
 
-	<h3>Character controller</h3>
-	<ul>
 		<li>Added:</li>
 		<ul>
-			<li>Added PxControllerDesc::registerDeletionListener boolean defining if deletion listener for CCT should be registered.</li>
+			<li>Introduced maxBiasCoefficient in PxSceneDesc to be able to limit the coefficient used to scale error to produce the bias used in the constraint solver to correct geometric error. The default value is PX_MAX_F32 and, therefore, a value of 1/dt will be used. This value can be useful to reduce/remove jitter in scenes with variable or very small time-steps.</i>
 		</ul>
+	</ul>
+
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.0.22121272</h1>
+	<h2 style="TEXT-ALIGN: center">May</h2>
+
+	<h2>Changes and Resolved Issues</h2>
+	<blockquote>
+
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+
+	</blockquote>
+
+	<h3>Rigid Bodies</h3>
+	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Fixed a bug where CCT shapes initially overlapping static geometry would be moved down by an incorrect amount (the length of the step offset).</li>
-			<li>The overlap recovery module now works against kinematic objects.</li>
+			<li>Fixed a bug in convex vs convex PCM contact gen code. There were cases in which full contact gen should have been triggered but was not. </li>
+			<li>Fixed a jittering bug on both GPU/CPU codepath in PCM contact gen code. This is due to the contact recycling condition not considering the toleranceLength. </li>
+			<li>Fixed a bug causing simulation to behave incorrectly when greater than 65536 bodies were simulated.</li>
 		</ul>
 	</ul>
 
-	<h3>Scene Queries</h3>
+	<h3>Scene queries</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>PxGeometryQuery::computePenetration with convex geometries.</li>
-			<li>On Android platforms, the eDYNAMIC_AABB_TREE pruning structure could pass already released objects into the scene query filter callback.</li>
+			<li>A rare crash that could happen with sphere overlap calls has been fixed.</li>
+			<li>Fixed a stack corruption in CCD contact modification callback.</li>
+			<li>Fixed a bug where external forces were not cleared correctly with PxArticulations.</li>
 		</ul>
 	</ul>
 
-
 	<h3>Cooking</h3>
 	<ul>
-		<li>Fixed:</li>
+		<li>Changed:</li>
 		<ul>
-			<li>Cooking convex mesh from a flat input mesh produced incorrect large mesh.</li>
+			<li>Convex hull cooking now reuses edge information, perf optimization.</li>
+			<li>PxCooking API is now const if possible.</li>
 		</ul>
 	</ul>
 
-	</blockquote>
-
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3.4</h1>
-<h2 style="TEXT-ALIGN: center">October 2015</h2>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.0.22017166</h1>
+	<h2 style="TEXT-ALIGN: center">April 2017</h2>
 
-	<h2>Supported Platforms</h2>
-	<blockquote>
-    <h3>Runtime</h3>
-    <ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android ARM & x86 (version 2.2 or later required for SDK, 2.3 or later required for samples)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
-		<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
-		<li>Microsoft XBox One (SDK only, no samples)</li>
-        <li>Microsoft XBox 360</li>
-		<li>Nintendo Wii U</li>
-        <li>Sony Playstation 3</li>
-		<li>Sony Playstation 4 (SDK only, no samples)</li>
-		<li>Sony Playstation Vita</li>
-    </ul>
-    <h3>Development</h3>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2012, 2013 and 2015</li>
-        <li>Xcode 6.3</li>
-    </ul>
-	</blockquote>
 	<h2>Changes and Resolved Issues</h2>
 	<blockquote>
 
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+
+	</blockquote>
 
 	<h3>General</h3>
 	<ul>
-		<li>Added support for Microsoft Visual Studio 2015 for Windows builds. Note that the GPU features are not currently supported with Visual Studio 2015.</li>
-		<li>Removed support for Microsoft Visual Studio 2010</li>
-		<li>Added support for Android platform API Level 16 and removed support for platform API Level 8 and 9.</li>
-		<li>Fixed:</li>
+		<li>Added:</li>
 		<ul>
-			<li>Fixed a bug in aggregates that led to a crash when rigid bodies are added to an aggregate after removing all rigid bodies from an aggregate.   This only occurred with aggregates that were added to the scene and with rigid bodies that had shapes attached.</li>
+			<li>PxConvexMeshGeometry::maxMargin has been added. This will let the application limit how much the shape is shrunk in GJK when using by PCM contact gen</li>
 		</ul>
 	</ul>
 
@@ -1319,33 +801,24 @@ <h3>Rigid Bodies</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Creating a PxConstraint or using PxConstraint::setActors() could cause a crash in situations where one of the two newly connected actors was part of a previously simulated graph of connected constraints (with some of them having projection enabled, i.e., PxConstraintFlag::ePROJECT_TO_ACTOR0 or ::ePROJECT_TO_ACTOR1 set).</li>
-			<li>PxD6Joint::getTwist(), getSwingY(), getSwingZ() returned incorrect angle values when the corresponding components of the quaternion were negative</li>
-			<li>The thickness of a heightfield was incorrectly applied when the heightfield transform had a non-identity quaternion. </li>
-			<li>PxD6Joint angular projection now functions correctly when there is one free axis and it is not the twist axis.</li>
-			<li>The bounding box of a shape was not always properly updated when the contact offset changed.</li>
-            <li>Fixed an edge case bug in PCM contact gen that could result in a QNAN reported as the contact point.</li>
-            <li>Fixed an uninitialized variable bug in the GJK algorithm resulting in unintialized closest points reported.</li>
-            <li>Fixed an edge case in which the constraint solver could access invalid memory in constraint partitioning.</li>
-            <li>Fixed a bug in capsule vs heightfield contact generation that could produce incorrect penetration depths.</li>
-            <li>Fixed a bug in Unified MTD code path which transformed the normal twice for the polygon index calculation. </li>
-			<li>Fixed a crash in height fields when a capsule collided with an edge whose shared triangles had a hole material.</li>
+			<li>Fixed a bug with joint breaking where sometimes joints would not break as expected.</li>
+			<li>Fix a race condition between cloth/particle/trigger interactions and the parallel filtering of rigid body interaction.</li>
+			<li>GPU rigid body feature now issues a performance warning in checked build if feature is enabled but PCM contact gen is not.</li>
+			<li>Fixed a bug with applying external force/torque to a body in buffered insertion stage.</li>
+			<li>Fixed a bug with CCD involving rotated static mesh actors.</li>
+			<li>Fixed a memory leak in CCD.</li>
 		</ul>
-	</ul>
-
-	<h3>Serialization</h3>
-	<ul>
-		<li>Fixed:</li>
+		<li>Changed:</li>
 		<ul>
-			<li>PxTriangleMesh instances with adjacency information were not correctly initialized when created with cooking.createTriangleMesh. This caused a crash when converting the binary serialized triangle mesh data.</li>
+			<li>Optimizations for GPU rigid body feature, including reduced memory footprint and improvements to performance spikes when many objects are woken in a single frame.</li>
 		</ul>
 	</ul>
 
-	<h3>Scene Queries</h3>
+	<h3>Midphase</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Sweeps against scaled meshes.</li>
+			<li>Fix Crash in BV4 code (serialization bug).</li>
 		</ul>
 	</ul>
 
@@ -1353,69 +826,53 @@ <h3>Cooking</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Mesh cooking was sometimes crashing for meshes with less than 4 triangles. This has been fixed.</li>
+			<li>Fix endless loop in convex hull cooking.</li>
 		</ul>
 	</ul>
 
-	</blockquote>
-
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3.3</h1>
-<h2 style="TEXT-ALIGN: center">January 2015</h2>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4.0.21821222</h1>
+	<h2 style="TEXT-ALIGN: center">March 2017</h2>
 
 	<h2>Supported Platforms</h2>
 	<blockquote>
-    <h3>Runtime</h3>
-    <ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android ARM & x86 (version 2.2 or later required for SDK, 2.3 or later required for samples)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
-		<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
-		<li>Microsoft XBox One (SDK only, no samples)</li>
-        <li>Microsoft XBox 360</li>
-		<li>Nintendo Wii U</li>
-        <li>Sony Playstation 3</li>
-		<li>Sony Playstation 4 (SDK only, no samples)</li>
-		<li>Sony Playstation Vita</li>
-    </ul>
-    <h3>Development</h3>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2010, 2012 and 2013</li>
-        <li>Xcode 6.2</li>
-    </ul>
+		<h3>Runtime</h3>
+		<ul>
+			<li>Apple iOS</li>
+			<li>Apple Mac OS X</li>
+			<li>Google Android ARM (version 2.2 or later required for SDK, 2.3 or later required for snippets)</li>
+			<li>Linux (tested on Ubuntu)</li>
+			<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
+			<li>Microsoft XBox One</li>
+			<li>Nintendo Switch</li>
+			<li>Sony Playstation 4</li>
+		</ul>
+		<h3>Development</h3>
+		<ul>
+			<li>Microsoft Windows XP or later</li>
+			<li>Microsoft Visual Studio 2012, 2013, 2015</li>
+			<li>Xcode 8.2</li>
+		</ul>
 	</blockquote>
 	<h2>Known Issues</h2>
 	<blockquote>
-	<ul>
-		<li>The combination of releasing an actor and reassigning the actors of any affected joint so that the joint no longer references the released actor will lead to a crash if these operations are performed as buffered calls ie after simulate but before fetchResults.</li>
-	</ul>
+		<ul></ul>
 	</blockquote>
 	<h2>Changes and Resolved Issues</h2>
 	<blockquote>
 
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+
+	</blockquote>
 
 	<h3>General</h3>
 	<ul>
-		<li>Added support for Microsoft Visual Studio 2013 and removed support for Microsoft Visual Studio 2008.</li>
-		<li>Where applicable, mutexes on Unix flavored platforms now inherit priority to avoid priority inversion. This is a default behavior of Windows mutexes.</li>
-		<li>Added arm64 support for the iOS version of the PhysX SDK.</li>
-		<li>Removed samples from the iOS version of the PhysX SDK.</li>
 		<li>Fixed:</li>
 		<ul>
-			<li>x64 samples running on Windows 8.</li>
-			<li>Concurrent calls to PxPhysics::(un)registerDeletionListener() and PxPhysics::(un)registerDeletionListenerObjects() were not safe.</li>
-			<li>PxGeometryQuery::pointDistance() could sometimes read uninitialized memory. This has been fixed.</li>
-			<li>The SDK will not crash anymore if an object is involved in more than 65535 interactions. Instead, it will emit an error message and the additional interactions will be ignored.</li>
-			<li>The PxPhysics::getMaterials() function did not work with a non-zero 'startIndex' parameter. This has been fixed.</li>
-			<li>The following static Android libs are now packed into a single library called PhysX3: LowLevel, LowLevelCloth, PhysX3, PhysXProfileSDK, PhysXVisualDebuggerSDK, PvdRuntime, PxTask, SceneQuery and SimulationController. This fixes cyclic dependencies between these libraries.</li>
-			<li>FSqrt(0), V4Sqrt(0), V4Length(0) and V3Length(0) will return 0 instead of QNan in Android and iOS.</li>
+			<li>A rare crash happening in the debug visualization code has been fixed.</li>
 		</ul>
 	</ul>
 
@@ -1423,2209 +880,2801 @@ <h3>Rigid Bodies</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>The Prismatic joint limit now acts correctly when its frame is not the identity.</li>
-			<li>Calling PxRigidDynamic::setGlobalPose() with the autowake parameter set to true could result in an assert when the rigid body got released and had the PxActorFlag::eDISABLE_SIMULATION flag set.</li>
-			<li>Added errors on misuse of PxRegister[Unified]Heightfields() function, and documented it.</li>
-			<li>PxConstraint has a eDISABLE_PREPROCESSING flag and minResponseThreshold attribute to assist in stabilizing stiffness caused by infinite inertias or mass modification.</li>
-			<li>Island manager performance in the presence of large numbers of kinematic actors has been improved.</li>
-			<li>Using PxConstraint::setActors() or PxJoint::setActors() could cause a crash if the new actors resulted in the constraint/joint being removed from the scene.</li>
-			<li>The functions PxRigidActor::setGlobalPose and PxShape::setLocalPose failed to update cached contact data internal to PhysX.  This led to contacts being generated with transforms that were no longer valid.  Similarly, contacts could be missed due to transforms being invalid.  This defect affected the classes PxRigidStatic and PxRigidDynamic, though it was more immediately noticeable with the PxRigidStatic class.</li>
-			<li>The sphere-vs-mesh contact generation code has been improved. It could previously generate wrong contacts. This has been fixed.</li>
-			<li>The capsule-vs-convex contact generation had a bug that could lead to rare invalid contacts. This has been fixed.</li>
-			<li>The mesh contact generation had a bug on PS3 that could lead to invalid contacts. This has been fixed.</li>
-			<li>The PxRigidBody::clearForce() and PxRigidBody::clearTorque() were not properly decoupled - they both cleared the force and the torque. This has been fixed.</li>
-			<li>Repeatedly calling PxRigidActor::attachShape and PxRigidActor::detachShape in between calls to simulate resulted in a memory leak.  This has been fixed.</li>
+			<li>Fixed a bug in PCM capsule vs plane contact gen.</li>
+			<li>A crash happening with more than 64K interactions has been fixed.</li>
+			<li>Fixed an island management issue in CCD when using multiple CCD passes.</li>
+			<li>Fixed a bug with GPU rigid bodies with non-simulation scene query-only shapes.</li>
+			<li>Fixed a bug in convex vs convex PCM contact gen code. There were cases in which full contact gen should have been triggered but was not. </li>
+			<li>Fixed a jittering bug on both GPU/CPU codepath in PCM contact gen code. This is due to the contact recycling condition not considering the toleranceLength. </li>
 		</ul>
 		<li>Added:</li>
 		<ul>
-			<li>Enabling CCD on kinematic actors is now disallowed. When the kinematic flags are raised on a CCD-enabled actor, CCD is automatically disabled. </li>
+			<li>getFrozenActors has been added to allow application queries the frozen actors</li>
+			<li>PxRigidDynamic::setKinematicSurfaceVelocity has been added, permitting the user to set a persistent velocity on a kinematic actor which behaves like a conveyor belt</li>
+			<li>PxSceneDesc::solverOffsetSlop added. This defines a threshold distance from a body's COM under which a contact will be snapped to the COM of the body inside the solver along any principal component axis</li>
 		</ul>
 	</ul>
 
-	<h3>Particles</h3>
+	<h3>Scene queries</h3>
 	<ul>
 		<li>Fixed:</li>
 		<ul>
-			<li>Consistency between GPU and CPU particles has been improved in the case of a spatial date structure overflow. The positions and velocities of particles that have the PxParticleFlag::eSPATIAL_DATA_STRUCTURE_OVERFLOW set
-			are now updated also for CPU particles.</li>
-			<li>Fixed potential deadlocks from occurring in the GPU particle kernels running on GM204 and above GPUs.</li>
-			<li>Fixed fluid simulation crash on Titan X.</li>
+			<li>A bug in the BVH34 overlap code sometimes made PxMeshOverlapUtil::findOverlap assert (reporting an incorrect buffer overflow). This has been fixed.</li>
+			<li>Fix a bug in the case of two primitives just touching in sweep with eMTD flag on. </li>
 		</ul>
 	</ul>
 
-	<h3>Cloth</h3>
-	<ul>
-		<li>Fixed:</li>
-		<ul>
-			<li>A bug related to hitting the sphere or plane limit while world collision is enabled has been fixed.</li>
-			<li>PxCloth::getParticleAccelerations() implementation was fixed for GPU cloth.</li>
-		</ul>
-	</ul>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-	<h3>Character controller</h3>
-	<ul>
-		<li>Fixed:</li>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.4</h1>
+	<h2 style="TEXT-ALIGN: center">February 2017</h2>
+
+	<h2>Supported Platforms</h2>
+	<blockquote>
+		<h3>Runtime</h3>
 		<ul>
-			<li>Character controllers cannot stand on dynamic triggers anymore.</li>
+			<li>Apple iOS</li>
+			<li>Apple Mac OS X</li>
+			<li>Google Android ARM (version 2.2 or later required for SDK, 2.3 or later required for snippets)</li>
+			<li>Linux (tested on Ubuntu)</li>
+			<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
+			<li>Microsoft XBox One</li>
+			<li>Nintendo Switch</li>
+			<li>Sony Playstation 4</li>
 		</ul>
-		<li>Added:</li>
-		<ul>
-			<li>added lockingEnabled parameter to PxCreateControllerManager(), to support thread-safe release of objects while the character controller's move() routine is executing.</li>
-		</ul>
-	</ul>
-
-	<h3>Scene Queries</h3>
-	<ul>
-		<li>Fixed:</li>
-		<ul>
-			<li>Raycasts against triangle meshes with large scales potentionaly failed to register a hit.</li>
-			<li>Overlaps against height field meshes with the flag eNO_BOUNDARY_EDGES potentionaly failed to register a hit.</li>
-			<li>Sweeps using shapes modelled around a space significantly offset from their geometric center could fail to register a hit. </li>
-		</ul>
-	</ul>
-
-	<h3>Vehicles</h3>
-	<ul>
-		<li>Fixed:</li>
-		<ul>
-			<li>Sticky tire friction was unreliable with more than one substep but is now fixed.  This defect led to vehicles sliding down slopes where the sticky friction should have held firm.</li>
-			<li>An error in the jounce speed calculation that led to lift force at high forward speeds has been fixed.  This defect led to instabilities at high speed.</li>
-			<li>Improved documentation for PxVehicleSuspsensionData::mSprungMass.</li>
-		</ul>
-	</ul>
-
-	<h3>Cooking</h3>
-	<ul>
-		<li>Fixed:</li>
+		<h3>Development</h3>
 		<ul>
-			<li>Convex meshes generated from PhysX 3.2 were not able to load inside PhysX 3.3.</li>
+			<li>Microsoft Windows XP or later</li>
+			<li>Microsoft Visual Studio 2012, 2013, 2015</li>
+			<li>Xcode 8.2</li>
 		</ul>
-	</ul>
-
-	</blockquote>
-
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3.2</h1>
-<h2 style="TEXT-ALIGN: center">September 2014</h2>
-
-	<h2>Supported Platforms</h2>
-	<blockquote>
-    <h3>Runtime</h3>
-    <ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android ARM & x86 (version 2.2 or later required for SDK, 2.3 or later required for samples)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
-		<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
-		<li>Microsoft XBox One (SDK only, no samples)</li>
-        <li>Microsoft XBox 360</li>
-		<li>Nintendo Wii U</li>
-        <li>Sony Playstation 3</li>
-		<li>Sony Playstation 4 (SDK only, no samples)</li>
-		<li>Sony Playstation Vita</li>
-    </ul>
-    <h3>Development</h3>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2008, 2010, 2012 (Windows RT only)</li>
-        <li>Xcode 4.6|5.0|5.0.1</li>
-    </ul>
 	</blockquote>
 	<h2>Known Issues</h2>
 	<blockquote>
-	<ul>
-		<li>The combination of releasing an actor and reassigning the actors of any affected joint so that the joint no longer references the released actor will lead to a crash if these operations are performed as buffered calls ie after simulate but before fetchResults.</li>
-	</ul>
+		<ul></ul>
 	</blockquote>
 	<h2>Changes and Resolved Issues</h2>
 	<blockquote>
 
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
 
-	<h3>General</h3>
-	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>The PhysXCommon/64.dll, nvcuda.dll and PhysXUpdateLoader/64.dll are loaded and checked for the NVIDIA Corporation digital signature. The signature is expected on all NVIDIA Corporation provided dlls. The application will exit if the signature check fails.</li>
-			<li>Added the PxDefaultBufferedProfiler extension for simplified SDK profile events extraction.</li>
-			<li>PxSceneDesc::sanityBounds allows a bounding box to be set for validating the position coordinates of inserted or updated rigid actors and articulations.</li>
-			<li>Linux: Now supports GPU PhysX.</li>
-			<li>Added set/getRunProfiled() for PxDefaultCpuDispatcher to control profiling at task level.</li>
-			<li>Android: Support for x86 based devices was added.</li>
-			<li>PxProfileEventHandler::durationToNanoseconds() added. Translates event duration in timestamp (cycles) into nanoseconds.</li>
-			<li>Added SnippetProfileZone to show how to retrieve profiling information.</li>
-			<li>Added SnippetCustomJoint to better illustrate custom joint implementation, and removed SnippetExtension.</li>
-			<li>Added SnippetStepper to demonstrate kinematic updates while substepping with tasks.</li>
-		</ul>
-		<li>Fixed:</li>
-		<ul>
-			<li>PxTask::runProfiled() now takes threadId as a parameter.</li>
-			<li>The static pruner now issues a performance warning in debug and checked configurations when a tree rebuild occurs and the tree is not empty.</li>
-			<li>PxSceneDesc::staticStructure now defaults to PxPruningStructure::eDYNAMIC_AABB_TREE.</li>
-			<li>Linux: Switched to shared libraries.</li>
-			<li>Profile zone event names changed to match function calls.</li>
-			<li>Overlapping read/write errors will now issue a PxErrorCode::eINVALID_OPERATION rather than PxErrorCode::eDEBUG_INFO.</li>
-			<li>Improved SnippetToleranceScale to better demonstrate the intended use case.</li>
-			<li>Increased 126 characters limit for warnings on unix platforms, 1k limit on all platforms.</li>
-			<li>PhysXCommon dll load within PhysX dll now respects dll name. Please see the manual's PhysXCommon DLL load section.</li>
-			<li>Significant revision of the user's guide.  Both structure and most content have been modified.</li>
-			<li>Fixed search function of user's guide.</li>
-			<li>Foundation math classes now have in-place arithmetic operators (+= etc).</li>
-			<li>PxScene::checkResults() no longer counts as a read API method. Hence it is now possible to call this method in blocking mode without causing all writes to block until it returns.</li>
-		</ul>
-		<li>Deprecated:</li>
+		<h3>General</h3>
 		<ul>
-			<li>Indexing operators taking signed integers in PxVec3, PxVec4, PxMat33, PxMat44, PxStrideIterator have been deprecated.</li>
+			<li>Added:</li>
+			<ul>
+				<li>nbAggregates, nbArticulations, nbDiscreteContactPairsTotal, nbDiscreteContactPairsWithCacheHits and nbDiscreteContactPairsWithContacts have been added to PxSimulationStatistics.</li>
+				<li>PxSceneLimits::maxNbBroadPhaseOverlaps has been added.</li>
+				<li>A new midphase has been added. See PxMeshMidPhase enum for details.</li>
+				<li>Midphase descriptor has been added. See PxMidphaseDesc for details.</li>
+				<li>PxHeightField::getTimestamp() has been added</li>
+				<li>PxCloneShape has been added to enable cloning of shapes.</li>
+				<li>acquireReference() has been added to increment the reference count of shapes, materials, triangle meshes, convex meshes heightfields and cloth fabrics.</li>
+				<li>The global filter shader data block can now be set through PxScene::setFilterShaderData().</li>
+				<li>PxGpuLoadHook and PxSetPhysXGpuLoadHook has been added to support loading of GPU dll with a different name than the default.</li>
+				<li>A new profile zone has been added for debug visualization.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>PxMathUtils.h has moved from include/common to include/foundation</li>
+				<li>GPU support requires SM2.x (Fermi architecture, GeForce 400 series) or later hardware. SM1.x is no longer supported.</li>
+				<li>Windows XP 64-bit and Windows Vista no longer support GPU acceleration. Windows XP 32-bit still supports GPU acceleration.</li>
+				<li>PxTaskManager::createTaskManager requires error callback and does not accept SPU task manager.</li>
+				<li>PxCreatePhysics and  PxCreateBasePhysics now take an optional pointer to an  physx::PxPvd instance.  </li>
+				<li>PxConstraintConnector::updatePvdProperties now takes an optional pointer to an physx::pvdsdk::PvdDataStream instance.</li>
+				<li>PxInitExtensions now takes an optional pointer to an physx::PxPvd instance.</li>
+				<li>Shared objects (triangle mesh, convex mesh, heightfield, material, shape, cloth fabric) no longer issue an eUSER_RELEASE event when their release() method is called</li>
+				<li>
+					PxBase::isReleasable() is now a property only of the type of an object, not the object state. In particular,
+					isReleasable() returns true for PxShape objects whose only counted reference belongs to their owning actor.
+				</li>
+				<li>PxCollection::releaseObjects() now calls release() even on shapes whose only counted reference belongs to their owning actor. An optional parameter releaseExclusiveShapes, which defaults to true, has been added to this method to assist with common scenarios in which all shapes are created with the deprecated method PxRigidActor::createShape() or its replacement PxRigidActorExt::createExclusiveShape()</li>
+				<li>Negative mesh scale is now supported for PxTriangleMeshGeometry. Negative scale corresponds to reflection and scale along the corresponding axis. In addition to reflection PhysX will flip the triangle normals.</li>
+				<li>PxDelayLoadHook is now inherited from PxFoundationDelayLoadHook. PxFoundation dll and PxPvdSDK dll are now delay loaded inside the SDK, their names can be provided through the delay load hook.</li>
+			</ul>
+			<li>Removed:</li>
+			<ul>
+				<li>Sony Playstation 3 is not supported any longer. Any related APIs have been removed.</li>
+				<li>Microsoft XBox 360 is not supported any longer. Any related APIs have been removed.</li>
+				<li>Nintendo Wii U is not supported any longer. Any related APIs have been removed.</li>
+				<li>Sony Playstation Vita is not supported any longer. Any related APIs have been removed.</li>
+				<li>Visual Studio 2010 is not supported any longer.</li>
+				<li>Microsoft Windows RT is not supported any longer.</li>
+				<li>Google Android X86 is not supported any longer.</li>
+				<li>PhysX Samples are not supported anymore except on Microsoft Windows.</li>
+				<li>Linux 32-bit no longer support GPU acceleration. Linux 64-bit still supports GPU acceleration.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>PxScene::setFlag() does now properly send error messages in CHECKED builds if a non-mutable scene flag gets passed in.</li>
+				<li>Fixed a bug in force threshold based contact reports, which caused events to be lost.</li>
+				<li>Fixed a bug in aggregates that led to a crash when rigid bodies are added to an aggregate after removing all rigid bodies from an aggregate. This only occurred with aggregates that were added to the scene and with rigid bodies that had shapes attached.</li>
+				<li>Fixed a bug where non-breakable joints could break, leading to a crash. </li>
+				<li>Fixed RepX load of kinematic rigid bodies with mesh shapes.</li>
+				<li>Fixed a divide by zero bug in SIMD distanceSegmentTriangle function.</li>
+				<li>Debug visualization for compound bounds (PxVisualizationParameter::eCOLLISION_COMPOUNDS) now works even when all the compounds' shapes have their debug viz flag (PxShapeFlag::eVISUALIZATION) disabled.</li>
+				<li>Double-buffering now works properly for the debug visualization culling box.</li>
+				<li>The default debug visualization culling box now works correctly with heightfields.</li>
+				<li>Rare crashes in PxShape::setGeometry() and PxShape::getMaterials() have been fixed.</li>
+				<li>A crash happening when doing an origin shift on a scene containing an empty aggregate has been fixed.</li>
+			</ul>
+			<li>Deprecated:</li>
+			<ul>
+				<li>PxSceneLimits::maxNbObjectsPerRegion has been deprecated. It is currently not used.</li>
+				<li>PxComputeHeightFieldPenetration has a new signature, and the old one has been deprecated</li>
+				<li>PxComputeMeshPenetration has been deprecated. Use PxComputeTriangleMeshPenetration.</li>
+				<li>The PhysX particle feature has been deprecated.</li>
+				<li>PxTolerancesScale::mass has been deprecated.  It is currently not used.</li>
+				<li>PxActorClientBehaviorFlag has been marked as deprecated and will be removed in future releases.</li>
+			</ul>
+			<li>Removed deprecated API:</li>
+			<ul>
+				<li>PxPairFlag::eCCD_LINEAR removed. Use PxPairFlag::eDETECT_CCD_CONTACT | PxPairFlag::eSOLVE_CONTACT instead.</li>
+				<li>PxPairFlag::eRESOLVE_CONTACTS removed. Use PxPairFlag::eDETECT_DISCRETE_CONTACT | PxPairFlag::eSOLVE_CONTACT instead.</li>
+				<li>PxTriangleMeshFlag::eHAS_16BIT_TRIANGLE_INDICE removed. Use PxTriangleMeshFlag::e16_BIT_INDICES instead.</li>
+				<li>PxTriangleMeshFlag::eHAS_ADJACENCY_INFO removed. Use PxTriangleMeshFlag::eADJACENCY_INFO instead.</li>
+				<li>PxTriangleMeshDesc::convexEdgeThreshold removed.</li>
+				<li>PxSceneQueryFlag renamed to PxHitFlag.</li>
+				<li>PxHitFlag::eDIRECT_SWEEP renamed to PxHitFlag::ePRECISE_SWEEP.</li>
+				<li>PxHitFlag::eIMPACT renamed to PxHitFlag::ePOSITION.</li>
+				<li>PxSceneQueryHitType renamed to PxQueryHitType.</li>
+				<li>PxSceneQueryCache renamed to PxQueryCache.</li>
+				<li>PxSceneQueryFilterFlag renamed to PxQueryFlag.</li>
+				<li>PxSceneQueryFilterFlags renamed to PxQueryFlags.</li>
+				<li>PxSceneQueryFilterData renamed to PxQueryFilterData.</li>
+				<li>PxSceneQueryFilterCallback renamed to PxQueryFilterCallback.</li>
+				<li>PxScene::raycastAll,PxScene::raycastSingle,PxScene::raycastAny replaced by PxScene::raycast.</li>
+				<li>PxScene::overlapAll,PxScene::overlapAny replaced by PxScene::overlap.</li>
+				<li>PxScene::sweepAll,PxScene::sweepSingle,PxScene::sweepAny replaced by PxScene::sweep.</li>
+				<li>PxQuat, PxTranform, PxMat33, PxMat44 createIdentity and createZero removed. Use PxIdentity, PxZero in constructor.</li>
+				<li>PxJointType::Enum, PxJoint::getType() removed. Use PxJointConcreteType instead.</li>
+				<li>PxVisualDebugger removed. Use PxPvd instead.</li>
+				<li>PxControllerFlag renamed to PxControllerCollisionFlag.</li>
+				<li>PxCCTHit renamed to PxControllerHit.</li>
+				<li>PxCCTNonWalkableMode renamed to PxControllerNonWalkableMode.</li>
+				<li>PxControllerNonWalkableMode::eFORCE_SLIDING changed to PxControllerNonWalkableMode::ePREVENT_CLIMBING_AND_FORCE_SLIDING.</li>
+				<li>PxControllerDesc::interactionMode, groupsBitmask, callback removed.</li>
+				<li>PxController::setInteraction, getInteraction, setGroupsBitmask, getGroupsBitmask removed.</li>
+				<li>PxControllerManager::createController no longer needs PxPhysics and PxScene.</li>
+				<li>PxControllerFilters::mActiveGroups replaced with PxControllerFilters::mCCTFilterCallback.</li>
+				<li>PxSerialization::createBinaryConverter(PxSerializationRegistry&) changed to PxSerialization::createBinaryConverter().</li>
+				<li>PxConstraintDominance renamed to PxDominanceGroupPair.</li>
+				<li>PxScene::flush renamed to PxScene::flushSimulation.</li>
+				<li>PxClothFabric::getPhaseType removed.</li>
+				<li>PxCollection::addRequired removed.</li>
+				<li>PxRigidActor::createShape discontinued support for initial transform.</li>
+				<li>PxShape::resetFiltering removed.</li>
+				<li>PxParticleBase::resetFiltering removed.</li>
+				<li>PxSceneDesc::meshContactMargin removed.</li>
+				<li>PxSceneDesc::contactCorrelationDistance removed.</li>
+				<li>Indexing operators taking signed integers in PxVec3, PxVec4, PxMat33, PxMat44, PxStrideIterator have been removed.</li>
+				<li>PxHitFlag::ePOSITION, PxHitFlag::eDISTANCE and PxHitFlag::eNORMAL are now supported in PxMeshQuery::sweep function.</li>
+				<li>PxClothFlag::eGPU renamed to PxClothFlag::eCUDA.</li>
+				<li>PxActorTypeSelectionFlag/PxActorTypeSelectionFlags. Use PxActorTypeFlag/PxActorTypeFlags instead.</li>
+				<li>PxConstraintFlag::eDEPRECATED_32_COMPATIBILITY flag removed.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Rigid Bodies</h3>
-	<ul>
-		<li>Fixed:</li>
+		<h3>PxShared</h3>
 		<ul>
-			<li>A minor bug in contact generation between capsules and triangle meshes has been fixed, reducing the amount of tunneling cases when CCD is not used.</li>
-            <li>Discrete contact reports are no longer produced for pairs without PxPairFlag::eDETECT_DISCRETE_CONTACT raised in the filter shader. Previously, discrete contact generation would always have been performed regardless of the presence of the PxPairFlag::eDETECT_DISCRETE_CONTACT flag. This change potentially improves performance when using  specific shapes for CCD-only collision, which would have previously generated discrtete contacts and then ignored them in the solver. </li>
-		    <li>Trigger reports are no longer produced for pairs without PxPairFlag::eDETECT_DISCRETE_CONTACT raised in the filter shader. PxPairFlag::eTRIGGER_DEFAULT has been modified to include the PxPairFlag::eDETECT_DISCRETE_CONTACT flag.</li>
-			<li>An incorrect PX_DEPRECATED annotation on the default constructor for PxD6JointDrive has been removed.</li>
-			<li>PxRigidDynamic::getKinematicTarget() returned a wrong transform if the actor center of mass pose was different from the actor global pose.</li>
-			<li>Switching a PxRigidDynamic from dynamic to kinematic did not properly suppress existing pairs which turned into kinematic-kinematic or kinematic-static pairs.</li>
-			<li>PxRigidDynamic::isSleeping() did not return the correct value on the frame the object got inserted if PxScene::addActors() was used and if the object was awake.</li>
-			<li>PxSceneFlag::eDISABLE_CONTACT_CACHE now correctly works on PS3/SPU.</li>
-			<li>If an object was added to the scene, put asleep and had overlap with another sleeping object then contact points for that pair might not have been reported once the object woke up.</li>
-			<li>Potential crash when calling PxScene::resetFiltering() mulitple times for the same actor while the simulation was running has been fixed.</li>
-			<li>Potential crash when using PxScene::resetFiltering() with shapes that were just added while the simulation was running has been fixed.</li>
-			<li>A crash in MBP when deleting an object that just went out of broad-phase bounds has been fixed.</li>
-			<li>A new drive mode has been added to drive articulation joints using rotation vectors.</li>
-			<li>In contact and trigger reports, the shape references in PxTriggerPair and PxContactPair might not have been properly marked as removed shapes if the removal took place while the simulation was running.</li>
-			<li>PxPairFlag::eSOLVE_CONTACT is now properly observed if the flag is not set on a contacting pair. A consequence of this fix is that sleeping actors will no longer be woken up due to contact or lost contact with awake actors if PxPairFlag::eSOLVE_CONTACT is not set for the pair.  This also affects kinematic-kinematic pairs if one kinematic of the pair moves out of contact with the other.  Behaviour is unaffected for any pair that has PxPairFlag::eSOLVE_CONTACT set.</li>
-			<li>A memory leak with buffered shape and actor removal has been fixed.  The memory leak occurred when the release of an actor's shapes was followed by the release of the actor, all in-between simulate() and fetchResults().</li>
-			<li>A bug was fixed which caused incorrect force reports to sometimes be reported.</li>
-			<li>Fixed a bug where incorrect normals were reported when using PCM contact gen.</li>
-			<li>Fixed some issues with scaled convex hulls in the PCM contact gen code path.</li>
-			<li>The accuracy of capsule collision code has been improved.</li>
-			<li>An isValid() method has been added to constraints, that is satisfied if and only if at least one actor is a dynamic body or articulation link</li>
-			<li>A check has been added to prevent constraint construction or modification that would leave the constraint invalid.</li>
-			<li>In checked builds the PxScene methods addActor(), addActors(), addAggregate() and addCollection() will warn and return if an invalid constraint would be added to the scene</li>
+			APEX 1.4 can now be used independently of PhysX. In order to achieve that a new shared code base was created called "PxShared". PhysX functionality such as common types, PxFoundation, the task infrastructure are now part of PxShared.
 		</ul>
-		<li>Deprecated:</li>
+
+		<h3>Rigid Bodies</h3>
 		<ul>
-			<li>The following flags have been renamed and deprecated because the name did not properly reflect the root cause.</li>
+			<li>Added:</li>
 			<ul>
-				<li>PxContactPairHeaderFlag</li>
-				<ul>
-					<li>eDELETED_ACTOR_0 (use eREMOVED_ACTOR_0 instead)</li>
-					<li>eDELETED_ACTOR_1 (use eREMOVED_ACTOR_1 instead)</li>
-				</ul>
-				<li>PxContactPairFlag</li>
-				<ul>
-					<li>eDELETED_SHAPE_0 (use eREMOVED_SHAPE_0 instead)</li>
-					<li>eDELETED_SHAPE_1 (use eREMOVED_SHAPE_1 instead)</li>
-				</ul>
-				<li>PxTriggerPairFlag</li>
-				<ul>
-					<li>eDELETED_SHAPE_TRIGGER (use eREMOVED_SHAPE_TRIGGER instead)</li>
-					<li>eDELETED_SHAPE_OTHER (use eREMOVED_SHAPE_OTHER instead)</li>
-				</ul>
+				<li>An alternative simulation API has been introduced. This makes use of the following new functions: PsScene:collide(), PxScene::fetchCollision() and PxScene::advance(). Expected usage of these functions is illustrated in a new snippet SnippetSplitSim. This feature is also described in the manual in Section Simulation->SplitSim.</li>
+				<li>PxSceneFlag::eDEPRECATED_TRIGGER_TRIGGER_REPORTS has been introduced to re-enable the legacy behavior of trigger shape pairs sending reports. This flag and the corresponding legacy behavior will be removed in version 4.</li>
+				<li>The active actors feature has been added. See PxSceneFlag::eENABLE_ACTIVE_ACTORS.</li>
+				<li>Functionality to compute and manipulate mass, inertia tensor and center of mass of objects has been exposed in the new class PxMassProperties.</li>
+				<li>The option to exclude kinematics from the active actors/transforms list has been added. See PxSceneFlag::eEXCLUDE_KINEMATICS_FROM_ACTIVE_ACTORS for details.</li>
+				<li>The integrated pose of dynamic rigid bodies can be accessed earlier in the pipeline through a new callback (see the API documentation for PxSimulationEventCallback::onAdvance() and PxRigidBodyFlag::eENABLE_POSE_INTEGRATION_PREVIEW for details).</li>
+				<li>PxConvexMeshGeometryFlag::eTIGHT_BOUNDS has been added. See the user manual for details.</li>
+				<li>New split fetchResults methods introduced, see PxScene::fetchResultsBegin(), PxScene::fetchResultsFinish() and PxScene::processCallbacks(). This is intended to permit the application to parallelize the event notification callbacks.</li>
+				<li>New flag introduced to suppress updating scene query pruner trees inside fetchResults, see PxSceneFlag::eSUPPRESS_EAGER_SCENE_QUERY_REFIT. Instead, pruners will be updated during the next query.</li>
+				<li>Introduced GPU rigid body simulation support, see PxSceneFlag::eENABLE_GPU_DYNAMICS. GPU rigid body support requires SM3.0 or later.</li>
+				<li>Introduced a new GPU-accelerated broad phase. See PxBroadPhaseType::eGPU. GPU broad phase support requires SM3.0 or later.</li>
+				<li>Introduced a new enhanced determinism mode. See PxSceneFlag::eENABLE_ENHANCED_DETERMINISM. This provides additional levels of rigid body simulation determinism at the cost of some performance.</li>
+				<li>Introduced a new speculative contacts CCD approach. See PxRigidBodyFlag::eENABLE_SPECULATIVE_CCD. This is a slightly cheaper, less robust solution to PxRigidBodyFlag::eENABLE_CCD. There is no need to turn CCD on the scene using PxSceneFlag::eENABLE_CCD or enable PxPairFlag::eDETECT_CCD_CONTACT with this CCD mode as it functions as an extension to the discrete time-stepping scheme. This form of CCD can be enabled on kinematic actors. </li>
+				<li>New "immediate mode" API has been added. This exposes access to the low-level contact generation and constraint solver, which allows the application to use these PhysX low-level components to perform its own simulations without needing to populate and simulate a PxScene.</li>
+				<li>RigidDynamic lock flags added which permit the application to disallow rotation/translation of PxRigidDynamics around specific axes.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>PxRigidStatic objects can now have zero shapes while being part of a scene.</li>
+				<li>PxContactPairFlag::eINTERNAL_HAS_FACE_INDICES is obsolete and has been removed.</li>
+				<li>PxConstraintFlag::eDEPRECATED_32_COMPATIBILITY was previously only implemented for spring constraints. It is now correctly implemented for equality constraints.</li>
+				<li>PxSceneFlag::eENABLE_PCM is enabled by default. This means PhysX uses PCM distance-based collision detection by default. </li>
+				<li>Calls to PxRigidDynamic::setWakeCounter() following PxScene::collide() do now explicitly get taken into account in the subsequent call to PxScene::advance().</li>
+				<li>Calls to contact modification callbacks can be made from multiple threads simultaneously. Therefore, modification callbacks should must be thread-safe. </li>
+				<li>Unified heightfield contact generation is now the default heightfield contact generation approach. This approach offers similar performance and behavior to contact generation with triangle meshes. Unified heightfields have no thickness because contact genreation operates on triangles so objects may tunnel if CCD is not enabled.</li>
+				<li>When unified heightfield contact generation is in use, the bounds of heightfield shapes are no longer extruded by "thickness".</li>
+				<li>PxArticulationJoint::setTwistLimit and PxArticulationJoint::getTwistLimit were incorrectly documented with zLimit and yLimit in the wrong order.  The behavior of both functions remains unchanged but now they are correctly documented with zLimit and yLimit in the correct order. This is simply a clarification of the existing function behavior.</li>
+			</ul>
+			<li>Removed:</li>
+			<ul>
+				<li>The deprecated class PxFindOverlapTriangleMeshUtil has been removed. Please use PxMeshOverlapUtil instead.</li>
+				<li>The deprecated flag PxConstraintFlag::eREPORTING has been removed. Force reports are now always generated.</li>
+				<li>The following deprecated simulation event flags have been removed: PxContactPairHeaderFlag::eDELETED_ACTOR_0, ::eDELETED_ACTOR_1, PxContactPairFlag::eDELETED_SHAPE_0, ::eDELETED_SHAPE_1, PxTriggerPairFlag::eDELETED_SHAPE_TRIGGER, ::eDELETED_SHAPE_OTHER. Please use the following flags instead: PxContactPairHeaderFlag::eREMOVED_ACTOR_0, ::eREMOVED_ACTOR_1, PxContactPairFlag::eREMOVED_SHAPE_0, ::eREMOVED_SHAPE_1, PxTriggerPairFlag::eREMOVED_SHAPE_TRIGGER, ::REMOVED_SHAPE_OTHER.</li>
+				<li>The deprecated method PxPhysics::createHeightField(const PxHeightFieldDesc&) has been removed. Please use PxCooking::createHeightField(const PxHeightFieldDesc&, PxPhysicsInsertionCallback&) instead. The insertion callback can be obtained through PxPhysics::getPhysicsInsertionCallback().</li>
+			</ul>
+			<li>Deprecated:</li>
+			<ul>
+				<li>PxRigidActor::createShape() has been deprecated in favor of PxRigidActorExt::createExclusiveShape()</li>
+				<li>Trigger notification events for trigger-trigger pairs have been deprecated and will be omitted by default. See the 3.4 migration guide for more information.</li>
+				<li>The active transforms feature (PxSceneFlag::eENABLE_ACTIVETRANSFORMS) has been deprecated. Please use PxSceneFlag::eENABLE_ACTIVE_ACTORS instead. </li>
+				<li>PxRegisterHeightFields has been modified to register unified heightfields, which are now the default implementation. PxRegisterLegacyHeightFields() has been added to register the legacy (deprecated) heightfield contact gen approach.</li>
+				<li>PxHeightFieldDesc::thickness has been deprecated, as the new unified height field (see PxRegisterUnifiedHeightFields()) does not support thickness any longer.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>The capsule-vs-heightfield contact generation had a rare bug where a vertical capsule standing exactly on a shared edge could fall through a mesh. This has been fixed.</li>
+				<li>The bounding box of a shape was not always properly updated when the contact offset changed.</li>
+				<li>Fixed a bug in the GJK sweep caused by lost precision in the math</li>
+				<li>Calls to PxScene::shiftOrigin() can crash when PxRigidDynamic actors with PxActorFlag::eDISABLE_SIMULATION are present.</li>
+				<li>Fixed a bug when PxShape::setMaterials was called with less materials than shape had before.</li>
+				<li>Fixed a bug in CCD that could lead to a hang in the simulation.</li>
+				<li>Fixed a bug in PCM mesh edge-edge check for the parallel case. </li>
+				<li>Fixed a bug in CCD where contact modify callbacks could be called when the CCD did not detect a contact.</li>
+				<li>Fixed a bug with applying external force/torque to a body in buffered insertion stage.</li>
+				<li>A rare capsule-vs-mesh contact generation bug has been fixed.</li>
+				<li>A rare crash due to an invalid assert in the MBP broad-phase has been fixed. This error only affected debug & checked builds; release & profile builds were unaffected.</li>
 			</ul>
 		</ul>
-	</ul>
 
-	<h3>Vehicles</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Particles</h3>
 		<ul>
-			<li>In profile config the functions PxVehicleUpdates, PxVehiclePostUpdates and PxVehicleSuspensionRaycasts are now tracked with profile events (provided that a PxProfileZoneManager instance was passed to PxCreatePhysics).  These profile events can be viewed in profile view in pvd, where the names of the profile events match the names of the tracked vehicle functions.</li>
+			<li>Gpu: Maxwell Optimizations</li>
+			<li>The PhysX particle feature has been deprecated.</li>
+			<li>Fixed particle collision issue with PxParticleBaseFlag::ePER_PARTICLE_COLLISION_CACHE_HINT (on by default). When particles collided against very dense triangle mesh areas an assert would be triggered or particles would leak through the triangle mesh. A workaround was to disable PxParticleBaseFlag::ePER_PARTICLE_COLLISION_CACHE_HINT.</li>
 		</ul>
-		<li>Fixed:</li>
+
+		<h3>Cloth</h3>
 		<ul>
-			<li>In checked config PxVehicleDriveTank::allocate enforces the rule that only tanks with even numbers of wheels are legal and warns when this rule is broken.</li>
-			<li> In checked config PxVehicleDrive4W::allocate enforces the rule that only vehicles with 4 wheels or more are legal and warns when this rule is broken.</li>
-			<li>PxWheelQueryResult::localPose was incorrectly only set when the vehicle had a corresponding PxShape, as described by PxVehicleWheelsSimData::setWheelShapeMapping.  The localPose is now properly set independent of the mapping between wheel and shape.</li>
-			<li>Wheels resting on moving actors now properly observe the relative speed of the two actors when their relative speed is small.  This fixes a bug where at small relative speeds the velocity of the other actor was assumed to be zero.</li>
-			<li>Repx serialization failed to serialize PxVehicleWheelsSimData::setMinLongSlipDenominator, PxVehicleWheelsSimData::setSubStepCount, PxVehicleWheelsSimData::disableWheel, PxVehicleWheelsSimData::enableWheel and the number of entries in the engine torque curve.  These have now been fixed.</li>
-			<li>PxVehicleConcreteType used for vehicle serialization is now in the public API and has been moved to PxVehicleSDK.h.</li>
-			<li>Very small longitudinal and lateral slip angles could lead to numerical instabilities in some circumstances.  A threshold has been introduced to reject very small slip angles by setting them to zero when they are below the threshold.</li>
-			<li>Vehicles now account for rigid bodies that have been given a zero inertia component in order to lock rotation around the corresponding axis.</li>
-			<li>Fixed a bug where the sticky wheel constraints sometimes didn't function correctly.</li>
+			<li>Continuous collision (PxClothFlag::eSWEPT_CONTACT) behavior has been optimized to reduce cloth sticking to collision shape.</li>
+			<li>Added air resistance feature (see PxCloth::setWindVelocity(PxVec3), PxCloth::setWindDrag(PxReal), PxCloth::setWindLift(PxReal), PxClothFabricDesc::nbTriangles, PxClothFabricDesc::triangles.</li>
 		</ul>
-	</ul>
 
-	<h3>Cloth</h3>
-	<ul>
-		<li>Fixed:</li>
+		<h3>Serialization</h3>
 		<ul>
-			<li>The version number written to the fabric stream changed from PX_PHYSICS_VERSION to 1. A fabric can be created from streams written with version 3.3.0 and later until the stream format changes. Previously, the version of the producer and the consumer of the stream needed to match.</li>
-			<li>GPU cloth friction against convexes has been fixed.</li>
-			<li>A crash resulting from deleting a shape in proximity of a cloth with scene collision enabled has been fixed.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>PxTriangleMesh instances with adjacency information were not correctly initialized when created with cooking.createTriangleMesh. This caused a crash when converting the binary serialized triangle mesh data. </li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Scene Queries</h3>
-	<ul>
-		<li>Fixed:</li>
+		<h3>Character controller</h3>
 		<ul>
-			<li>PxMeshQuery::sweep now respects PxHitFlag::eMESH_BOTH_SIDES, and supports double-sided input triangles.</li>
-			<li>PxRigidBodyExt::linearSweepSingle and PxRigidBodyExt::linearSweepMultiple now correctly use query filter data instead of simulation filter data if filter data is not provided.</li>
-			<li>The spec for raycasts whose origin is located inside a solid object (sphere, box, capsule, convex) has changed back to what it was in 3.3.0. It was accidentally changed in 3.3.1. See the manual for details.</li>
-			<li>Convex sweeps against heightfields worked only when the heightfield had the identity transform.  This has now been fixed to support arbitrary transforms again.</li>
+			<li>Added:</li>
+			<ul>
+				<li>Profile zones have been added for the character controller.</li>
+				<li>Added PxControllerDesc::registerDeletionListener boolean defining if deletion listener for CCT should be registered.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>Character controllers cannot stand on dynamic triggers anymore.</li>
+				<li>Fixed: the capsule-vs-sphere sweep now returns a normal in the correct direction.</li>
+				<li>Fixed a bug where CCT shapes initially overlapping static geometry would be moved down by an incorrect amount (the length of the step offset).</li>
+				<li>The overlap recovery module now works against kinematic objects.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Cooking</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Vehicles</h3>
 		<ul>
-			<li>Using PxMeshPreprocessingFlag::eFORCE_32BIT_INDICES will always cook meshes with 32-bit triangle indices.</li>
+			<li>Added:</li>
+			<ul>
+				<li>
+					Anti-roll suspension has been added.	The class PxVehicleAntiRollBar and the functions PxVehicleWheelsSimData::setAntiRollBar,
+					PxVehicleWheelsSimData::getAntiRollBar, PxVehicleWheelsSimData::getNbAntiRollBars allow anti-roll bars to be configured and queried.
+				</li>
+				<li>
+					A new function PxVehicleSuspensionSweeps has been introduced.  This sweeps the PxShape that represents the wheel along the suspension direction. The hit planes resulting
+					from the sweep are used as driving surfaces similar to those found by PxVehicleSuspensionRaycasts.
+				</li>
+				<li>
+					A new snippet SnippetVehicleContactMod has been added.  This snippet demonstrates how to use sweeps and contact modification to allow the wheel's volume to fully interact
+					with the environment.
+				</li>
+				<li>A new function PxVehicleModifyWheelContacts has been introduced.  This function analyses contact in the contact modification callback and rejects contact points that represent drivable surfaces.</li>
+				<li>A new function PxVehicleSetMaxHitActorAcceleration has been introduced.  This function sets the maximum acceleration experienced by a PxRigidDynamic that finds itself under the wheel of a vehicle.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>
+					In checked build the functions PxVehicleDrive4W::allocate, PxVehicleDriveNW::allocate, PxVehicleDriveTank::allocate, PxVehicleNoDrive::allocate all return NULL and issue a warning if called before
+					PxInitVehicleSDK.
+				</li>
+				<li>Tire width is no longer accounted for when computing the suspension compression from raycasts (PxVehicleSuspensionRaycasts).  Instead, tire width is incorporated into the suspension compression arising from swept wheels (PxVehicleSuspensionSweeps).  It is recommended to use PxVehicleSuspensionSweeps if there is a strict requirement that the inside and outside of the wheel don't visibly penetrate geometry.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>Suspension force calculation now applies an extra force perpendicular to the spring travel direction.	This force is calculated to satisfy the constraint that the sprung mass only has motion along the spring travel direction.	This change mostly affects vehicles with suspension travel directions that are not vertical.</li>
+				<li>PxVehicleWheelsSimData::mThresholdLongitudinalSpeed and PxVehicleWheelsSimData::mMinLongSlipDenominator are now given default values that reflect the length scale set in PxTolerancesScale.</li>
+				<li>Unphysically large constraint forces were generated to resolve the suspension compression beyond its limit when the suspension direction and the hit normal under the wheel approach perpendicularity.  This has been fixed so that the constraint force approaches zero as the angle between the hit normal and suspension direction approaches a right angle.</li>
+			</ul>
 		</ul>
-		<li>Fixed:</li>
+
+		<h3>Scene queries</h3>
 		<ul>
-			<li>Convex hull cooking fix. Some input points could be ignored during cooking, fixed.</li>
-			<li>Inserted triangle meshes now respect 16 bit indices flag.</li>
+			<li>Added:</li>
+			<ul>
+				<li>PxPruningStructure was introduced as an optimization structure to accelerate scene queries against large sets of newly added actors.</li>
+				<li>PxScene::addActors(PxPruningStructure& ) has been added.</li>
+				<li>PxMeshQuery::sweep now supports PxHitFlag::eMESH_ANY.</li>
+				<li>PxHitFlag::eFACE_INDEX was introduced to reduce the perf cost for convex hull face index computation. In order to receive face index for sweeps against a convex hull, the flag PxHitFlag::eFACE_INDEX has to be set. Note that the face index can also be computed externally using the newly introduced method PxFindFaceIndex from the extensions library.</li>
+				<li>PxGeometryQuery::isValid was added to check provided geometry validity.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>Raycasts against triangle meshes with PxHitFlag::eMESH_MULTIPLE flag now return all hits, code for discarding hits close to each other has been removed.</li>
+				<li>PxPruningStructure enum has been renamed to PxPruningStructureType</li>
+			</ul>
+			<li>Deprecated:</li>
+			<ul>
+				<li>PxHitFlag::eDISTANCE has been deprecated.</li>
+				<li>The batched query feature has been deprecated.</li>
+				<li>Volume cache feature has been deprecated.</li>
+				<li>Spatial index feature has been deprecated.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>PxScene::sweep now properly implements PxHitFlag::eMESH_BOTH_SIDES (returned normal follows the same convention as for raycasts).</li>
+				<li>Raycasts against heightfields now correctly return multiple hits when PxHitFlag::eMESH_MULTIPLE flag is used.</li>
+				<li>PxSweepHit.faceIndex was computed incorrectly for sweep tests initially overlapping convex objects. The face index is now set to 0xffffffff in these cases.</li>
+				<li>Convex vs convex sweeps in PxGeometryQuery::sweep() do now correctly return the face index of the convex mesh that gets passed in as parameter geom1 (and not the one from geom0).</li>
+				<li>PxMeshQuery::sweep now supports PxHitFlag::eMESH_ANY.</li>
+				<li>Deprecated definition PxSceneQueryHit has been removed. Please use PxQueryHit instead.</li>
+				<li>PxGeometryQuery::computePenetration with convex geometries.</li>
+				<li>On Android platforms, the eDYNAMIC_AABB_TREE pruning structure could pass already released objects into the scene query filter callback.</li>
+			</ul>
+		</ul>
 
 
+		<h3>Cooking</h3>
+		<ul>
+			<li>Added:</li>
+			<ul>
+				<li>PxTriangleMeshCookingResult added, cookTriangleMesh now does return additional PxTriangleMeshCookingResult. Please see the manual for more information.</li>
+				<li>New convex hull generator added. It is now possible to switch between a new quickhull implementation and the legacy inflation based hull. Quickhull is the default algorithm.</li>
+				<li>Convex hulls can now be directly inserted in PxPhysics as triangle meshes and height fields.</li>
+				<li>A separate convex hull validation function has been added, it is now possible to create hulls without validation.</li>
+				<li>Convex hull generator vertex limit has two different algorithms - plane shifting and OBB slicing.</li>
+				<li>PxConvexFlag::eFAST_INERTIA_COMPUTATION added. When enabled, the inertia tensor is computed faster but with less precision.</li>
+				<li>PxConvexFlag::eGPU_COMPATIBLE added. When enabled convex hulls are created with vertex limit set to 64 and vertex limit per face is 32.</li>
+				<li>PxConvexFlag::eSHIFT_VERTICES added. When enabled input points are shifted to be around origin to improve computation stability.</li>
+				<li>PxCookingParams::gaussMapLimit has been added. The limit can now be fully user-defined. Please refer to the migration guide and best practices sections of the manual.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>The performance of convex creation from polygons has been improved.</li>
+			</ul>
+			<li>Deprecated:</li>
+			<ul>
+				<li>The PxPlatform enum and the PxGetGaussMapVertexLimitForPlatform function have been deprecated.</li>
+			</ul>
+			<li>Removed:</li>
+			<ul>
+				<li>The deprecated flags PxMeshPreprocessingFlag::eREMOVE_UNREFERENCED_VERTICES and ::eREMOVE_DUPLICATED_TRIANGLES have been removed. Meshes get cleaned up by default unless PxMeshPreprocessingFlag::eDISABLE_CLEAN_MESH is set.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>Mesh cooking was sometimes crashing for meshes with less than 4 triangles. This has been fixed.</li>
+				<li>Cooking convex mesh from a flat input mesh produced incorrect large mesh.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Geometry</h3>
-	<ul>
-		<li>Fixed:</li>
+		<h3>Extensions</h3>
 		<ul>
-			<li>PxHeightFieldDesc::thickness is now limited to [-PX_MAX_BOUNDS_EXTENTS, PX_MAX_BOUNDS_EXTENTS range]. (Previously unbounded).</li>
+			<li>Added:</li>
+			<ul>
+				<li>PxRaycastCCD has been added, to improve the visibility of the raycast-based CCD solution, which was previously only available in the Sample framework. This is a simpler and potentially cheaper alternative to the SDK's built-in continuous collision detection algorithm.</li>
+				<li>PxFindFaceIndex has been added. The function computes the closest polygon on a convex mesh from a given hit point and direction.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>Memory churn of PxDefaultMemoryOutputStream has been reduced.</li>
+				<li>The signatures for the PxComputeMeshPenetration and PxComputeHeightFieldPenetration functions have changed.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Particles</h3>
-	<ul>
-		<li>Fixed:</li>
+		<h3>Profiling</h3>
 		<ul>
-			<li>Setting PxParticleReadDataFlag::eREST_OFFSET_BUFFER on a PxParticleBase instance that was not created with the per particle rest offset option (see PxPhysics::createParticleSystem, PxPhysics::createParticleFluid and PxParticleBaseFlag::ePER_PARTICLE_REST_OFFSET) is not supported. The unsupported configuration may have resulted in crashes. The SDK now rejects this configuration on calling PxParticleBase::setParticleBaseFlag and issues an appropriate warning to the error stream.</li>
-			<li>Performance improvements on Kepler and above GPUs running SPH.</li>
-			<li>In rare cases particle systems could access released memory when all interactions with a rigid body shape were lost.</li>
+			<li>Changed:</li>
+			<ul>
+				<li>Profiling information is now available only in debug, checked and profile configuration.</li>
+				<li>PxProfileZoneManager::createProfileZoneManager now takes PxAllocatorCallback as input parameter instead of PxFoundation.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Serialization</h3>
-	<ul>
-		<li>Fixed:</li>
+		<h3>Physx Visual Debugger</h3>
 		<ul>
-			<li>PxBinaryConverter::convert potentially failed in checked mode with allocators that don't set 0xcd pattern. This has been fixed now.</li>
+			<li>PhysXVisualDebuggerSDK, PvdRuntime projects replaced with PxPvdSDK.</li>
+			<li>PxPvdSceneClient::drawPoints now takes physx::pvdsdk::PvdDebugPoint as input parameter instead of PxDebugPoint. drawLines, drawTriangles, drawText and so on.</li>
+			<li>The SDK's Debug visualization data is not sent to PVD anymore in ePROFILE mode.</li>
+			<li>PxPvdSceneFlag::eTRANSMIT_CONTACTS (instead of PxPvdSceneFlag::eTRANSMIT_CONSTRAINTS) was sometimes incorrectly used to control the transmission of constraint-related data. This has been fixed. In addition, the PxPvdSceneFlag flags are now consistently ignored when PxPvdInstrumentationFlag::eDEBUG is not set.</li>
 		</ul>
-	</ul>
 
-	</blockquote>
+		<h3>Aggregates</h3>
+		<ul></ul>
 
+		<h3>Snippets</h3>
+		<ul>
+			<li>Snippet profile zone has been removed.</li>
+		</ul>
+	</blockquote>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3.1</h1>
-<h2 style="TEXT-ALIGN: center">December 2013</h2>
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3.4</h1>
+	<h2 style="TEXT-ALIGN: center">October 2015</h2>
 
 	<h2>Supported Platforms</h2>
 	<blockquote>
-    <h3>Runtime</h3>
-    <ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
-		<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
-		<li>Microsoft XBox One</li>
-        <li>Microsoft XBox 360</li>
-		<li>Nintendo Wii U</li>
-        <li>Sony Playstation 3</li>
-		<li>Sony Playstation 4 (SDK only, no samples)</li>
-		<li>Sony Playstation Vita</li>
-    </ul>
-    <h3>Development</h3>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2008, 2010, 2012 (Windows RT only)</li>
-        <li>Xcode 4.6|5.0|5.0.1</li>
-    </ul>
-	</blockquote>
-	<h2>Known Issues</h2>
-	<blockquote>
-	<ul>
-	</ul>
+		<h3>Runtime</h3>
+		<ul>
+			<li>Apple iOS</li>
+			<li>Apple Mac OS X</li>
+			<li>Google Android ARM & x86 (version 2.2 or later required for SDK, 2.3 or later required for samples)</li>
+			<li>Linux (tested on Ubuntu)</li>
+			<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
+			<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
+			<li>Microsoft XBox One (SDK only, no samples)</li>
+			<li>Microsoft XBox 360</li>
+			<li>Nintendo Wii U</li>
+			<li>Sony Playstation 3</li>
+			<li>Sony Playstation 4 (SDK only, no samples)</li>
+			<li>Sony Playstation Vita</li>
+		</ul>
+		<h3>Development</h3>
+		<ul>
+			<li>Microsoft Windows XP or later</li>
+			<li>Microsoft Visual Studio 2012, 2013 and 2015</li>
+			<li>Xcode 6.3</li>
+		</ul>
 	</blockquote>
 	<h2>Changes and Resolved Issues</h2>
 	<blockquote>
 
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
 
-	<h3>General</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>General</h3>
 		<ul>
-			<li>The friction model can now be changed after scene instantiation with PxScene::setFrictionType.  The friction model can also be queried with PxScene::getFrictionType.</li>
+			<li>Added support for Microsoft Visual Studio 2015 for Windows builds. Note that the GPU features are not currently supported with Visual Studio 2015.</li>
+			<li>Removed support for Microsoft Visual Studio 2010</li>
+			<li>Added support for Android platform API Level 16 and removed support for platform API Level 8 and 9.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>Fixed a bug in aggregates that led to a crash when rigid bodies are added to an aggregate after removing all rigid bodies from an aggregate.   This only occurred with aggregates that were added to the scene and with rigid bodies that had shapes attached.</li>
+			</ul>
 		</ul>
-		<li>Changed:</li>
+
+		<h3>Rigid Bodies</h3>
 		<ul>
-			<li>PxDefaultSimulationFilterShader now supports particles and cloth as well.</li>
-			<li>PxSimulationFilterCallback: the provided actor and shape pointers are now defined as const. Note: this is no behavior change, it was never allowed to write to those objects from within the callback.</li>
-			<li>The PxTriangleMeshFlag::eHAS_16BIT_TRIANGLE_INDICES and PxTriangleMeshFlag::eHAS_ADJACENCY_INFO enums have been deprecated. Please use PxTriangleMeshFlag::e16_BIT_INDICES and PxTriangleMeshFlag::eADJACENCY_INFO instead.</li>
-			<li>Removed following functions from the API for platforms which do not support CUDA: PxGetSuggestedCudaDeviceOrdinal, PxCreateCudaContextManager, PxLoadPhysxGPUModule.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>Creating a PxConstraint or using PxConstraint::setActors() could cause a crash in situations where one of the two newly connected actors was part of a previously simulated graph of connected constraints (with some of them having projection enabled, i.e., PxConstraintFlag::ePROJECT_TO_ACTOR0 or ::ePROJECT_TO_ACTOR1 set).</li>
+				<li>PxD6Joint::getTwist(), getSwingY(), getSwingZ() returned incorrect angle values when the corresponding components of the quaternion were negative</li>
+				<li>The thickness of a heightfield was incorrectly applied when the heightfield transform had a non-identity quaternion. </li>
+				<li>PxD6Joint angular projection now functions correctly when there is one free axis and it is not the twist axis.</li>
+				<li>The bounding box of a shape was not always properly updated when the contact offset changed.</li>
+				<li>Fixed an edge case bug in PCM contact gen that could result in a QNAN reported as the contact point.</li>
+				<li>Fixed an uninitialized variable bug in the GJK algorithm resulting in unintialized closest points reported.</li>
+				<li>Fixed an edge case in which the constraint solver could access invalid memory in constraint partitioning.</li>
+				<li>Fixed a bug in capsule vs heightfield contact generation that could produce incorrect penetration depths.</li>
+				<li>Fixed a bug in Unified MTD code path which transformed the normal twice for the polygon index calculation. </li>
+				<li>Fixed a crash in height fields when a capsule collided with an edge whose shared triangles had a hole material.</li>
+			</ul>
 		</ul>
-		<li>Fixed:</li>
+
+		<h3>Serialization</h3>
 		<ul>
-			<li>Fixed concurrency issue on windows. Calling PxScene::simulate on multiple scenes concurrently may have caused a deadlock. This only happened if the scenes shared a single PxCpuDispatcher and the dispatcher was configured to use one worker thread only.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>PxTriangleMesh instances with adjacency information were not correctly initialized when created with cooking.createTriangleMesh. This caused a crash when converting the binary serialized triangle mesh data.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Rigid Bodies</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Scene Queries</h3>
 		<ul>
-			<li>The projection direction for constraints can now be specified through the flags PxConstraintFlag::ePROJECT_TO_ACTOR0, ::ePROJECT_TO_ACTOR1.</li>
-			<li>A parameter has been added to PxRigidBodyExt::updateMassAndInertia() and ::setMassAndUpdateInertia() to optionally take non-simulation shapes into account for computing the mass and the inertia tensor of a rigid body.</li>
-			<li>It is now possible to retrieve additional information in contact reports. See the API documentation of PxContactPairHeader.extraDataStream, PxPairFlag::ePRE_SOLVER_VELOCITY, ::ePOST_SOLVER_VELOCITY, ::eCONTACT_EVENT_POSE for details.</li>
-			<li>The contact report system has been extended to support multiple notification events if the same two objects collide multiple times in multipass CCD scenarios. See the API documentation of PxPairFlag::eNOTIFY_TOUCH_CCD for details.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>Sweeps against scaled meshes.</li>
+			</ul>
 		</ul>
 
-		<li>Changed:</li>
+		<h3>Cooking</h3>
 		<ul>
-			<li>If touching objects were added to the scene asleep and one of them got woken up, then all contact pairs of the touching objects which contained a static rigid body were resolved with a delay of one simulation step. Now these pairs get resolved without delay in the next simulation step.</li>
-			<li>If touching objects were added to the scene asleep, eNOTIFY_TOUCH_FOUND contact reports were sent out for pairs of dynamic rigid bodies if requested. These reports will not be sent at the end of the simulation step after insertion anymore but rather at the end of the simulation step after the touching objects have been woken up.</li>
-			<li>Rigid bodies now permit zeroes in passed to setMass and setMassSpaceInertiaTensor. Zeroes are interpreted to indicate infinite mass or infinite moment of inertia around a given principal axis of inertia. Previously, zeroes were illegal values to these methods. Note that zeroes are still illegal for instances of PxArticulationLink.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>Mesh cooking was sometimes crashing for meshes with less than 4 triangles. This has been fixed.</li>
+			</ul>
 		</ul>
 
-		<li>Fixed:</li>
+	</blockquote>
+
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3.3</h1>
+	<h2 style="TEXT-ALIGN: center">January 2015</h2>
+
+	<h2>Supported Platforms</h2>
+	<blockquote>
+		<h3>Runtime</h3>
 		<ul>
-			<li>Reading back the kinematic target in the PxSimulationEventCallback::onContact() callback through PxRigidDynamic::getKinematicTarget() will now work.</li>
-			<li>Contact reports are no longer generated for contact pairs involving two sleeping kinematic actors or for pairs involving a sleeping kinematic actor in contact with a static actor.  This fixes a bug that was introduced in 3.3.0. </li>
-			<li>No PxPairFlag::eNOTIFY_TOUCH_LOST event was sent in contact reports if a pair of sleeping rigid bodies got woken up after setting the pose on one of them (with autowake parameter set to false) and if the bounding boxes of the two objects still overlapped.</li>
-			<li>No PxPairFlag::eNOTIFY_TOUCH_PERSISTS event was sent in contact reports during the first simulation step after a pair of sleeping rigid bodies got woken up.</li>
-			<li>The inertia tensor computation for convex meshes has been adjusted to be more stable in certain cases where floating point precision issues arise. Furthermore, a fallback routine has been added to use an approximation if the diagonalized inertia tensor still ends up with invalid entries.</li>
-			<li>PxRigidBody::clearForce() and ::clearTorque() did not properly clear the respective properties if used with force mode PxForceMode::eIMPULES or PxForceMode::eVELOCITY_CHANGE.</li>
-			<li>Setting PxSceneFlag::eENABLE_KINEMATIC_STATIC_PAIRS also enabled PxSceneFlag::eENABLE_KINEMATIC_PAIRS internally and vice versa.</li>
-			<li>Missing validation checks for some joint set() methods have been added. Similarly to other API calls, when validation fails in the checked build PhysX will report an error and return without updating the joint.</li>
-			<li>Switching a kinematic rigid body to dynamic could lead to a crash in a subsequent simulation step, if the kinematic was moved and connected to another kinematic through a breakable PxConstraint/PxJoint.</li>
-			<li>Deleting a breakable PxConstraint/PxJoint while the simulation is running could lead to a crash if the PxConstraint/PxJoint broke in the same simulation step.</li>
-			<li>A bug in the PxScene::addBroadPhaseRegion() function, that could lead to a crash when using 'populateRegion=true', has been fixed.</li>
+			<li>Apple iOS</li>
+			<li>Apple Mac OS X</li>
+			<li>Google Android ARM & x86 (version 2.2 or later required for SDK, 2.3 or later required for samples)</li>
+			<li>Linux (tested on Ubuntu)</li>
+			<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
+			<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
+			<li>Microsoft XBox One (SDK only, no samples)</li>
+			<li>Microsoft XBox 360</li>
+			<li>Nintendo Wii U</li>
+			<li>Sony Playstation 3</li>
+			<li>Sony Playstation 4 (SDK only, no samples)</li>
+			<li>Sony Playstation Vita</li>
 		</ul>
-	</ul>
-
-	<h3>Particles</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Development</h3>
 		<ul>
-			<li>Added triangle mesh cache statistics for GPU particles.  Triangle mesh cache statistics are also captured by PVD as part of simulation statistics.</li>
-			<li>Added new option to query approximate particle velocities relative to colliding rigid actors.  This can be used for debris rotation on moving objects.  Enable with PxParticleReadDataFlag::eCOLLISION_VELOCITY_BUFFER and read from PxParticleReadData::collisionVelocityBuffer.</li>
+			<li>Microsoft Windows XP or later</li>
+			<li>Microsoft Visual Studio 2010, 2012 and 2013</li>
+			<li>Xcode 6.2</li>
 		</ul>
-		<li>Fixed:</li>
+	</blockquote>
+	<h2>Known Issues</h2>
+	<blockquote>
 		<ul>
-			<li>Fixed a bug which might lead to GPU particle pipeline failures on low end GPUs.</li>
-			<li>Enabled warning when a spatial data structure overflow occured for GPU particles (see the guide for more information).</li>
+			<li>The combination of releasing an actor and reassigning the actors of any affected joint so that the joint no longer references the released actor will lead to a crash if these operations are performed as buffered calls ie after simulate but before fetchResults.</li>
 		</ul>
-	</ul>
+	</blockquote>
+	<h2>Changes and Resolved Issues</h2>
+	<blockquote>
 
-	<h3>Cloth</h3>
-	<ul>
-		<li>Fixed:</li>
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+
+		<h3>General</h3>
 		<ul>
-			<li>PxFilterFlag::eSUPPRESS was ignored for collision pairs that involved a PxCloth object. This does work now, however, please note that PxFilterFlag::eSUPPRESS is treated as PxFilterFlag::eKILL for pairs with a PxCloth object.</li>
+			<li>Added support for Microsoft Visual Studio 2013 and removed support for Microsoft Visual Studio 2008.</li>
+			<li>Where applicable, mutexes on Unix flavored platforms now inherit priority to avoid priority inversion. This is a default behavior of Windows mutexes.</li>
+			<li>Added arm64 support for the iOS version of the PhysX SDK.</li>
+			<li>Removed samples from the iOS version of the PhysX SDK.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>x64 samples running on Windows 8.</li>
+				<li>Concurrent calls to PxPhysics::(un)registerDeletionListener() and PxPhysics::(un)registerDeletionListenerObjects() were not safe.</li>
+				<li>PxGeometryQuery::pointDistance() could sometimes read uninitialized memory. This has been fixed.</li>
+				<li>The SDK will not crash anymore if an object is involved in more than 65535 interactions. Instead, it will emit an error message and the additional interactions will be ignored.</li>
+				<li>The PxPhysics::getMaterials() function did not work with a non-zero 'startIndex' parameter. This has been fixed.</li>
+				<li>The following static Android libs are now packed into a single library called PhysX3: LowLevel, LowLevelCloth, PhysX3, PhysXProfileSDK, PhysXVisualDebuggerSDK, PvdRuntime, PxTask, SceneQuery and SimulationController. This fixes cyclic dependencies between these libraries.</li>
+				<li>FSqrt(0), V4Sqrt(0), V4Length(0) and V3Length(0) will return 0 instead of QNan in Android and iOS.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Serialization</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Rigid Bodies</h3>
 		<ul>
-			<li>Support for binary compatibility between different sdk releases and patches has been added (PX_BINARY_SERIAL_VERSION). The current sdk version can load binary data of the older sdk versions listed in the documentation of PX_BINARY_SERIAL_VERSION. </li>
-			<li>SnippetLoadCollection has been added. It illustrates loading repx or binary serialized collections and instatiating the objects in a scene. It only compiles and runs on authoring platforms (windows, osx and linux). </li>
-			<li>SnippetConvert has been added. It illustrates how to convert PhysX 3 serialized binary files from one platform to another. It only compiles and runs on authoring platforms (windows, osx and linux). </li>
+			<li>Fixed:</li>
+			<ul>
+				<li>The Prismatic joint limit now acts correctly when its frame is not the identity.</li>
+				<li>Calling PxRigidDynamic::setGlobalPose() with the autowake parameter set to true could result in an assert when the rigid body got released and had the PxActorFlag::eDISABLE_SIMULATION flag set.</li>
+				<li>Added errors on misuse of PxRegister[Unified]Heightfields() function, and documented it.</li>
+				<li>PxConstraint has a eDISABLE_PREPROCESSING flag and minResponseThreshold attribute to assist in stabilizing stiffness caused by infinite inertias or mass modification.</li>
+				<li>Island manager performance in the presence of large numbers of kinematic actors has been improved.</li>
+				<li>Using PxConstraint::setActors() or PxJoint::setActors() could cause a crash if the new actors resulted in the constraint/joint being removed from the scene.</li>
+				<li>The functions PxRigidActor::setGlobalPose and PxShape::setLocalPose failed to update cached contact data internal to PhysX.  This led to contacts being generated with transforms that were no longer valid.  Similarly, contacts could be missed due to transforms being invalid.  This defect affected the classes PxRigidStatic and PxRigidDynamic, though it was more immediately noticeable with the PxRigidStatic class.</li>
+				<li>The sphere-vs-mesh contact generation code has been improved. It could previously generate wrong contacts. This has been fixed.</li>
+				<li>The capsule-vs-convex contact generation had a bug that could lead to rare invalid contacts. This has been fixed.</li>
+				<li>The mesh contact generation had a bug on PS3 that could lead to invalid contacts. This has been fixed.</li>
+				<li>The PxRigidBody::clearForce() and PxRigidBody::clearTorque() were not properly decoupled - they both cleared the force and the torque. This has been fixed.</li>
+				<li>Repeatedly calling PxRigidActor::attachShape and PxRigidActor::detachShape in between calls to simulate resulted in a memory leak.  This has been fixed.</li>
+			</ul>
+			<li>Added:</li>
+			<ul>
+				<li>Enabling CCD on kinematic actors is now disallowed. When the kinematic flags are raised on a CCD-enabled actor, CCD is automatically disabled. </li>
+			</ul>
 		</ul>
-			<li>Deprecated:</li>
+
+		<h3>Particles</h3>
 		<ul>
-			<li>Method PxCollection::addRequire is deprecated, use PxCollection::add and PxCollection::contains instead. </li>
-			<li>Method PxCollection::createBinaryConverter(PxSerializationRegistry&) is deprecated, use PxCollection::createBinaryConverter() instead. </li>
+			<li>Fixed:</li>
+			<ul>
+				<li>
+					Consistency between GPU and CPU particles has been improved in the case of a spatial date structure overflow. The positions and velocities of particles that have the PxParticleFlag::eSPATIAL_DATA_STRUCTURE_OVERFLOW set
+					are now updated also for CPU particles.
+				</li>
+				<li>Fixed potential deadlocks from occurring in the GPU particle kernels running on GM204 and above GPUs.</li>
+				<li>Fixed fluid simulation crash on Titan X.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Character controller</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Cloth</h3>
 		<ul>
-			<li>PxControllerManager::setPreventVerticalSlidingAgainstCeiling() has been added, to control the behaviour of characters against ceilings.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>A bug related to hitting the sphere or plane limit while world collision is enabled has been fixed.</li>
+				<li>PxCloth::getParticleAccelerations() implementation was fixed for GPU cloth.</li>
+			</ul>
 		</ul>
-	</ul>
 
-
-	<h3>Vehicles</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Character controller</h3>
 		<ul>
-			<li>Vehicles may now be updated concurrently through the addition of a new function PxVehiclePostUpdates and passing a PxVehicleConcurrentUpdateData array to PxVehicleupdates.</li>
-			<li>A new snippet SnippetVehicleMultiThreading has been added to show the operation of concurrent vehicle updates.</li>
-			<li>PxVehicleDriveTankControl and PxVehicleDriveTankControlModel now have improved doxy comments.</li>
-			<li>A new function PxVehicleUpdateCMassLocalPose has been added to help update a vehicle after the center of mass pose of the vehicle's actor has been modified.</li>
-			<li>PxWheelQueryResult now records the local pose of the wheel.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>Character controllers cannot stand on dynamic triggers anymore.</li>
+			</ul>
+			<li>Added:</li>
+			<ul>
+				<li>added lockingEnabled parameter to PxCreateControllerManager(), to support thread-safe release of objects while the character controller's move() routine is executing.</li>
+			</ul>
 		</ul>
-		<li>Changed:</li>
+
+		<h3>Scene Queries</h3>
 		<ul>
-			<li>PxVehcicleDrive4W::setup now tests that at least 4 wheels are specified and returns wtih an error message if numWheels < 4.  It is only possible to create a PxVehicleDrive4W instance with less than 4 active wheels by disabling wheels after instantiating a 4-wheeled car.</li>
-			<li>In debug and checked build configurations PxVehicleComputeSprungMasses now reports whether the sprung masses were successfully computed.   Warnings are passed to the error stream in checked configuration if the function does not complete successfully.  Apart from error checking the operation of the function is unchanged.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>Raycasts against triangle meshes with large scales potentionaly failed to register a hit.</li>
+				<li>Overlaps against height field meshes with the flag eNO_BOUNDARY_EDGES potentionaly failed to register a hit.</li>
+				<li>Sweeps using shapes modelled around a space significantly offset from their geometric center could fail to register a hit. </li>
+			</ul>
 		</ul>
-		<li>Fixed:</li>
+
+		<h3>Vehicles</h3>
 		<ul>
-			<li>The doxy comment describing the default setting for PxVehicleWheelsSimData::setWheelShapeMapping was incorrect.  This now correctly documents the default mapping as PxVehicleWheelsSimData::setWheelShapeMapping(i,i).</li>
-			<li>Suspensions raycasts that start inside geometry are ignored for all geometry types.  Prior to this release this was true for all geometry types except for heightfields and triangle meshes.  This inconsistency has now been fixed so that all geometry types obey the rule that suspension raycasts starting inside geometry are neglected.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>Sticky tire friction was unreliable with more than one substep but is now fixed.  This defect led to vehicles sliding down slopes where the sticky friction should have held firm.</li>
+				<li>An error in the jounce speed calculation that led to lift force at high forward speeds has been fixed.  This defect led to instabilities at high speed.</li>
+				<li>Improved documentation for PxVehicleSuspsensionData::mSprungMass.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Scene queries</h3>
-	<ul>
-	    <li>Added:</li>
-	    <ul>
-	        <li>Added eMTD flag. If an initial overlap is detected, this flag triggers the sweep to compute the MTD (Minimum Translation Direction), which can be used to de-penetrate the query shape from the shape with which an initial overlap was found. In this case, the distance reported will be negative. This negative distance can be used to scale the reported normal to generate the translation vector required to de-penetrate the query shape. </li>
-	        <li>Added PxTriangle::pointFromUV.</li>
-	    </ul>
-		<li>Fixed:</li>
+		<h3>Cooking</h3>
 		<ul>
-			<li>A rare ray-capsule intersection bug has been fixed, when the capsule's height is close to zero.</li>
-			<li>A capsule-capsule distance bug has been fixed, when the tested capsules are large and parallel.</li>
-			<li>Raycasts against heightfields now correctly return triangle UVs.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>Convex meshes generated from PhysX 3.2 were not able to load inside PhysX 3.3.</li>
+			</ul>
 		</ul>
-		<li>Changed:</li>
+
+	</blockquote>
+
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3.2</h1>
+	<h2 style="TEXT-ALIGN: center">September 2014</h2>
+
+	<h2>Supported Platforms</h2>
+	<blockquote>
+		<h3>Runtime</h3>
 		<ul>
-			<li>PxBatchQuery::raycast, overlap and sweep previously had an incorrect const modifier indicating that these methods were safe to call from multiple threads simultaneously. This has been removed.
-			Multiple batch queries can still be executed (via PxBatchQuery::execute()) in parallel.</li>
+			<li>Apple iOS</li>
+			<li>Apple Mac OS X</li>
+			<li>Google Android ARM & x86 (version 2.2 or later required for SDK, 2.3 or later required for samples)</li>
+			<li>Linux (tested on Ubuntu)</li>
+			<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
+			<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
+			<li>Microsoft XBox One (SDK only, no samples)</li>
+			<li>Microsoft XBox 360</li>
+			<li>Nintendo Wii U</li>
+			<li>Sony Playstation 3</li>
+			<li>Sony Playstation 4 (SDK only, no samples)</li>
+			<li>Sony Playstation Vita</li>
 		</ul>
-	</ul>
-
-	<h3>Cooking</h3>
-	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>PxCookingParams::meshSizePerformanceTradeOff parameter can be used to make the mesh smaller at the expense of reduced simulation and scene query performance (or the other way around).</li>
-			<li>PxCookingParams::meshCookingHint parameter can be used to specify mesh hierarchy construction preference (cooking speed or simulation speed).</li>
-			<li>PxMeshPreprocessingFlag::eDISABLE_CLEAN_MESH disables mesh clean proces. Vertices duplicities are not searched, huge triangles test is not done. Vertices welding is not done. Does speed up the cooking.</li>
-			<li>PxMeshPreprocessingFlag::eDISABLE_ACTIVE_EDGES_PRECOMPUTE disables vertex edge precomputation. Makes cooking faster but slow up contact generation.</li>
-			<li>PxPhysicsInsertionCallback adds the support for inserting cooked triangle mesh or height field directly into PxPhysics without the stream serialization.</li>
-			<li>PxCooking::createTriangleMesh creates triangle mesh and inserts it to PxPhysics without using the stream serialization.</li>
-			<li>PxCooking::createHeightField creates height field and inserts it to PxPhysics without using the stream serialization.</li>
-			<li>PxCooking::validateTriangleMesh validates mesh in separate function before it can be cooked without the mesh cleaning.</li>
-			<li>PxConvexFlag::eCHECK_ZERO_AREA_TRIANGLES checks and removes almost zero-area triangles during the computation of the convex hull.</li>
-			<li>PxCookingParams::areaTestEpsilon triangle area size was added. This epsilon is used for the zero-area test in the computation of the convex hull.</li>
-		</ul>
-		<li>Changed:</li>
-		<ul>
-			<li>PxCooking::computeHullPolygons does now return also vertices used by the polygons. Redundant vertices are removed.</li>
-			<li>PxCooking::cookConvexMesh now returns a PxConvexMeshCookingResult::Enum with additional error information.</li>
-		</ul>
-	</ul>
-
-	<h3>Aggregates</h3>
-	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>PxSceneLimits has a new member variable maxNbAggregates.  Setting this value to a good approximation of the peak number of aggregates will avoid the need for internal run-time allocations that can occur when aggregates are added to the scene. </li>
-		</ul>
-		<li>Fixed:</li>
-		<ul>
-			<li>PxScene::removeActor will auotmatically remove that actor from all PxAggregate instances that contain the removed actor.  Likewise, PxScene::removeArticulation will automatically remove all articulation links from all relevant aggregates.  This fix upholds the rule that all actors of an aggregate must be in same scene.</li>
-			<li>The condition of an internal assert that triggered after calling PxScene::addAggregate has been corrected. This assert triggered when an aggregate was added to the scene after removal of all aggregates from the scene.  The operation of the function PxScene::addAggregate is unchanged apart from the asserted condtition.</li>
-		</ul>
-	</ul>
-
-	<h3>Samples</h3>
-	<ul>
-		<li>Changed:</li>
+		<h3>Development</h3>
 		<ul>
-			<li>Starting with Release 302 drivers, application developers can direct the Optimus driver at runtime to use the High Performance Graphics to render any application - even those applications for which there is no existing application profile. The samples now make use of this feature to enable High Performance Graphics by default.</li>
+			<li>Microsoft Windows XP or later</li>
+			<li>Microsoft Visual Studio 2008, 2010, 2012 (Windows RT only)</li>
+			<li>Xcode 4.6|5.0|5.0.1</li>
 		</ul>
-	</ul>
-	</blockquote>
-
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3</h1>
-<h2 style="TEXT-ALIGN: center">September 2013</h2>
-
-	<h2>Release Highlights</h2>
-	<blockquote>
-	<ul>
-		<li>Added PhysXDevice/64.dll to the PC packages. See the Windows readme for details.</li>
-		<li>Added support for the NVIDIA Kepler GPU architecture.</li>
-		<li>Added support for the Nintendo Wii U console.</li>
-		<li>Added support for Windows 8 Modern UI applications (ARM and x86).</li>
-		<li>Ported our SIMD library to the ARM NEON architecture.</li>
-		<li>Multi Box Pruning (MBP) is offered as an alternative broad phase algorithm to Sweep And Prune (SAP). MBP shows improved performance when most objects are moving or when inserting large numbers of objects. SAP can be faster in scenarios with few moving (many sleeping) objects.</li>
-		<li>Significant performance and stability optimizations for rigid body solver.</li>
-		<li>New function to compute the minimum translational distance and direction to separate two overlapping geometry objects.</li>
-		<li>New 'PCM' contact generation mode which is often faster and more robust than the still available legacy path.</li>
-		<li>Improved performance of scene queries and contact reports.</li>
-		<li>Improved behavior and performance of Continuous Collision Detection (CCD).</li>
-		<li>Reduced memory footprint of rigid body classes.</li>
-		<li>Added support for sharing shapes among rigid bodies.</li>
-		<li>Significantly improved cloth behavior and GPU performance.</li>
-		<li>Added support for cloth colliding against itself, other cloth instances, and scene geometry.</li>
-		<li>Improved useability of binary and xml serialization.</li>
-		<li>Memory can be saved for objects that do not participate in the simulation and are used for scene queries only. For details see the new flag PxActorFlag::eDISABLE_SIMULATION.</li>
-	</ul>
-	</blockquote>
-
-	<h2>Supported Platforms</h2>
-	<blockquote>
-    <h3>Runtime</h3>
-    <ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
-		<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
-		<li>Microsoft XBox One</li>
-        <li>Microsoft XBox 360</li>
-		<li>Nintendo Wii U</li>
-        <li>Sony Playstation 3</li>
-		<li>Sony Playstation 4 (SDK only, no samples yet)</li>
-		<li>Sony Playstation Vita</li>
-    </ul>
-    <h3>Development</h3>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2008, 2010, 2012 (Windows RT/PS4/XboxOne only)</li>
-		<li>Xcode 4.6</li>
-    </ul>
 	</blockquote>
-
 	<h2>Known Issues</h2>
 	<blockquote>
-	<ul>
-		<li>PxSimulationOrder::eSOLVE_COLLIDE feature is not implemented in this release.  Calls to PxScene::solve() and PxScene::collide() will be ignored with a warning added to the error stream</li>
-		<li>Reading back the kinematic target through PxRigidDynamic::getKinematicTarget() in the PxSimulationEventCallback::onContact() callback will fail.</li>
-		<li>Cloth self-collision without rest position is not supported on GPUs with SM capability lower than 2.0.</li>
-	</ul>
+		<ul>
+			<li>The combination of releasing an actor and reassigning the actors of any affected joint so that the joint no longer references the released actor will lead to a crash if these operations are performed as buffered calls ie after simulate but before fetchResults.</li>
+		</ul>
 	</blockquote>
-	<h2>Changes and Resolved Issues</h3>
+	<h2>Changes and Resolved Issues</h2>
 	<blockquote>
 
-	<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
 
-	<h3>General</h3>
-	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>PxScene::setLimits: Hint to preallocate capacities of various data structures. See PxSceneLimits.</li>
-			<li>Scalar constructors PxQuat(r), PxMat33(r), PxMat33(r).</li>
-			<li>identity constructors for PxMat33, PxMat44, PxQuat, PxTransform - e.g. PxTransform(PxIdentity).</li>
-			<li>zero constructors for PxMat33, PxMat44, PxVec3 - e.g. PxMat33(PxZero).</li>
-			<li>PxTransform(x, y, z) constructor was added as a shortcut for PxTransform(PxVec3(x,y,z)).</li>
-			<li>The GPU code uses CUDA version 5.0 and supports Kepler GPUs (SM version 3.0 and 3.5).
-			<li>Helper method PxContactPair::bufferContacts() has been added to copy the contact pair data stream into a user buffer.</li>
-			<li>PxGeometryQuery::computePenetration() has been added, to compute the Minimum Translational Distance between geometry objects.</li>
-			<li>Ability for APEX and other PhysX extensions to change the PhysX Visual Indicator.</li>
-			<li>Reporting allocation names can now be enabled or disabled (see PxFoundation::setReportAllocationNames). When enabled, some platforms allocate memory through 'malloc'.</li>
-			<li>The scene origin can now be shifted to better support big world scenarios. See PxScene::shiftOrigin() for details.</li>
-			<li>PxAssertHandler got extended with a boolean parameter to support ignoring specific asserts. See windows implementation of DefaultAssertHandler.</li>
-			<li>Added new PxPairFlags - PxPairFlag::eSOLVE_CONTACT and PxPairFlag::eDETECT_DISCRETE_CONTACT.</li>
-		</ul>
-		<li>Removed:</li>
-		<ul>
-			<li>The obsolete PxConvexFlag::eUSE_UNCOMPRESSED_NORMALS flag has been removed.</li>
-			<li>The PxSceneFlag::eDISABLE_SSE was obsolete, and has now been removed.</li>
-			<li>The obsolte PxPtrArray has been removed</li>
-		</ul>
-		<li>Changed:</li>
-		<ul>
-			<li>Mesh BVH construction was significantly improved for meshes with a mix of large and small triangles. Mesh sizes are now slightly increased, but traversals are substantially faster. As a side effect, cooked mesh format has changed. This requires meshes to be recooked!</li>
-			<li>The specification for valid PxBounds3 representations has changed. See the API documentation for details (especially the newly introduced PxBounds3::isValid() method).</li>
-			<li>PxBounds3::transform(), ::scale() and ::fatten() have been split into a fast and a safe version to avoid unnecessary checks and improve stability for empty bounds respectively.</li>
-			<li>PxBounds3::setInfinite() has been renamed to PxBounds3::setMaximal() to better fit the actual behavior.</li>
-			<li>PxTask: PVD profile events for tasks are now only emitted in profile builds (all platforms).</li>
-			<li>Platform mutex implementations now verify that lock/unlock come from correct thread in debug builds.</li>
-			<li>PxWindowsDelayLoadHook.h has been moved from Include/foundation/windows to Include/common/windows.</li>
-			<li>API instances of 'Num' as in 'maxNum' and 'getNum' have changed uniformly to 'Nb'.</li>
-			<li>The following classes have been renamed:
-				<ul>
-					<li>PxSceneQueryHit to PxQueryHit</li>
-					<li>PxSceneQueryFlags to PxHitFlags</li>
-					<li>PxSceneQueryHitType to PxQueryHitType</li>
-					<li>PxSceneQueryFilterData to PxQueryFilterData</li>
-					<li>PxSceneQueryFilterCallback to PxQueryFilterCallback</li>
-					<li>PxSceneQueryFilterFlags to PxQueryFlags</li>
-					<li>PxSceneQueryCache to PxQueryCache</li>
-					<li>PxCCTNonWalkableMode to PxControllerNonWalkableMode</li>
-					<li>PxControllerFlags to PxControllerCollisionFlags</li>
-					<li>PxCCTHit to PxControllerHit</li>
-					<li>PxConstraintDominance to PxDominanceGroupPair</li>
-					<li>PxActorTypeSelectionFlags to PxActorTypeFlags</li>
-					<li>PxFindOverlapTriangleMeshUtil to PxMeshOverlapUtil</li>
-				</ul>
-				The previous names have been retained for compatibility but are deprecated.</li>
-			<li>PX_SLEEP_INTERVAL has been replaced with the new parameter PxSceneDesc::wakeCounterResetValue to specify the wake counter value to set when wakeUp() gets called on dynamic objects.</li>
-			<li>PxClientBehaviorBit has been renamed PxClientBehaviorFlag, PxActorClientBehaviorBit has been renamed PxActorClientBehaviorFlag. Names of related functions have also changed.</li>
-			<li>queryClient parameter in raycast(), sweep(), overlap() functions was moved inside of PxQueryFilterData struct.</li>
-			<li>The PxObserver/PxObservable system has been replaced by the PxDeletionListener API. The supported object types have been extended from PxActor to all core objects inheriting from PxBase. Furthermore, two kinds of deletion events are now distinguished: user release and memory release. Please read the API documentation for details.</li>
-			<li>Deprecated PxPairFlag::eRESOLVE_CONTACT. Use PxPairFlag::eDETECT_DISCRETE_CONTACT and PxPairFlag::eSOLVE_CONTACT instead.</li>
-			<li>Number of materials per shape is now PxU16 instead of PxU32, contact material information also now returns PxU16.</li>
-			<li>Maximum number of touching hits in batched queries is now PxU16 instead of PxU32.</li>
-			<li>SweepEpsilonDistance has been replaced by meshContactMargin and marked as deprecated. Please read the API documentation for details.</li>
-			<li>PxShape::resetFiltering() and PxParticleBase::resetFiltering() have been deprecated. Please use one of the new overloaded methods PxScene::resetFiltering() instead.</li>
-			<li>The pxtask namespace has been removed and it's types have been added to the physx namespace with a Px* prefix</li>
-			<li>The delay load hook PxDelayLoadHook::setPhysXInstance has been renamed to PxSetPhysXDelayLoadHook and PxDelayLoadHook::setPhysXCookingInstance has been renamed to PxSetPhysXCookingDelayLoadHook</li>
-		</ul>
-		<li>Fixed:</li>
-		<ul>
-			<li>Calling Thread::setAffinityMask() before the thread has been started is now supported.</li>
-			<li>PxDefaultSimulationFilterShader ignored the value of the second filter constant in the collision filtering equation and used the value of the first filter constant instead.</li>
-		</ul>
-		<li>Deprecated:</li>
+		<h3>General</h3>
 		<ul>
-			<li>The PxScene::flush() method has been deprecated, please use PxScene::flushSimulation().</li>
-			<li>PxRigidDynamicFlag has been deprecated and replaced with PxRigidBodyFlag to allow flags to be shared between PxArticulationLink and PxRigidDynamic.</li>
-			<li>PxTransform::createIdentity(), PxQuat::createIdentity(), PxMat33::createIdentity(), PxMat44::createIdentity(), PxMat33::createZero(), PxMat44::createZero()</li>
+			<li>Added:</li>
+			<ul>
+				<li>The PhysXCommon/64.dll, nvcuda.dll and PhysXUpdateLoader/64.dll are loaded and checked for the NVIDIA Corporation digital signature. The signature is expected on all NVIDIA Corporation provided dlls. The application will exit if the signature check fails.</li>
+				<li>Added the PxDefaultBufferedProfiler extension for simplified SDK profile events extraction.</li>
+				<li>PxSceneDesc::sanityBounds allows a bounding box to be set for validating the position coordinates of inserted or updated rigid actors and articulations.</li>
+				<li>Linux: Now supports GPU PhysX.</li>
+				<li>Added set/getRunProfiled() for PxDefaultCpuDispatcher to control profiling at task level.</li>
+				<li>Android: Support for x86 based devices was added.</li>
+				<li>PxProfileEventHandler::durationToNanoseconds() added. Translates event duration in timestamp (cycles) into nanoseconds.</li>
+				<li>Added SnippetProfileZone to show how to retrieve profiling information.</li>
+				<li>Added SnippetCustomJoint to better illustrate custom joint implementation, and removed SnippetExtension.</li>
+				<li>Added SnippetStepper to demonstrate kinematic updates while substepping with tasks.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>PxTask::runProfiled() now takes threadId as a parameter.</li>
+				<li>The static pruner now issues a performance warning in debug and checked configurations when a tree rebuild occurs and the tree is not empty.</li>
+				<li>PxSceneDesc::staticStructure now defaults to PxPruningStructure::eDYNAMIC_AABB_TREE.</li>
+				<li>Linux: Switched to shared libraries.</li>
+				<li>Profile zone event names changed to match function calls.</li>
+				<li>Overlapping read/write errors will now issue a PxErrorCode::eINVALID_OPERATION rather than PxErrorCode::eDEBUG_INFO.</li>
+				<li>Improved SnippetToleranceScale to better demonstrate the intended use case.</li>
+				<li>Increased 126 characters limit for warnings on unix platforms, 1k limit on all platforms.</li>
+				<li>PhysXCommon dll load within PhysX dll now respects dll name. Please see the manual's PhysXCommon DLL load section.</li>
+				<li>Significant revision of the user's guide.  Both structure and most content have been modified.</li>
+				<li>Fixed search function of user's guide.</li>
+				<li>Foundation math classes now have in-place arithmetic operators (+= etc).</li>
+				<li>PxScene::checkResults() no longer counts as a read API method. Hence it is now possible to call this method in blocking mode without causing all writes to block until it returns.</li>
+			</ul>
+			<li>Deprecated:</li>
+			<ul>
+				<li>Indexing operators taking signed integers in PxVec3, PxVec4, PxMat33, PxMat44, PxStrideIterator have been deprecated.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Character controller</h3>
-	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>The PxControllerBehaviorFlag::eCCT_USER_DEFINED_RIDE flag has been added.</li>
-			<li>A new helper function PxController::resize() has been added to facilitate character controller resizing.</li>
-			<li>A new runtime tessellation feature has been added that can help reducing FPU accuracy issues in the sweep tests.</li>
-			<li>PxControllerFilterCallback has been added to make CCT-vs-CCT filtering more flexible.</li>
-			<li>An overlap recovery module has been added. See PxControllerManager::setOverlapRecoveryModule</li>
-			<li>PxObstacle notifications has been added to handle touched obstacles.</li>
-			<li>PxObstacleContext::getObstacleByHandle has been added.</li>
-			<li>The origin of character controllers and obstacles can now be shifted to stay in sync when the origin of the underlying PxScene is shifted. See PxControllerManager::shiftOrigin() for details.</li>
-		</ul>
-		<li>Changed:</li>
+		<h3>Rigid Bodies</h3>
 		<ul>
-			<li>The PxControllerManager is now tightly coupled to a PxScene which has to be provided on creation. See PxCreateControllerManager().</li>
-			<li>The PxObstacleContext instances of a PxControllerManager will now get released automatically when the manager is released.</li>
-			<li>PxController::reportSceneChanged() has been renamed to PxController::invalidateCache().</li>
-			<li>PxControllerBehaviorFlag::eCCT_CAN_RIDE_ON_OBJECT is not the default behavior anymore.</li>
-			<li>PxCCTNonWalkableMode::eFORCE_SLIDING has been renamed to PxCCTNonWalkableMode::ePREVENT_CLIMBING_AND_FORCE_SLIDING.</li>
-			<li>PxCCTInteractionMode has been renamed PxControllerInteractionMode</li>
-			<li>PxCCTNonWalkableMode has been renamed PxControllerNonWalkableMode</li>
-			<li>PxCCTHit has been renamed PxControllerHit</li>
-			<li>Touched PxObstacle has been replaced by touched ObstacleHandle.</li>
-			<li>PxControllerInteractionMode and PxControllerFilters::mActiveGroups have been removed. Please use the new PxControllerFilterCallback instead.</li>
-		</ul>
-		<li>Fixed:</li>
-		<ul>
-			<li>Bugs with respect to deleting shapes from a touching actor have been fixed.</li>
-			<li>Touched actor's scene is checked in CCT::move before rideOnTouchedObject is called to ensure that the touched shape is valid.</li>
-			<li>Touched shape's scene query flag is checked in CCT::move before rideOnTouchedObject is called to ensure that the touched shape is valid.</li>
-			<li>Touched shape's user CCT filtering is triggered in CCT::move before rideOnTouchedObject is called to ensure that the touched shape is valid.</li>
-			<li>The PxControllerBehaviorCallback was not called when a PxUserControllerHitReport was not defined.</li>
-			<li>It is not possible anymore to create a CCT whose contact offset is zero. The doc has always said a non-zero value was expected, but corresponding check was missing.</li>
-    		<li>Box & capsule controllers' resize function now properly take the up direction into account</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>A minor bug in contact generation between capsules and triangle meshes has been fixed, reducing the amount of tunneling cases when CCD is not used.</li>
+				<li>Discrete contact reports are no longer produced for pairs without PxPairFlag::eDETECT_DISCRETE_CONTACT raised in the filter shader. Previously, discrete contact generation would always have been performed regardless of the presence of the PxPairFlag::eDETECT_DISCRETE_CONTACT flag. This change potentially improves performance when using  specific shapes for CCD-only collision, which would have previously generated discrtete contacts and then ignored them in the solver. </li>
+				<li>Trigger reports are no longer produced for pairs without PxPairFlag::eDETECT_DISCRETE_CONTACT raised in the filter shader. PxPairFlag::eTRIGGER_DEFAULT has been modified to include the PxPairFlag::eDETECT_DISCRETE_CONTACT flag.</li>
+				<li>An incorrect PX_DEPRECATED annotation on the default constructor for PxD6JointDrive has been removed.</li>
+				<li>PxRigidDynamic::getKinematicTarget() returned a wrong transform if the actor center of mass pose was different from the actor global pose.</li>
+				<li>Switching a PxRigidDynamic from dynamic to kinematic did not properly suppress existing pairs which turned into kinematic-kinematic or kinematic-static pairs.</li>
+				<li>PxRigidDynamic::isSleeping() did not return the correct value on the frame the object got inserted if PxScene::addActors() was used and if the object was awake.</li>
+				<li>PxSceneFlag::eDISABLE_CONTACT_CACHE now correctly works on PS3/SPU.</li>
+				<li>If an object was added to the scene, put asleep and had overlap with another sleeping object then contact points for that pair might not have been reported once the object woke up.</li>
+				<li>Potential crash when calling PxScene::resetFiltering() mulitple times for the same actor while the simulation was running has been fixed.</li>
+				<li>Potential crash when using PxScene::resetFiltering() with shapes that were just added while the simulation was running has been fixed.</li>
+				<li>A crash in MBP when deleting an object that just went out of broad-phase bounds has been fixed.</li>
+				<li>A new drive mode has been added to drive articulation joints using rotation vectors.</li>
+				<li>In contact and trigger reports, the shape references in PxTriggerPair and PxContactPair might not have been properly marked as removed shapes if the removal took place while the simulation was running.</li>
+				<li>PxPairFlag::eSOLVE_CONTACT is now properly observed if the flag is not set on a contacting pair. A consequence of this fix is that sleeping actors will no longer be woken up due to contact or lost contact with awake actors if PxPairFlag::eSOLVE_CONTACT is not set for the pair.  This also affects kinematic-kinematic pairs if one kinematic of the pair moves out of contact with the other.  Behaviour is unaffected for any pair that has PxPairFlag::eSOLVE_CONTACT set.</li>
+				<li>A memory leak with buffered shape and actor removal has been fixed.  The memory leak occurred when the release of an actor's shapes was followed by the release of the actor, all in-between simulate() and fetchResults().</li>
+				<li>A bug was fixed which caused incorrect force reports to sometimes be reported.</li>
+				<li>Fixed a bug where incorrect normals were reported when using PCM contact gen.</li>
+				<li>Fixed some issues with scaled convex hulls in the PCM contact gen code path.</li>
+				<li>The accuracy of capsule collision code has been improved.</li>
+				<li>An isValid() method has been added to constraints, that is satisfied if and only if at least one actor is a dynamic body or articulation link</li>
+				<li>A check has been added to prevent constraint construction or modification that would leave the constraint invalid.</li>
+				<li>In checked builds the PxScene methods addActor(), addActors(), addAggregate() and addCollection() will warn and return if an invalid constraint would be added to the scene</li>
+			</ul>
+			<li>Deprecated:</li>
+			<ul>
+				<li>The following flags have been renamed and deprecated because the name did not properly reflect the root cause.</li>
+				<ul>
+					<li>PxContactPairHeaderFlag</li>
+					<ul>
+						<li>eDELETED_ACTOR_0 (use eREMOVED_ACTOR_0 instead)</li>
+						<li>eDELETED_ACTOR_1 (use eREMOVED_ACTOR_1 instead)</li>
+					</ul>
+					<li>PxContactPairFlag</li>
+					<ul>
+						<li>eDELETED_SHAPE_0 (use eREMOVED_SHAPE_0 instead)</li>
+						<li>eDELETED_SHAPE_1 (use eREMOVED_SHAPE_1 instead)</li>
+					</ul>
+					<li>PxTriggerPairFlag</li>
+					<ul>
+						<li>eDELETED_SHAPE_TRIGGER (use eREMOVED_SHAPE_TRIGGER instead)</li>
+						<li>eDELETED_SHAPE_OTHER (use eREMOVED_SHAPE_OTHER instead)</li>
+					</ul>
+				</ul>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>CCD</h3>
-	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>Introduced PxRigidBodyFlag::eENABLE_CCD_FRICTION to control whether friction is applied inside CCD on a given body. This is disabled by default. In general, disabling friction in CCD improves the behavior of high-speed collisions.</li>
-			<li>Introduced PxSceneDesc::ccdMaxPasses field, which controls the maximum number of CCD passes. Each CCD pass will advance each object to its next TOI. By default, we use 1 pass. Increasing the number of passes can reduced the likelihood of time being dropped inside the CCD system at the cost of extra CCD procesing overhead.</li>
-			<li>Introduced per-body value to control how far past the initial time of impact the CCD advances the simulation. Advancing further can improve fluidity at the increased risk of tunneling.</li>
-			<li>CCD now has limited support contact modification. It supports disabling response to contacts by setting max impulse to 0. It does not yet support target velocity, non-zero max impulse values or scaling mass or inertia.</li>
-			<li>Introduced new PxPairFlag eDETECT_CCD_CONTACT. This flags is used to control whether CCD performs sweep tests for a give pair. Decision over whether any collisions are responded to is made by the presence of the flag eSOLVE_CONTACT.</li>
-		</ul>
-		<li>Removed:</li>
+		<h3>Vehicles</h3>
 		<ul>
-			<li>CCD is now enabled per-body instead of per-shape. As a result, PxShapeFlag::eUSE_SWEPT_BOUNDS has been removed and replaced with PxRigidBodyFlag::eENABLE_CCD.</li>
-		</ul>
-		<li>Changed:</li>
-		<ul>
-			<li>API attributes previously named SWEPT_INTEGRATION have now been renamed 'ccd': specifically PxSceneFlag::eENABLE_SWEPT_INTEGRATION, PxSceneFlag::eSWEPT_INTEGRATION_LINEAR, PxSceneDesc::sweptIntegrationLinearSpeedFactor, PxSceneDesc::sweptIntegrationAngularSpeedFactor, PxPairFlag::eENABLE_SWEPT_INTEGRATION</li>
-			<li>PxPairFlag::eCCD_LINEAR has been deprecated. Use (PxPairFlag::eDETECT_CCD_CONTACT | PxPairFlag::eSOLVE_CONTACT) instead.</li>
-		</ul>
-		<li>Fixed:</li>
-		<ul>
-			<li>Updated CCD algorithm which improves the fluidity of motion of the CCD system while reducing the processing overhead significantly.</li>
-			<li>Contact notification is now reliable in CCD. CCD contacts are appended to the end of the contact list for a given pair so that both discrete and continuous contacts are reported</li>
-			<li>CCD contact notification now reports applied forces. These contacts will be correctly filtered by force thresholds.</li>
+			<li>Added:</li>
+			<ul>
+				<li>In profile config the functions PxVehicleUpdates, PxVehiclePostUpdates and PxVehicleSuspensionRaycasts are now tracked with profile events (provided that a PxProfileZoneManager instance was passed to PxCreatePhysics).  These profile events can be viewed in profile view in pvd, where the names of the profile events match the names of the tracked vehicle functions.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>In checked config PxVehicleDriveTank::allocate enforces the rule that only tanks with even numbers of wheels are legal and warns when this rule is broken.</li>
+				<li> In checked config PxVehicleDrive4W::allocate enforces the rule that only vehicles with 4 wheels or more are legal and warns when this rule is broken.</li>
+				<li>PxWheelQueryResult::localPose was incorrectly only set when the vehicle had a corresponding PxShape, as described by PxVehicleWheelsSimData::setWheelShapeMapping.  The localPose is now properly set independent of the mapping between wheel and shape.</li>
+				<li>Wheels resting on moving actors now properly observe the relative speed of the two actors when their relative speed is small.  This fixes a bug where at small relative speeds the velocity of the other actor was assumed to be zero.</li>
+				<li>Repx serialization failed to serialize PxVehicleWheelsSimData::setMinLongSlipDenominator, PxVehicleWheelsSimData::setSubStepCount, PxVehicleWheelsSimData::disableWheel, PxVehicleWheelsSimData::enableWheel and the number of entries in the engine torque curve.  These have now been fixed.</li>
+				<li>PxVehicleConcreteType used for vehicle serialization is now in the public API and has been moved to PxVehicleSDK.h.</li>
+				<li>Very small longitudinal and lateral slip angles could lead to numerical instabilities in some circumstances.  A threshold has been introduced to reject very small slip angles by setting them to zero when they are below the threshold.</li>
+				<li>Vehicles now account for rigid bodies that have been given a zero inertia component in order to lock rotation around the corresponding axis.</li>
+				<li>Fixed a bug where the sticky wheel constraints sometimes didn't function correctly.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Serialization</h3>
-	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>Added extension PxSerialization::isSerializable method to query whether a collection is serializable.</li>
-			<li>Added extension PxSerialization::complete which allows to prepare a collection for serialization.</li>
-			<li>Added extension PxSerialization::createNames which adds reference names to serializables.</li>
-			<li>Added extension PxCollectionExt::remove which removes all serializables of a certain type and optionally adds them to another collection.</li>
-			<li>Added extension function PxCollectionExt::releaseObjects to remove and release objects from a collection</li>
-			<li>Added class PxSerializationRegistry for handling custom serializable types.</li>
-			<li>Added PxSerialization::serializeCollectionToXml and PxSerialization::createCollectionFromXml</li>
-			<li>Included pre-built binary meta data with SDK at [path to installed PhysX SDK]/Tools/BinaryMetaData</li>
-			<li>Added class PxSerializer to provide serialization specific functionality to serializable classes.</li>
-			<li>Added classes PxSerializationContext and PxDeserializationContext to provide functionality for serialization and deserialization operations.</li>
-		</ul>
-		<li>Removed:</li>
-		<ul>
-			<li>Removed PxUserReferences class and PxPhysics::createUserReferences.</li>
-			<li>Removed RepX.h and unified serialization interfaces for xml and binary serialization.</li>
-		</ul>
-		<li>Changed:</li>
+		<h3>Cloth</h3>
 		<ul>
-			<li>PxCollection was reworked to improve reference management rendering PxUserReferences obsolete. PxCollection was also decoupled more from Serialization/Deserialization functionality. Serialization of incomplete collections fails now early at serialization as opposed to late at deserialization.</li>
-			<li>Replaced PxPhysics::createCollection with PxCreateCollection.<li>
-			<li>Replaced RepXCollection with PxCollection, and unified corresponding interfaces.</li>
-			<li>Replaced PxCollection::serialize with PxSerialization::serializeCollectionToBinary.</li>
-			<li>Replaced PxCollection::deserialize with PxSerialization::createCollectionFromBinary.</li>
-			<li>Replaced PxSerializable::collectForExport(PxCollection& c) with PxSerializer::requires. The new method works in a non-recursive way.</li>
-			<li>Replaced PxDumpMetaData with PxSerialization::dumpMetaData.</li>
-			<li>Replaced PxCollectForExportSDK with PxCollectionExt::createCollection(PxPhysics& sdk).</li>
-			<li>Replaced PxCollectForExportScene with PxCollectionExt::createCollection(PxScene& scene).</li>
-			<li>Moved PxCooking::createBinaryConverter to PxSerialization</li>
-			<li>Changed PxShape release semantics for shapes. As a consequence deserialized shapes are never autmatically released, but need to be released by the application. Exception: PxPhysics::release.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>The version number written to the fabric stream changed from PX_PHYSICS_VERSION to 1. A fabric can be created from streams written with version 3.3.0 and later until the stream format changes. Previously, the version of the producer and the consumer of the stream needed to match.</li>
+				<li>GPU cloth friction against convexes has been fixed.</li>
+				<li>A crash resulting from deleting a shape in proximity of a cloth with scene collision enabled has been fixed.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Cloth</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Scene Queries</h3>
 		<ul>
-			<li>Improved GPU cloth performance significantly with new parallel solver.</li>
-			<li>Added tether constraints, which allow minimum amount of cloth stretching even for large gravity scales. See PxClothFabric.</li>
-			<li>Added support for dynamic addition and deletion of collision primitives.  See PxCloth::addCollision* and PxCloth::removeCollision*.</li>
-			<li>Added triangle mesh collider support. See PxCloth::setCollisionTriangles. </li>
-			<li>Added support for self collision and inter-cloth collision. See PxCloth::setSelfCollision*() and PxScene::setClothInterCollision*().</li>
-			<li>Added direct access to CUDA particle data for graphics interoperability, see PxCloth::lockParticleData()</li>
-			<li>Added PxRegisterCloth to reduce binary size by stripping unused code on platforms where static linking is used.</li>
-			<li>Added methods setWakeCounter/getWakeCounter() (see the corresponding API documentation for details).</li>
+			<li>Fixed:</li>
 			<ul>
-				<li>It is illegal to call wakeUp/putToSleep/isSleeping() on a PxCloth that has not been added to a scene.</li>
+				<li>PxMeshQuery::sweep now respects PxHitFlag::eMESH_BOTH_SIDES, and supports double-sided input triangles.</li>
+				<li>PxRigidBodyExt::linearSweepSingle and PxRigidBodyExt::linearSweepMultiple now correctly use query filter data instead of simulation filter data if filter data is not provided.</li>
+				<li>The spec for raycasts whose origin is located inside a solid object (sphere, box, capsule, convex) has changed back to what it was in 3.3.0. It was accidentally changed in 3.3.1. See the manual for details.</li>
+				<li>Convex sweeps against heightfields worked only when the heightfield had the identity transform.  This has now been fixed to support arbitrary transforms again.</li>
 			</ul>
 		</ul>
-		<li>Changed:</li>
-		<ul>
-			<li>Cloth solver does not use fibers any more. See PxClothFabric for changes in fabric API.</li>
-			<li>Moved PxCooking.cookClothFabric() to extensions. See PxClothFabricCooker and PxClothFabricCreate.</li>
-			<li>PxClothMeshDesc has been moved to extensions and now supports both triangle and quad representations.  See PxClothMeshDesc.</li>
-			<li>The scaling of damping and stiffness coefficients has been separated from the solver frequency and can now be set indepedently using PxCloth::setStiffnessFrequency().</li>
-			<li>PxCloth::setInertiaScale() has been split into linear, angular, and centrifugal components.  See PxCloth::set*IntertiaScale.</li>
-			<li>Drag coefficient has been split into linear and angular coefficient. See PxCloth::setLinearDragCoefficient and PxCloth::setAngularDragCoefficient.</li>
-			<li>Renamed PxCloth::lockClothReadData() to lockParticleData().  Added support for update operations on the returned particle arrays (as an alternative to setParticles()).</li>
-			<li>PxCloth::wakeUp() does not have a parameter anymore. Use setWakeCounter() instead to set a specific value.</li>
-			<li>PxCloth::getNbCollisionSpherePairs() has been renamed to PxCloth::getNbCollisionCapsules()</li>
-		</ul>
-		<li>Fixed:</li>
+
+		<h3>Cooking</h3>
 		<ul>
-			<li>Fixed a crash bug in the clothing collision code appearing on iOS.</li>
+			<li>Added:</li>
+			<ul>
+				<li>Using PxMeshPreprocessingFlag::eFORCE_32BIT_INDICES will always cook meshes with 32-bit triangle indices.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>Convex hull cooking fix. Some input points could be ignored during cooking, fixed.</li>
+				<li>Inserted triangle meshes now respect 16 bit indices flag.</li>
+
+
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Rigid Bodies</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Geometry</h3>
 		<ul>
-			<li>The contact distance parameter for a limit is automatically estimated if not supplied in the constructor for the limit structure.</li>
-			<li>The new callback PxConstraintConnector::onOriginShift() has been introduced. It gets called for all constraints of a scene, when its origin is shifted.</li>
-			<li>New helper function PxRigidBodyExt::computeVelocityDeltaFromImpulse has been added.</li>
-			<li>Shapes may be declared as shared on creation, and then attached to multiple actors, see the user manual for restrictions.</li>
-			<li>Since a shape is no longer necessarily associated with a unique actor, references to shapes in callbacks from the engine are accompanied by the
-			a reference to the associated actor</li>
-			<li>Joints and contact modification now support tuning relative mass and inertia for the bodies on a per-contact basis. Inertia and mass can be tuned independently.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>PxHeightFieldDesc::thickness is now limited to [-PX_MAX_BOUNDS_EXTENTS, PX_MAX_BOUNDS_EXTENTS range]. (Previously unbounded).</li>
+			</ul>
 		</ul>
-		<li>Removed:</li>
+
+		<h3>Particles</h3>
 		<ul>
-			<li>PxShape::overlap(), PxShape::sweep() and PxShape::raycast() have been removed. Equivalent functionality is provided in PxGeometryQuery.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>Setting PxParticleReadDataFlag::eREST_OFFSET_BUFFER on a PxParticleBase instance that was not created with the per particle rest offset option (see PxPhysics::createParticleSystem, PxPhysics::createParticleFluid and PxParticleBaseFlag::ePER_PARTICLE_REST_OFFSET) is not supported. The unsupported configuration may have resulted in crashes. The SDK now rejects this configuration on calling PxParticleBase::setParticleBaseFlag and issues an appropriate warning to the error stream.</li>
+				<li>Performance improvements on Kepler and above GPUs running SPH.</li>
+				<li>In rare cases particle systems could access released memory when all interactions with a rigid body shape were lost.</li>
+			</ul>
 		</ul>
-		<li>Changed:</li>
+
+		<h3>Serialization</h3>
 		<ul>
-			<li>It is illegal to call resetFiltering() on a PxShape or PxParticleBase if they have not been added to a scene.</li>
-			<li>It is illegal to call addForce/addTorque/clearForce/clearTorque() on a PxRigidBody that has not been added to a scene.</li>
-			<li>The sleep behavior of dynamic rigid bodies has changed significantly (for details on the current behavior see the API documentation of isSleeping(), wakeUp(), putToSleep(), setKinematicTarget(), PxRigidBodyFlag::eKINEMATIC, ...). Among the changes are:
-			<ul>
-				<li>The methods setWakeCounter/getWakeCounter() have been added for PxRigidDynamic and PxArticulation objects (see the corresponding API documentation for details).</li>
-				<li>The wakeUp() method of PxRigidDynamic and PxArticulation has lost the wake counter parameter. Use setWakeCounter() instead to set a specific value.</li>
-				<li>It is illegal to call wakeUp/putToSleep/isSleeping() on a PxRigidDynamic or PxArticulation that has not been added to a scene.</li>
-				<li>Putting a dynamic rigid actor to sleep will clear any pending force updates.</li>
-				<li>Switching a dynamic actor to kinematic will put the actor to sleep immediately.</li>
-				<li>Switching a kinematic actor back to dynamic will not affect the sleep state (previously the actor was woken up).</li>
-				<li>Calling wakeUp/putToSleep() on a kinematically controlled dynamic actor is not valid any longer. The sleep state of a kinematic actor is solely defined based on whether a target pose has been set (see API documentation of isSleeping() for details).</li>
-				<li>A call to PxRigidBody::setCMassLocalPose() does not wake up the actor anymore. Note: this also affects related methods in PhysXExtensions like PxRigidBodyExt::updateMassAndInertia() etc.</li>
-				<li>If a non-zero velocity or force is set through PxRigidBody::setLinearVelocity(), ::setAngularVelocity(), ::addForce() or ::addTorque(), the actor will get woken up automatically even if the autowake parameter is false.</li>
-				<li>PxRigidBody::clearForce() and ::clearTorque() do not have the autowake parameter, to optionally wake the actor up, anymore. These methods will not change the sleep state any longer. Call ::wakeUp() subsequently to get the old default behavior.</li>
-				<li>Adding or removing a PxConstraint/PxJoint to/from the scene does not wake the connected actors up automatically anymore.</li>
-				<li>It is now possible to avoid automatic wake up of previously touching objects on scene removal. See the additional parameter wakeOnLostTouch in PxScene::removeActor(), ::removeArticulation(), ::removeAggregte(), PxRigidActor::detachShape().</li>
-			</ul>
-			</li>
-			<li>PxJointLimit and PxJointLimitPair are now PxJointLinearLimit, PxJointLinearLimitPair, PxJointAngularLimitPair, depending on whether the limit is linear or angular.</li>
-			<li>Joints now solve for the entire position error rather than a ratio of 0.7 of it. The flag PxConstraintFlag::eDEPRECATED_32_COMPATIBILITY can be used to restore this behavior</li>
-			<li>PxConstraintFlag::Type has been renamed to PxConstraintFlag::Enum<li>
-			<li>The spring constant parameter in joints and articulations that was previously 'spring' is now 'stiffness'.</li>
-			<li>The tangential spring constant parameter in articulations that was previously 'tangentialSpring' is now 'tangentialStiffness'.</li>
-			<li>Constraints do not respect PxDominanceGroup settings. Use PxJoint::setInvMassScale and setInvInertiaScale</li>
-			<li>Shapes are reference counted. PxShape::release() now decrements the reference count on a shape, and its use is deprecated for detaching a shape from its actor - use detachShape() instead.</li>
-			<li>Shape creation methods do not take a local transform parameter anymore. Instead PxShapeFlags can be specified. Triangle meshes, height fields and plane geometry shapes cannot be combined with non-kinematic PxRigidDynmic actors if PxShapeFlag::eSIMULATION_SHAPE is specified. Corresponding calls to PxRigidActor::createShape() or PxRigidActor::attachShape() are not supported.</li>
-			<li>PxShape::getActor() now returns a pointer, which is NULL if the shape is shareable.</li>
-			<li>PxShape::getWorldBounds() has been replaced with PxShapeExt::getWorldBounds().</li>
-            <li>PxContactPoint has been renamed PxFeatureContact.</li>
-            <li>The internal format for contact storage has been modified; applications directly accessing the internal contact representation rather than PxContactPair::extractContacts should be modified accordingly.</li>
-			<li>Friction mode flags eENABLE_ONE_DIRECTIONAL_FRICTION and eENABLE_TWO_DIRECTIONAL_FRICTION have been replaced by PxFrictionType::Enum PxSceneDesc::frictionType.</li>
-			<li>PxSceneDesc::contactCorrelationDistance has been deprecated.</li>
-			<li>PxSceneDesc::contactCorrelationDistance no longer has an influence on how many friction anchors are created in a single frame, only on when they are removed in later frames.  This may cause a very minor change in friction behavior.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>PxBinaryConverter::convert potentially failed in checked mode with allocators that don't set 0xcd pattern. This has been fixed now.</li>
+			</ul>
 		</ul>
-		<li>Fixed:</li>
+
+	</blockquote>
+
+
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3.1</h1>
+	<h2 style="TEXT-ALIGN: center">December 2013</h2>
+
+	<h2>Supported Platforms</h2>
+	<blockquote>
+		<h3>Runtime</h3>
 		<ul>
-			<li>Rigid bodies now properly accumulate the forces/torques that are applied with addForce/addTorque while scene insertion is still pending.  This affects bodies added to the scene while the scene is simulating and then given forces/torques with addForce/addTorque while the scene is simulating.  These accumulated forces and torques are applied during the next simulate() call.  Prior to this fix the forces/torques that accumulated while scene insertion was pending were lost and never applied.</li>
-			<li>Its now possible to serialize collections with jointed actors without including the corresponding joints in the collection. The deserialized actors will not be jointed anymore.</li>
-			<li>Joint drive force limits are actual force limits rather than impulse limits. Set the flag PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES to false to support legacy behavior</li>
-			<li>Angular drive constraints for revolute joints were wrongly computed, resulting in a negative angular velocity when a positive drive target was set.</li>
-			<li>Scene addActors will now correctly remove all actors that were passed to add, if some insert failed.</li>
-			<li>Contact modification now enforces max impulse field correctly. Previously, it only enforced it if max impulse was set to 0.</li>
-			<li>Contact modification now supports target velocity in all directions. Previously, it only enforced the target velocity components that were perpendicular to the contact normal.</li>
-			<li>Jittering of small spheres & capsules on very large triangles has been fixed.</li>
-			<li>Setting sleep threshold to 0 now guarantees that bodies won't fall asleep even if their kinetic energy reaches exactly 0.</li>
+			<li>Apple iOS</li>
+			<li>Apple Mac OS X</li>
+			<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
+			<li>Linux (tested on Ubuntu)</li>
+			<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
+			<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
+			<li>Microsoft XBox One</li>
+			<li>Microsoft XBox 360</li>
+			<li>Nintendo Wii U</li>
+			<li>Sony Playstation 3</li>
+			<li>Sony Playstation 4 (SDK only, no samples)</li>
+			<li>Sony Playstation Vita</li>
 		</ul>
-		<li>Deprecated:</li>
+		<h3>Development</h3>
 		<ul>
-			<li>PxShape::release() now decrements the reference count on a shape, and its use is deprecated for detaching a shape from its actor - use detachShape() instead.</li>
-			<li>PxJoint::getType() is deprecated - use getConcreteType() instead.</li>
-			<li>PxConstraintFlag::eREPORTING is deprecated - constraints always generate force reports</li>
-			<li>PxConstraintDominance is deprecated - use PxDominanceGroupPair instead.</li>
+			<li>Microsoft Windows XP or later</li>
+			<li>Microsoft Visual Studio 2008, 2010, 2012 (Windows RT only)</li>
+			<li>Xcode 4.6|5.0|5.0.1</li>
 		</ul>
-	</ul>
+	</blockquote>
+	<h2>Known Issues</h2>
+	<blockquote>
+		<ul></ul>
+	</blockquote>
+	<h2>Changes and Resolved Issues</h2>
+	<blockquote>
 
-	<h3>Scene queries</h3>
-	<ul>
-		<li>Added:</li>
-		<ul>
-			<li>PxVolumeCache, a volumetric cache for local collision geometry.</li>
-			<li>Parameter inflation was added to some PxGeometryQuery functions.</li>
-			<li>Flag eMESH_BOTH_SIDES can be now used to control triangle mesh culling for raycasts and sweeps.</li>
-			<li>Added PxBatchQueryMemory as a part of PxBatchQueryDesc, to allow memory buffers to be set before execution.</li>
-			<li>PxBatchQueryDesc() constructor now requires 3 parameters at initialization - see migration guide for more details.</li>
-			<li>PxBatchQuery::raycast, sweep, overlap calls will now issue a warning and discard the query when over maximum allowed queries.</li>
-			<li>There is a new flag to allow manually updating the scene query representation, see: PxSceneFlags::eENABLE_MANUAL_SQ_UPDATE and PxScene::flushQueryChanges().</li>
-			<li>Added PxScene::forceDynamicTreeRebuild() function to immediately rebuild the scene query structures.</li>
-			<li>Added bool PxSweepHit::hadInitialOverlap() returning true if a sweep hit occurred early due to initial overlap at distance=0.</li>
-		</ul>
-		<li>Removed:</li>
+		<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
+
+		<h3>General</h3>
 		<ul>
-			<li>PxBatchQuery::linearCompoundGeometrySweepSingle and PxBatchQuery::linearCompoundGeometrySweepMultiple functions are no longer supported.</li>
-			<li>Globals PxPS3ConfigParam::eSPU_OVERLAP, eSPU_RAYCAST, eSPU_SWEEP that were previous set via setSceneParamInt call are replaced with PxBatchQueryDesc::runOnSpu. See migration guide for more details.</li>
+			<li>Added:</li>
+			<ul>
+				<li>The friction model can now be changed after scene instantiation with PxScene::setFrictionType.  The friction model can also be queried with PxScene::getFrictionType.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>PxDefaultSimulationFilterShader now supports particles and cloth as well.</li>
+				<li>PxSimulationFilterCallback: the provided actor and shape pointers are now defined as const. Note: this is no behavior change, it was never allowed to write to those objects from within the callback.</li>
+				<li>The PxTriangleMeshFlag::eHAS_16BIT_TRIANGLE_INDICES and PxTriangleMeshFlag::eHAS_ADJACENCY_INFO enums have been deprecated. Please use PxTriangleMeshFlag::e16_BIT_INDICES and PxTriangleMeshFlag::eADJACENCY_INFO instead.</li>
+				<li>Removed following functions from the API for platforms which do not support CUDA: PxGetSuggestedCudaDeviceOrdinal, PxCreateCudaContextManager, PxLoadPhysxGPUModule.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>Fixed concurrency issue on windows. Calling PxScene::simulate on multiple scenes concurrently may have caused a deadlock. This only happened if the scenes shared a single PxCpuDispatcher and the dispatcher was configured to use one worker thread only.</li>
+			</ul>
 		</ul>
-		<li>Changed:</li>
+
+		<h3>Rigid Bodies</h3>
 		<ul>
-			<li>Scene Query raycastAny/Single/Multiple APIs were merged into a single raycast() call (same for overlaps and sweeps). Please refer to user and migration guides for details.</li>
-			<li>Scene Query raycast() API now uses a PxRaycastBuffer or a PxRaycastCallback parameter for reporting hits. Blocking hits are now reported separately from toching and PxRaycastCallback class supports reporting an unbounded number of results (same for overlaps and sweeps).</li>
-			<li>A const templated PxRaycastBufferN<int> object was added to allow convenient creation of fixed size scene query touch buffers. Same for overlaps and sweeps.</li>
-			<li>Support for compound sweeps was moved out from core SDK to extensions.</li>
-			<li>Support for compound sweeps was moved from PxScene to extensions (see PxRigidBodyExt::linearSweepSingle, PxRigidBodyExt::linearSweepMultiple).</li>
-			<li>PxQueryFilterCallback::preFilter now passes an actor pointer as well as a shape pointer.</li>
-			<li>PxSceneQueryFlag::eINITIAL_OVERLAP and PxSceneQueryFlag::eINITIAL_OVERLAP_KEEP have been replaced with PxHitFlag::eINITIAL_OVERLAP_DISABLE and PxLocationHit::hadInitialOverlap(). Note that checking for initial overlap is now the defaut for sweeps.</li>
-			<li>Sweeps in 3.3 execute using a new faster code path, in some cases with reduced precision. If you encounter precision issues not previously experienced in earlier versions of PhysX, use ePRECISE_SWEEP flag to enable the backwards compatible more accurate sweep code.</li>
-			<li>The default behavior for overlap queries with query filters returning eBLOCK has changed to only return one of eBLOCK hits. Please refer to the migration guide for details.</li>
+			<li>Added:</li>
+			<ul>
+				<li>The projection direction for constraints can now be specified through the flags PxConstraintFlag::ePROJECT_TO_ACTOR0, ::ePROJECT_TO_ACTOR1.</li>
+				<li>A parameter has been added to PxRigidBodyExt::updateMassAndInertia() and ::setMassAndUpdateInertia() to optionally take non-simulation shapes into account for computing the mass and the inertia tensor of a rigid body.</li>
+				<li>It is now possible to retrieve additional information in contact reports. See the API documentation of PxContactPairHeader.extraDataStream, PxPairFlag::ePRE_SOLVER_VELOCITY, ::ePOST_SOLVER_VELOCITY, ::eCONTACT_EVENT_POSE for details.</li>
+				<li>The contact report system has been extended to support multiple notification events if the same two objects collide multiple times in multipass CCD scenarios. See the API documentation of PxPairFlag::eNOTIFY_TOUCH_CCD for details.</li>
+			</ul>
+
+			<li>Changed:</li>
+			<ul>
+				<li>If touching objects were added to the scene asleep and one of them got woken up, then all contact pairs of the touching objects which contained a static rigid body were resolved with a delay of one simulation step. Now these pairs get resolved without delay in the next simulation step.</li>
+				<li>If touching objects were added to the scene asleep, eNOTIFY_TOUCH_FOUND contact reports were sent out for pairs of dynamic rigid bodies if requested. These reports will not be sent at the end of the simulation step after insertion anymore but rather at the end of the simulation step after the touching objects have been woken up.</li>
+				<li>Rigid bodies now permit zeroes in passed to setMass and setMassSpaceInertiaTensor. Zeroes are interpreted to indicate infinite mass or infinite moment of inertia around a given principal axis of inertia. Previously, zeroes were illegal values to these methods. Note that zeroes are still illegal for instances of PxArticulationLink.</li>
+			</ul>
+
+			<li>Fixed:</li>
+			<ul>
+				<li>Reading back the kinematic target in the PxSimulationEventCallback::onContact() callback through PxRigidDynamic::getKinematicTarget() will now work.</li>
+				<li>Contact reports are no longer generated for contact pairs involving two sleeping kinematic actors or for pairs involving a sleeping kinematic actor in contact with a static actor.  This fixes a bug that was introduced in 3.3.0. </li>
+				<li>No PxPairFlag::eNOTIFY_TOUCH_LOST event was sent in contact reports if a pair of sleeping rigid bodies got woken up after setting the pose on one of them (with autowake parameter set to false) and if the bounding boxes of the two objects still overlapped.</li>
+				<li>No PxPairFlag::eNOTIFY_TOUCH_PERSISTS event was sent in contact reports during the first simulation step after a pair of sleeping rigid bodies got woken up.</li>
+				<li>The inertia tensor computation for convex meshes has been adjusted to be more stable in certain cases where floating point precision issues arise. Furthermore, a fallback routine has been added to use an approximation if the diagonalized inertia tensor still ends up with invalid entries.</li>
+				<li>PxRigidBody::clearForce() and ::clearTorque() did not properly clear the respective properties if used with force mode PxForceMode::eIMPULES or PxForceMode::eVELOCITY_CHANGE.</li>
+				<li>Setting PxSceneFlag::eENABLE_KINEMATIC_STATIC_PAIRS also enabled PxSceneFlag::eENABLE_KINEMATIC_PAIRS internally and vice versa.</li>
+				<li>Missing validation checks for some joint set() methods have been added. Similarly to other API calls, when validation fails in the checked build PhysX will report an error and return without updating the joint.</li>
+				<li>Switching a kinematic rigid body to dynamic could lead to a crash in a subsequent simulation step, if the kinematic was moved and connected to another kinematic through a breakable PxConstraint/PxJoint.</li>
+				<li>Deleting a breakable PxConstraint/PxJoint while the simulation is running could lead to a crash if the PxConstraint/PxJoint broke in the same simulation step.</li>
+				<li>A bug in the PxScene::addBroadPhaseRegion() function, that could lead to a crash when using 'populateRegion=true', has been fixed.</li>
+			</ul>
 		</ul>
-		<li>Fixed:</li>
+
+		<h3>Particles</h3>
 		<ul>
-			<li>Scene Query performance was significantly improved in a variety of scenarios.</li>
-			<li>Fixed a bug in capsule/mesh overlap code that occasionally caused unreported and misreported faces.</li>
-			<li>Fixed a crash in raycastMultiple when a convex was hit by the ray and eMESH_MULTIPLE flag was specified as a query flag.</li>
-			<li>Fixed a rare crash in heightfield raycast code.</li>
-			<li>Internal limit of 65536 results has been removed.</li>
-			<li>Accuracy in sphere/capsule-vs-triangle overlap and sweep tests has been improved.</li>
+			<li>Added:</li>
+			<ul>
+				<li>Added triangle mesh cache statistics for GPU particles.  Triangle mesh cache statistics are also captured by PVD as part of simulation statistics.</li>
+				<li>Added new option to query approximate particle velocities relative to colliding rigid actors.  This can be used for debris rotation on moving objects.  Enable with PxParticleReadDataFlag::eCOLLISION_VELOCITY_BUFFER and read from PxParticleReadData::collisionVelocityBuffer.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>Fixed a bug which might lead to GPU particle pipeline failures on low end GPUs.</li>
+				<li>Enabled warning when a spatial data structure overflow occured for GPU particles (see the guide for more information).</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Cooking</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Cloth</h3>
 		<ul>
-			<li>Added support for convex hull creation with limited output vertices count.</li>
-			<li>Added support for convex hull creation directly from polygons rather than triangles.</li>
-			<li>Added support function computeHullPolygons in cooking, that creates hull polygons from given vertices and triangles. The resulting polygons can be used to create the convex hull directly.</li>
+			<li>Fixed:</li>
+			<ul>
+				<li>PxFilterFlag::eSUPPRESS was ignored for collision pairs that involved a PxCloth object. This does work now, however, please note that PxFilterFlag::eSUPPRESS is treated as PxFilterFlag::eKILL for pairs with a PxCloth object.</li>
+			</ul>
 		</ul>
-		<li>Changed:</li>
+
+		<h3>Serialization</h3>
 		<ul>
-			<li>Changed convex hull volume integrals.</li>
-			<li>PxCookingParams constructor requires now PxTolerancesScale as an additional parameter. This enables us to perform further checks on the triangles during cooking. A warning will be emitted to the error stream if too huge triangles were found. This will ensure better simulation stability.</li>
+			<li>Added:</li>
+			<ul>
+				<li>Support for binary compatibility between different sdk releases and patches has been added (PX_BINARY_SERIAL_VERSION). The current sdk version can load binary data of the older sdk versions listed in the documentation of PX_BINARY_SERIAL_VERSION. </li>
+				<li>SnippetLoadCollection has been added. It illustrates loading repx or binary serialized collections and instatiating the objects in a scene. It only compiles and runs on authoring platforms (windows, osx and linux). </li>
+				<li>SnippetConvert has been added. It illustrates how to convert PhysX 3 serialized binary files from one platform to another. It only compiles and runs on authoring platforms (windows, osx and linux). </li>
+			</ul>
+			<li>Deprecated:</li>
+			<ul>
+				<li>Method PxCollection::addRequire is deprecated, use PxCollection::add and PxCollection::contains instead. </li>
+				<li>Method PxCollection::createBinaryConverter(PxSerializationRegistry&) is deprecated, use PxCollection::createBinaryConverter() instead. </li>
+			</ul>
 		</ul>
-		<li>Fixed:</li>
+
+		<h3>Character controller</h3>
 		<ul>
-			<li>Optimized heightfield load code for no-endian conversion case.</li>
+			<li>Added:</li>
+			<ul>
+				<li>PxControllerManager::setPreventVerticalSlidingAgainstCeiling() has been added, to control the behaviour of characters against ceilings.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Triangle meshes</h3>
-	<ul>
-		<li>Added:</li>
+
+		<h3>Vehicles</h3>
 		<ul>
-			<li>Added PxTriangleMeshFlag::eHAS_ADJACENCY_INFO flag for adjacency information checks.</li>
+			<li>Added:</li>
+			<ul>
+				<li>Vehicles may now be updated concurrently through the addition of a new function PxVehiclePostUpdates and passing a PxVehicleConcurrentUpdateData array to PxVehicleupdates.</li>
+				<li>A new snippet SnippetVehicleMultiThreading has been added to show the operation of concurrent vehicle updates.</li>
+				<li>PxVehicleDriveTankControl and PxVehicleDriveTankControlModel now have improved doxy comments.</li>
+				<li>A new function PxVehicleUpdateCMassLocalPose has been added to help update a vehicle after the center of mass pose of the vehicle's actor has been modified.</li>
+				<li>PxWheelQueryResult now records the local pose of the wheel.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>PxVehcicleDrive4W::setup now tests that at least 4 wheels are specified and returns wtih an error message if numWheels < 4.  It is only possible to create a PxVehicleDrive4W instance with less than 4 active wheels by disabling wheels after instantiating a 4-wheeled car.</li>
+				<li>In debug and checked build configurations PxVehicleComputeSprungMasses now reports whether the sprung masses were successfully computed.   Warnings are passed to the error stream in checked configuration if the function does not complete successfully.  Apart from error checking the operation of the function is unchanged.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>The doxy comment describing the default setting for PxVehicleWheelsSimData::setWheelShapeMapping was incorrect.  This now correctly documents the default mapping as PxVehicleWheelsSimData::setWheelShapeMapping(i,i).</li>
+				<li>Suspensions raycasts that start inside geometry are ignored for all geometry types.  Prior to this release this was true for all geometry types except for heightfields and triangle meshes.  This inconsistency has now been fixed so that all geometry types obey the rule that suspension raycasts starting inside geometry are neglected.</li>
+			</ul>
 		</ul>
-		<li>Removed:</li>
+
+		<h3>Scene queries</h3>
 		<ul>
-			<li>Removed has16BitTriangleIndices(), replaced by triangleMesh->getTriangleMeshFlags() & PxTriangleMeshFlag::eHAS_16BIT_TRIANGLE_INDICES.</li>
+			<li>Added:</li>
+			<ul>
+				<li>Added eMTD flag. If an initial overlap is detected, this flag triggers the sweep to compute the MTD (Minimum Translation Direction), which can be used to de-penetrate the query shape from the shape with which an initial overlap was found. In this case, the distance reported will be negative. This negative distance can be used to scale the reported normal to generate the translation vector required to de-penetrate the query shape. </li>
+				<li>Added PxTriangle::pointFromUV.</li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>A rare ray-capsule intersection bug has been fixed, when the capsule's height is close to zero.</li>
+				<li>A capsule-capsule distance bug has been fixed, when the tested capsules are large and parallel.</li>
+				<li>Raycasts against heightfields now correctly return triangle UVs.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>
+					PxBatchQuery::raycast, overlap and sweep previously had an incorrect const modifier indicating that these methods were safe to call from multiple threads simultaneously. This has been removed.
+					Multiple batch queries can still be executed (via PxBatchQuery::execute()) in parallel.
+				</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Particles</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Cooking</h3>
 		<ul>
-			<li>Direct read access to CUDA particle data has been added for graphics interoperability, see PxParticleBase::lockParticleReadData(PxDataAccessFlags flags) and PxParticleFluid::lockParticleFluidReadData(PxDataAccessFlags flags)</li>
-			<li>Added PxRegisterParticles to reduce binary size by stipping unused code on platforms where static linking is used.</li>
-			<li>Added setExplicitCudaFlushCountHint to allow early flushing of the cuda push buffer.</li>
-			<li>Added caching of triangle meshes. setTriangleMeshCacheSizeHint is supported on Kepler and above GPUs.</li>
+			<li>Added:</li>
+			<ul>
+				<li>PxCookingParams::meshSizePerformanceTradeOff parameter can be used to make the mesh smaller at the expense of reduced simulation and scene query performance (or the other way around).</li>
+				<li>PxCookingParams::meshCookingHint parameter can be used to specify mesh hierarchy construction preference (cooking speed or simulation speed).</li>
+				<li>PxMeshPreprocessingFlag::eDISABLE_CLEAN_MESH disables mesh clean proces. Vertices duplicities are not searched, huge triangles test is not done. Vertices welding is not done. Does speed up the cooking.</li>
+				<li>PxMeshPreprocessingFlag::eDISABLE_ACTIVE_EDGES_PRECOMPUTE disables vertex edge precomputation. Makes cooking faster but slow up contact generation.</li>
+				<li>PxPhysicsInsertionCallback adds the support for inserting cooked triangle mesh or height field directly into PxPhysics without the stream serialization.</li>
+				<li>PxCooking::createTriangleMesh creates triangle mesh and inserts it to PxPhysics without using the stream serialization.</li>
+				<li>PxCooking::createHeightField creates height field and inserts it to PxPhysics without using the stream serialization.</li>
+				<li>PxCooking::validateTriangleMesh validates mesh in separate function before it can be cooked without the mesh cleaning.</li>
+				<li>PxConvexFlag::eCHECK_ZERO_AREA_TRIANGLES checks and removes almost zero-area triangles during the computation of the convex hull.</li>
+				<li>PxCookingParams::areaTestEpsilon triangle area size was added. This epsilon is used for the zero-area test in the computation of the convex hull.</li>
+			</ul>
+			<li>Changed:</li>
+			<ul>
+				<li>PxCooking::computeHullPolygons does now return also vertices used by the polygons. Redundant vertices are removed.</li>
+				<li>PxCooking::cookConvexMesh now returns a PxConvexMeshCookingResult::Enum with additional error information.</li>
+			</ul>
 		</ul>
-		<li>Fixed:</li>
+
+		<h3>Aggregates</h3>
 		<ul>
-			<li>Creating and immediately setting the position of particles failed and possibly crashed with GPU acceleration enabled. This would only happen after one or more simulation updates.</li>
-			<li>Creating many spread out particles might have crashed the GPU pipeline.</li>
+			<li>Added:</li>
+			<ul>
+				<li>PxSceneLimits has a new member variable maxNbAggregates.  Setting this value to a good approximation of the peak number of aggregates will avoid the need for internal run-time allocations that can occur when aggregates are added to the scene. </li>
+			</ul>
+			<li>Fixed:</li>
+			<ul>
+				<li>PxScene::removeActor will auotmatically remove that actor from all PxAggregate instances that contain the removed actor.  Likewise, PxScene::removeArticulation will automatically remove all articulation links from all relevant aggregates.  This fix upholds the rule that all actors of an aggregate must be in same scene.</li>
+				<li>The condition of an internal assert that triggered after calling PxScene::addAggregate has been corrected. This assert triggered when an aggregate was added to the scene after removal of all aggregates from the scene.  The operation of the function PxScene::addAggregate is unchanged apart from the asserted condtition.</li>
+			</ul>
 		</ul>
-	</ul>
 
-	<h3>Broad Phase</h3>
-	<ul>
-		<li>Added:</li>
+		<h3>Samples</h3>
 		<ul>
-			<li>The SDK now supports multiple broad-phase algorithms (SAP & MBP). See Release Highlights, PxBroadPhaseType and PxBroadPhaseDesc.</li>
-			<li>PxVisualizationParameter::eMBP_REGIONS has been added to visualize MBP regions</li>
+			<li>Changed:</li>
+			<ul>
+				<li>Starting with Release 302 drivers, application developers can direct the Optimus driver at runtime to use the High Performance Graphics to render any application - even those applications for which there is no existing application profile. The samples now make use of this feature to enable High Performance Graphics by default.</li>
+			</ul>
 		</ul>
-		<li>Fixed:</li>
+	</blockquote>
+
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+	<br>
+	<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+
+	<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.3</h1>
+	<h2 style="TEXT-ALIGN: center">September 2013</h2>
+
+	<h2>Release Highlights</h2>
+	<blockquote>
 		<ul>
-			<li>The sap broadphase now gracefully handles PxShape instances whose axis-aligned bounding boxes have min/max limits with value  +/- PX_MAX_F32 or QNAN or INF. Such bounds values can occur if a PxShape is given a global transform that is either illegal or close to the upper limit of the floating point range. Prior to this release, the sap broadphase could crash when the axis-aligned bounds of shapes had values that weren't within the floating point range.  This has now been fixed.  The overlap pairs reported by the broadphase for bounds with such values is undefined.</li>
+			<li>Added PhysXDevice/64.dll to the PC packages. See the Windows readme for details.</li>
+			<li>Added support for the NVIDIA Kepler GPU architecture.</li>
+			<li>Added support for the Nintendo Wii U console.</li>
+			<li>Added support for Windows 8 Modern UI applications (ARM and x86).</li>
+			<li>Ported our SIMD library to the ARM NEON architecture.</li>
+			<li>Multi Box Pruning (MBP) is offered as an alternative broad phase algorithm to Sweep And Prune (SAP). MBP shows improved performance when most objects are moving or when inserting large numbers of objects. SAP can be faster in scenarios with few moving (many sleeping) objects.</li>
+			<li>Significant performance and stability optimizations for rigid body solver.</li>
+			<li>New function to compute the minimum translational distance and direction to separate two overlapping geometry objects.</li>
+			<li>New 'PCM' contact generation mode which is often faster and more robust than the still available legacy path.</li>
+			<li>Improved performance of scene queries and contact reports.</li>
+			<li>Improved behavior and performance of Continuous Collision Detection (CCD).</li>
+			<li>Reduced memory footprint of rigid body classes.</li>
+			<li>Added support for sharing shapes among rigid bodies.</li>
+			<li>Significantly improved cloth behavior and GPU performance.</li>
+			<li>Added support for cloth colliding against itself, other cloth instances, and scene geometry.</li>
+			<li>Improved useability of binary and xml serialization.</li>
+			<li>Memory can be saved for objects that do not participate in the simulation and are used for scene queries only. For details see the new flag PxActorFlag::eDISABLE_SIMULATION.</li>
 		</ul>
-	</ul>
+	</blockquote>
 
-	<h3>Vehicles</h3>
-	<ul>
-		<li>Added:</li>
+	<h2>Supported Platforms</h2>
+	<blockquote>
+		<h3>Runtime</h3>
 		<ul>
-			<li>Vehicles now support serialization. A PxSerializationRegistry instance may be passed into PxInitVehicleSdk and PxCloseVehicleSdk in order to enable support.</li>
-			<li>Vehicle telemetry graphs can now be queried per channel for the most recent value with the function PxVehicleGraph::getLatestValue.</li>
-			<li>New vehicle PxVehicleDriveNW type has been introduced.  This class makes use of a new differential type PxVehicleDifferentialNW, which allows specified wheels to be equally coupled to the differential, and allows all for some or all of the N wheels to be driven. </li>
-			<li>Support for camber angles has been added to the PxVehicleSuspensionData class.</li>
-			<li>Moment of inertia parameter has been added to the PxVehicleEngineData class. Prior to this a value of 1.0 was assumed internally.  A default value of 1.0 has been used in the constructor for backwards compatability.</li>
-			<li>Vehicle manual contains new section describing the conversion of default vehicle parameter values from SI units to any system of units with particular reference to the use of centimetres instead of metres .</li>
-			<li>The SI units of each parameter in PxVehicleComponents.h has been documented. </li>
-			<li>Vehicle manual contains updated troubleshooting section.</li>
-			<li>The requirements for disabled wheels (PxVehicleWheelsSimData::disableWheel) have been documented to make it clear that disabled wheels must be no longer associated with a PxShape, must have zero rotation speed, and must be decoupled from the differential.  This is also now discussed in the guide.</li>
-			<li>Wheel raycasts documentation has been improved to clarify the start and end points of each raycast.</li>
-			<li>Suspension line raycasts do not need to be performed for each vehicle prior to update with PxVehicleUpdates.  This feature is implemented with a boolean array passed as an extra function argument to PxVehicleSuspensionRaycasts.  This feature is useful for vehicles that require only low level of detail.</li>
-			<li>The clutch model now supports two accuracy modes (PxVehicleClutchAccuracyMode::eESTIMATE and PxVehicleClutchAccuracyMode::eBEST_POSSIBLE).  If the estimate mode is chosen the computational cost and accuracy of the clutch can be tuned with PxVehicleClutchData::mEstimateIterations.</li>
-			<li>PxVehicleSuspensionData now has a function setMassAndPreserveNaturalFrequency.  This modifies the mass and stiffness of the spring in a way that preserves the spring natural frequency.</li>
-			<li>A new helper function PxVehicleCopyDynamicsData has been added that allows dynamics data such as engine rotation speed, wheel rotation speed, gear etc to be copied from one vehicle to another of the same type.  This is particularly useful if a vehicle has a number of different versions where each represents a different level of detail.</li>
-			<li>A new function PxVehicleWheelsSimData::copy has been added to allow per wheel dynamics data to be copied from one vehicle to another.</li>
-			<li>The vehicle manual contains a new section "Level of Detail" describing the available options for vehicles that require only a low level of detail.</li>
-			<li>PxVehicleTireLoadFilterData now requires that mMinNormalisedLoad is greater than or equal to zero.</li>
-			<li>PxVehicleTireLoadFilterData now has a new member variable mMinFilteredNormalisedLoad.  This value describes the filtered normalised load that occurs when the normalised is less than or equal to mMinNormalisedLoad.</li>
-			<li>PxVehicleWheelsSimData now has a new function setMinLongSlipDenominator.  This can be used to tune stability issues that can arise when the vehicle slows down in the absence of brake and drive torques.</li>
-			<li>A new section "PxVehicleAutoBoxData" has been added to the vehicle tuning guide to describe operation of the automatic gearbox.</li>
-			<li>A new section "The Vehicle Under-steers Then Over-steers" has been added to the vehicle troubleshooting guide to describe steps to avoid twitchy handling on bumpy surfaces.</li>
-			</li>A new section "The Vehicle Never Goes Beyond First Gear" has been added to the vehicle troubleshooting guide to describe a common scenario that occurs when the automatic gearbox is given a latency time that is shorter than the time taken to complete a gear change.</li>
-			<li>A new section "The Vehicle Slows Down Unnaturally" has been added to the vehicle troubleshooting guide to describe the steps that can be taken to help the vehicle slow down more smoothly.</li>
-			<li>A number of vehicle snippets have been added.</li>
+			<li>Apple iOS</li>
+			<li>Apple Mac OS X</li>
+			<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
+			<li>Linux (tested on Ubuntu)</li>
+			<li>Microsoft Windows XP or later (NVIDIA Driver version R304 or later is required for GPU acceleration)</li>
+			<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
+			<li>Microsoft XBox One</li>
+			<li>Microsoft XBox 360</li>
+			<li>Nintendo Wii U</li>
+			<li>Sony Playstation 3</li>
+			<li>Sony Playstation 4 (SDK only, no samples yet)</li>
+			<li>Sony Playstation Vita</li>
 		</ul>
-		<li>Changed:</li>
+		<h3>Development</h3>
 		<ul>
-			<li>Minor api change for consistency: PxVehicleDrive4WWheelOrder has been introduced to replace the enum beginning with PxVehicleDrive4W::eFRONT_LEFT_WHEEL.</li>
-			<li>Minor api change for consistency: PxVehicleDrive4WControl has been introduced to replace the enum beginning with PxVehicleDrive4WControl::eANALOG_INPUT_ACCEL.</li>
-			<li>Minor api change for consistency: PxVehicleDriveTankWheelOrder has been introduced to replace the enum beginning with PxVehicleDriveTankWheelOrder::eFRONT_LEFT.</li>
-			<li>Minor api change for consistency: PxVehicleDriveTankControl has been introduced to replace the enum beginning with PxVehicleDriveTank::eANALOG_INPUT_ACCEL.</li>
-			<li>Minor api change for consistency: PxVehicleDriveTankControlModel has been introduced to replace the enum beginning with PxVehicleDriveTank::eDRIVE_MODEL_STANDARD.</li>
-			<li>Minor api change for consistency: PxVehicleTypes has been introduced to replace the enum beginning with eVEHICLE_TYPE_DRIVE4W.</li>
-			<li>Minor api change for consistency: PxVehicleWheelGraphChannel has been introduced to replace the enum beginning with PxVehicleGraph::eCHANNEL_JOUNCE.</li>
-			<li>Minor api change for consistency: PxVehicleDriveGraphChannel has been introduced to replace the enum beginning with PxVehicleGraph::eCHANNEL_ENGINE_REVS.</li>
-			<li>Minor api change for consistency: PxVehicleGraphType has been introduced to replace the enum beginning with PxVehicleGraph::eGRAPH_TYPE_WHEEL.</li>
-			<li>PxVehicleUpdates now checks in checked and debug config that the wheel positioning of the driven wheels in a PxVehicleDrive4W obey the wheel ordering specified in PxVehicleDrive4WWheelOrder.  Vehicles that are back-to-front or right-to-left are also allowed. A warning is issued if the check fails.</li>
-			<li>PxVehicleUpdates now checks in checked and debug config that the odd wheels of a PxVehicleDriveTank are either all to the left or all to the right of their even-wheeled complement, as specified in PxVehicleDriveTankWheelOrder.  A warning is issued if the check fails.</li>
-			<li>To improve api consistency the arguments of the function PxVehicleDriveDynData::setAnalogInput(const PxReal analogVal, const PxU32 type) have been swapped so that it is now of the form PxVehicleDriveDynData::setAnalogInput(const PxU32 type, const PxReal analogVal).</li>
-			<li>Non-persistent wheel dynamics data (slip, friction, suspension force, hit data etc) has been moved out of the PxVehicleWheelsDynData class and is now recorded in a PxWheelQueryResult buffer that is passed as a function argument to the PxVehicleUpdates function.</li>
-			<li>PxVehicleWheels::isInAir() has been replaced with PxVehicleIsInAir(const PxVehicleWheelQueryResult& vehWheelQueryResults) to reflect the change to non-persistent data storage.</li>
-			<li>The origin of vehicles can now be shifted to stay in sync when the origin of the underlying PxScene is shifted. See PxVehicleShiftOrigin() for details.</li>
-			<li>PxVehicleWheels::setWheelShapeMapping and PxVehicleWheels::getWheelShapeMapping have been moved to PxVehicleWheelsSimData::setWheelShapeMapping and PxVehicleWheelsSimData::getWheelShapeMapping</li>
-			<li>PxVehicleWheels::setSceneQueryFilterData and PxVehicleWheels::getSceneQueryFilterData have been moved to PxVehicleWheelsSimData::setSceneQueryFilterData and PxVehicleWheelsSimData::getSceneQueryFilterData</li>
-			<li>PxVehicle4WEnable3WTadpoleMode and PxVehicle4WEnable3WDeltaMode now take an extra function argument: a non-const reference to a PxVehicleWheelsDynData.</li>
-			<li>The section "SI Units" in the vehicle guide has been updated to include the new functon PxVehicleWheelsSimData::setMinLongSlipDenominator.</li>
-			<li>PxVehicleTireData::mCamberStiffness has been replaced with PxVehicleTireData::mCamberStiffnessPerUnitGravity.  PxVehicleTireData::mCamberStiffnessPerUnitGravity should be set so that it is equivalent to the old value of PxVehicleTireData::mCamberStiffness divided by the magnitude of gravitational acceleration.</li>
-			<li>PxVehicleComputeTireForceDefault has been removed from the public vehicle api. Custom tire shaders that call PxVehicleComputeTireForceDefault are best implemented by taking a copy of PxVehicleComputeTireForceDefault and calling the copy instead.</li>
+			<li>Microsoft Windows XP or later</li>
+			<li>Microsoft Visual Studio 2008, 2010, 2012 (Windows RT/PS4/XboxOne only)</li>
+			<li>Xcode 4.6</li>
 		</ul>
-		<li>Fixed:</li>
+	</blockquote>
+
+	<h2>Known Issues</h2>
+	<blockquote>
 		<ul>
-			<li>Sticky tire friction is now activated in the tire's lateral direction at the tire force application point when the velocity at the base of the tire has low longitudinal and low lateral components. Longitudinal sticky tire friction is unaffected and is still activated when the vehicle has low forward speed. This fixes a minor bug where vehicles positioned on a slope perpendicular to the slope's downwards direction can slowly drift down the slope.</li>
-			<li>Bugs in the suspension force and tire load computation have been fixed that affected handling when the car was upside down.</li>
-			<li>The tire load passed to the tire force computation is now clamped so that it never falls below zero.</li>
-			<li>A bug in the tank damping forces has now been fixed.  Tanks now slow down more aggressively from engine and wheel damping forces.</li>
+			<li>PxSimulationOrder::eSOLVE_COLLIDE feature is not implemented in this release.  Calls to PxScene::solve() and PxScene::collide() will be ignored with a warning added to the error stream</li>
+			<li>Reading back the kinematic target through PxRigidDynamic::getKinematicTarget() in the PxSimulationEventCallback::onContact() callback will fail.</li>
+			<li>Cloth self-collision without rest position is not supported on GPUs with SM capability lower than 2.0.</li>
 		</ul>
-	</ul>
+	</blockquote>
+	<h2>
+		Changes and Resolved Issues</h3>
+		<blockquote>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<i>Note: Platform specific issues and changes can be found in the readme file of the corresponding platform.</i>
 
+			<h3>General</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>PxScene::setLimits: Hint to preallocate capacities of various data structures. See PxSceneLimits.</li>
+					<li>Scalar constructors PxQuat(r), PxMat33(r), PxMat33(r).</li>
+					<li>identity constructors for PxMat33, PxMat44, PxQuat, PxTransform - e.g. PxTransform(PxIdentity).</li>
+					<li>zero constructors for PxMat33, PxMat44, PxVec3 - e.g. PxMat33(PxZero).</li>
+					<li>PxTransform(x, y, z) constructor was added as a shortcut for PxTransform(PxVec3(x,y,z)).</li>
+					<li>The GPU code uses CUDA version 5.0 and supports Kepler GPUs (SM version 3.0 and 3.5).
+					<li>Helper method PxContactPair::bufferContacts() has been added to copy the contact pair data stream into a user buffer.</li>
+					<li>PxGeometryQuery::computePenetration() has been added, to compute the Minimum Translational Distance between geometry objects.</li>
+					<li>Ability for APEX and other PhysX extensions to change the PhysX Visual Indicator.</li>
+					<li>Reporting allocation names can now be enabled or disabled (see PxFoundation::setReportAllocationNames). When enabled, some platforms allocate memory through 'malloc'.</li>
+					<li>The scene origin can now be shifted to better support big world scenarios. See PxScene::shiftOrigin() for details.</li>
+					<li>PxAssertHandler got extended with a boolean parameter to support ignoring specific asserts. See windows implementation of DefaultAssertHandler.</li>
+					<li>Added new PxPairFlags - PxPairFlag::eSOLVE_CONTACT and PxPairFlag::eDETECT_DISCRETE_CONTACT.</li>
+				</ul>
+				<li>Removed:</li>
+				<ul>
+					<li>The obsolete PxConvexFlag::eUSE_UNCOMPRESSED_NORMALS flag has been removed.</li>
+					<li>The PxSceneFlag::eDISABLE_SSE was obsolete, and has now been removed.</li>
+					<li>The obsolte PxPtrArray has been removed</li>
+				</ul>
+				<li>Changed:</li>
+				<ul>
+					<li>Mesh BVH construction was significantly improved for meshes with a mix of large and small triangles. Mesh sizes are now slightly increased, but traversals are substantially faster. As a side effect, cooked mesh format has changed. This requires meshes to be recooked!</li>
+					<li>The specification for valid PxBounds3 representations has changed. See the API documentation for details (especially the newly introduced PxBounds3::isValid() method).</li>
+					<li>PxBounds3::transform(), ::scale() and ::fatten() have been split into a fast and a safe version to avoid unnecessary checks and improve stability for empty bounds respectively.</li>
+					<li>PxBounds3::setInfinite() has been renamed to PxBounds3::setMaximal() to better fit the actual behavior.</li>
+					<li>PxTask: PVD profile events for tasks are now only emitted in profile builds (all platforms).</li>
+					<li>Platform mutex implementations now verify that lock/unlock come from correct thread in debug builds.</li>
+					<li>PxWindowsDelayLoadHook.h has been moved from Include/foundation/windows to Include/common/windows.</li>
+					<li>API instances of 'Num' as in 'maxNum' and 'getNum' have changed uniformly to 'Nb'.</li>
+					<li>
+						The following classes have been renamed:
+						<ul>
+							<li>PxSceneQueryHit to PxQueryHit</li>
+							<li>PxSceneQueryFlags to PxHitFlags</li>
+							<li>PxSceneQueryHitType to PxQueryHitType</li>
+							<li>PxSceneQueryFilterData to PxQueryFilterData</li>
+							<li>PxSceneQueryFilterCallback to PxQueryFilterCallback</li>
+							<li>PxSceneQueryFilterFlags to PxQueryFlags</li>
+							<li>PxSceneQueryCache to PxQueryCache</li>
+							<li>PxCCTNonWalkableMode to PxControllerNonWalkableMode</li>
+							<li>PxControllerFlags to PxControllerCollisionFlags</li>
+							<li>PxCCTHit to PxControllerHit</li>
+							<li>PxConstraintDominance to PxDominanceGroupPair</li>
+							<li>PxActorTypeSelectionFlags to PxActorTypeFlags</li>
+							<li>PxFindOverlapTriangleMeshUtil to PxMeshOverlapUtil</li>
+						</ul>
+						The previous names have been retained for compatibility but are deprecated.
+					</li>
+					<li>PX_SLEEP_INTERVAL has been replaced with the new parameter PxSceneDesc::wakeCounterResetValue to specify the wake counter value to set when wakeUp() gets called on dynamic objects.</li>
+					<li>PxClientBehaviorBit has been renamed PxClientBehaviorFlag, PxActorClientBehaviorBit has been renamed PxActorClientBehaviorFlag. Names of related functions have also changed.</li>
+					<li>queryClient parameter in raycast(), sweep(), overlap() functions was moved inside of PxQueryFilterData struct.</li>
+					<li>The PxObserver/PxObservable system has been replaced by the PxDeletionListener API. The supported object types have been extended from PxActor to all core objects inheriting from PxBase. Furthermore, two kinds of deletion events are now distinguished: user release and memory release. Please read the API documentation for details.</li>
+					<li>Deprecated PxPairFlag::eRESOLVE_CONTACT. Use PxPairFlag::eDETECT_DISCRETE_CONTACT and PxPairFlag::eSOLVE_CONTACT instead.</li>
+					<li>Number of materials per shape is now PxU16 instead of PxU32, contact material information also now returns PxU16.</li>
+					<li>Maximum number of touching hits in batched queries is now PxU16 instead of PxU32.</li>
+					<li>SweepEpsilonDistance has been replaced by meshContactMargin and marked as deprecated. Please read the API documentation for details.</li>
+					<li>PxShape::resetFiltering() and PxParticleBase::resetFiltering() have been deprecated. Please use one of the new overloaded methods PxScene::resetFiltering() instead.</li>
+					<li>The pxtask namespace has been removed and it's types have been added to the physx namespace with a Px* prefix</li>
+					<li>The delay load hook PxDelayLoadHook::setPhysXInstance has been renamed to PxSetPhysXDelayLoadHook and PxDelayLoadHook::setPhysXCookingInstance has been renamed to PxSetPhysXCookingDelayLoadHook</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>Calling Thread::setAffinityMask() before the thread has been started is now supported.</li>
+					<li>PxDefaultSimulationFilterShader ignored the value of the second filter constant in the collision filtering equation and used the value of the first filter constant instead.</li>
+				</ul>
+				<li>Deprecated:</li>
+				<ul>
+					<li>The PxScene::flush() method has been deprecated, please use PxScene::flushSimulation().</li>
+					<li>PxRigidDynamicFlag has been deprecated and replaced with PxRigidBodyFlag to allow flags to be shared between PxArticulationLink and PxRigidDynamic.</li>
+					<li>PxTransform::createIdentity(), PxQuat::createIdentity(), PxMat33::createIdentity(), PxMat44::createIdentity(), PxMat33::createZero(), PxMat44::createZero()</li>
+				</ul>
+			</ul>
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2.4</h1>
+			<h3>Character controller</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>The PxControllerBehaviorFlag::eCCT_USER_DEFINED_RIDE flag has been added.</li>
+					<li>A new helper function PxController::resize() has been added to facilitate character controller resizing.</li>
+					<li>A new runtime tessellation feature has been added that can help reducing FPU accuracy issues in the sweep tests.</li>
+					<li>PxControllerFilterCallback has been added to make CCT-vs-CCT filtering more flexible.</li>
+					<li>An overlap recovery module has been added. See PxControllerManager::setOverlapRecoveryModule</li>
+					<li>PxObstacle notifications has been added to handle touched obstacles.</li>
+					<li>PxObstacleContext::getObstacleByHandle has been added.</li>
+					<li>The origin of character controllers and obstacles can now be shifted to stay in sync when the origin of the underlying PxScene is shifted. See PxControllerManager::shiftOrigin() for details.</li>
+				</ul>
+				<li>Changed:</li>
+				<ul>
+					<li>The PxControllerManager is now tightly coupled to a PxScene which has to be provided on creation. See PxCreateControllerManager().</li>
+					<li>The PxObstacleContext instances of a PxControllerManager will now get released automatically when the manager is released.</li>
+					<li>PxController::reportSceneChanged() has been renamed to PxController::invalidateCache().</li>
+					<li>PxControllerBehaviorFlag::eCCT_CAN_RIDE_ON_OBJECT is not the default behavior anymore.</li>
+					<li>PxCCTNonWalkableMode::eFORCE_SLIDING has been renamed to PxCCTNonWalkableMode::ePREVENT_CLIMBING_AND_FORCE_SLIDING.</li>
+					<li>PxCCTInteractionMode has been renamed PxControllerInteractionMode</li>
+					<li>PxCCTNonWalkableMode has been renamed PxControllerNonWalkableMode</li>
+					<li>PxCCTHit has been renamed PxControllerHit</li>
+					<li>Touched PxObstacle has been replaced by touched ObstacleHandle.</li>
+					<li>PxControllerInteractionMode and PxControllerFilters::mActiveGroups have been removed. Please use the new PxControllerFilterCallback instead.</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>Bugs with respect to deleting shapes from a touching actor have been fixed.</li>
+					<li>Touched actor's scene is checked in CCT::move before rideOnTouchedObject is called to ensure that the touched shape is valid.</li>
+					<li>Touched shape's scene query flag is checked in CCT::move before rideOnTouchedObject is called to ensure that the touched shape is valid.</li>
+					<li>Touched shape's user CCT filtering is triggered in CCT::move before rideOnTouchedObject is called to ensure that the touched shape is valid.</li>
+					<li>The PxControllerBehaviorCallback was not called when a PxUserControllerHitReport was not defined.</li>
+					<li>It is not possible anymore to create a CCT whose contact offset is zero. The doc has always said a non-zero value was expected, but corresponding check was missing.</li>
+					<li>Box & capsule controllers' resize function now properly take the up direction into account</li>
+				</ul>
+			</ul>
 
-<h2 style="TEXT-ALIGN: center">April 2013</h2>
+			<h3>CCD</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>Introduced PxRigidBodyFlag::eENABLE_CCD_FRICTION to control whether friction is applied inside CCD on a given body. This is disabled by default. In general, disabling friction in CCD improves the behavior of high-speed collisions.</li>
+					<li>Introduced PxSceneDesc::ccdMaxPasses field, which controls the maximum number of CCD passes. Each CCD pass will advance each object to its next TOI. By default, we use 1 pass. Increasing the number of passes can reduced the likelihood of time being dropped inside the CCD system at the cost of extra CCD procesing overhead.</li>
+					<li>Introduced per-body value to control how far past the initial time of impact the CCD advances the simulation. Advancing further can improve fluidity at the increased risk of tunneling.</li>
+					<li>CCD now has limited support contact modification. It supports disabling response to contacts by setting max impulse to 0. It does not yet support target velocity, non-zero max impulse values or scaling mass or inertia.</li>
+					<li>Introduced new PxPairFlag eDETECT_CCD_CONTACT. This flags is used to control whether CCD performs sweep tests for a give pair. Decision over whether any collisions are responded to is made by the presence of the flag eSOLVE_CONTACT.</li>
+				</ul>
+				<li>Removed:</li>
+				<ul>
+					<li>CCD is now enabled per-body instead of per-shape. As a result, PxShapeFlag::eUSE_SWEPT_BOUNDS has been removed and replaced with PxRigidBodyFlag::eENABLE_CCD.</li>
+				</ul>
+				<li>Changed:</li>
+				<ul>
+					<li>API attributes previously named SWEPT_INTEGRATION have now been renamed 'ccd': specifically PxSceneFlag::eENABLE_SWEPT_INTEGRATION, PxSceneFlag::eSWEPT_INTEGRATION_LINEAR, PxSceneDesc::sweptIntegrationLinearSpeedFactor, PxSceneDesc::sweptIntegrationAngularSpeedFactor, PxPairFlag::eENABLE_SWEPT_INTEGRATION</li>
+					<li>PxPairFlag::eCCD_LINEAR has been deprecated. Use (PxPairFlag::eDETECT_CCD_CONTACT | PxPairFlag::eSOLVE_CONTACT) instead.</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>Updated CCD algorithm which improves the fluidity of motion of the CCD system while reducing the processing overhead significantly.</li>
+					<li>Contact notification is now reliable in CCD. CCD contacts are appended to the end of the contact list for a given pair so that both discrete and continuous contacts are reported</li>
+					<li>CCD contact notification now reports applied forces. These contacts will be correctly filtered by force thresholds.</li>
+				</ul>
+			</ul>
 
-<h2>What's New In NVIDIA PhysX 3.2.4</h2>
+			<h3>Serialization</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>Added extension PxSerialization::isSerializable method to query whether a collection is serializable.</li>
+					<li>Added extension PxSerialization::complete which allows to prepare a collection for serialization.</li>
+					<li>Added extension PxSerialization::createNames which adds reference names to serializables.</li>
+					<li>Added extension PxCollectionExt::remove which removes all serializables of a certain type and optionally adds them to another collection.</li>
+					<li>Added extension function PxCollectionExt::releaseObjects to remove and release objects from a collection</li>
+					<li>Added class PxSerializationRegistry for handling custom serializable types.</li>
+					<li>Added PxSerialization::serializeCollectionToXml and PxSerialization::createCollectionFromXml</li>
+					<li>Included pre-built binary meta data with SDK at [path to installed PhysX SDK]/Tools/BinaryMetaData</li>
+					<li>Added class PxSerializer to provide serialization specific functionality to serializable classes.</li>
+					<li>Added classes PxSerializationContext and PxDeserializationContext to provide functionality for serialization and deserialization operations.</li>
+				</ul>
+				<li>Removed:</li>
+				<ul>
+					<li>Removed PxUserReferences class and PxPhysics::createUserReferences.</li>
+					<li>Removed RepX.h and unified serialization interfaces for xml and binary serialization.</li>
+				</ul>
+				<li>Changed:</li>
+				<ul>
+					<li>PxCollection was reworked to improve reference management rendering PxUserReferences obsolete. PxCollection was also decoupled more from Serialization/Deserialization functionality. Serialization of incomplete collections fails now early at serialization as opposed to late at deserialization.</li>
+					<li>Replaced PxPhysics::createCollection with PxCreateCollection.
+					<li>
+					<li>Replaced RepXCollection with PxCollection, and unified corresponding interfaces.</li>
+					<li>Replaced PxCollection::serialize with PxSerialization::serializeCollectionToBinary.</li>
+					<li>Replaced PxCollection::deserialize with PxSerialization::createCollectionFromBinary.</li>
+					<li>Replaced PxSerializable::collectForExport(PxCollection& c) with PxSerializer::requires. The new method works in a non-recursive way.</li>
+					<li>Replaced PxDumpMetaData with PxSerialization::dumpMetaData.</li>
+					<li>Replaced PxCollectForExportSDK with PxCollectionExt::createCollection(PxPhysics& sdk).</li>
+					<li>Replaced PxCollectForExportScene with PxCollectionExt::createCollection(PxScene& scene).</li>
+					<li>Moved PxCooking::createBinaryConverter to PxSerialization</li>
+					<li>Changed PxShape release semantics for shapes. As a consequence deserialized shapes are never autmatically released, but need to be released by the application. Exception: PxPhysics::release.</li>
+				</ul>
+			</ul>
 
-<blockquote>
-	<h4>General</h4>
-	<ul>
-		<li><i>Note: PS3 platform specific changes can be found in the PS3 readme file.</i></li>
-		<li>Fixed a bug which caused actors to return wrong world bounds if the bounds minimum was above 10000 on any axis.</li>
-		<li>Reporting allocation names can now be enabled or disabled (see PxFoundation::setReportAllocationNames). When enabled, some platforms allocate memory through 'malloc'.</li>
-		<li>eEXCEPTION_ON_STARTUP is removed from PxErrorCode and it is no longer needed.</li>
-		<li>Added boilerplate.txt to the Tools folder. SpuShaderDump.exe and clang.exe require it.</li>
-		<li>PxWindowsDelayLoadHook.h has been moved from Include/foundation/windows to Include/common/windows.</li>
-		<li>PxScene::saveToDesc now reports the bounceThresholdVelocity value.</li>
-		<li>Fixed a bug in PxDefaultSimulationFilterShader: the value of the second filter constant in the collision filtering equation was ignored and instead the value of the first filter constant was used.</li>
-		<li>Fixed a crash bug in PCM collision.</li>
-	</ul>
+			<h3>Cloth</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>Improved GPU cloth performance significantly with new parallel solver.</li>
+					<li>Added tether constraints, which allow minimum amount of cloth stretching even for large gravity scales. See PxClothFabric.</li>
+					<li>Added support for dynamic addition and deletion of collision primitives.  See PxCloth::addCollision* and PxCloth::removeCollision*.</li>
+					<li>Added triangle mesh collider support. See PxCloth::setCollisionTriangles. </li>
+					<li>Added support for self collision and inter-cloth collision. See PxCloth::setSelfCollision*() and PxScene::setClothInterCollision*().</li>
+					<li>Added direct access to CUDA particle data for graphics interoperability, see PxCloth::lockParticleData()</li>
+					<li>Added PxRegisterCloth to reduce binary size by stripping unused code on platforms where static linking is used.</li>
+					<li>Added methods setWakeCounter/getWakeCounter() (see the corresponding API documentation for details).</li>
+					<ul>
+						<li>It is illegal to call wakeUp/putToSleep/isSleeping() on a PxCloth that has not been added to a scene.</li>
+					</ul>
+				</ul>
+				<li>Changed:</li>
+				<ul>
+					<li>Cloth solver does not use fibers any more. See PxClothFabric for changes in fabric API.</li>
+					<li>Moved PxCooking.cookClothFabric() to extensions. See PxClothFabricCooker and PxClothFabricCreate.</li>
+					<li>PxClothMeshDesc has been moved to extensions and now supports both triangle and quad representations.  See PxClothMeshDesc.</li>
+					<li>The scaling of damping and stiffness coefficients has been separated from the solver frequency and can now be set indepedently using PxCloth::setStiffnessFrequency().</li>
+					<li>PxCloth::setInertiaScale() has been split into linear, angular, and centrifugal components.  See PxCloth::set*IntertiaScale.</li>
+					<li>Drag coefficient has been split into linear and angular coefficient. See PxCloth::setLinearDragCoefficient and PxCloth::setAngularDragCoefficient.</li>
+					<li>Renamed PxCloth::lockClothReadData() to lockParticleData().  Added support for update operations on the returned particle arrays (as an alternative to setParticles()).</li>
+					<li>PxCloth::wakeUp() does not have a parameter anymore. Use setWakeCounter() instead to set a specific value.</li>
+					<li>PxCloth::getNbCollisionSpherePairs() has been renamed to PxCloth::getNbCollisionCapsules()</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>Fixed a crash bug in the clothing collision code appearing on iOS.</li>
+				</ul>
+			</ul>
 
+			<h3>Rigid Bodies</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>The contact distance parameter for a limit is automatically estimated if not supplied in the constructor for the limit structure.</li>
+					<li>The new callback PxConstraintConnector::onOriginShift() has been introduced. It gets called for all constraints of a scene, when its origin is shifted.</li>
+					<li>New helper function PxRigidBodyExt::computeVelocityDeltaFromImpulse has been added.</li>
+					<li>Shapes may be declared as shared on creation, and then attached to multiple actors, see the user manual for restrictions.</li>
+					<li>
+						Since a shape is no longer necessarily associated with a unique actor, references to shapes in callbacks from the engine are accompanied by the
+						a reference to the associated actor
+					</li>
+					<li>Joints and contact modification now support tuning relative mass and inertia for the bodies on a per-contact basis. Inertia and mass can be tuned independently.</li>
+				</ul>
+				<li>Removed:</li>
+				<ul>
+					<li>PxShape::overlap(), PxShape::sweep() and PxShape::raycast() have been removed. Equivalent functionality is provided in PxGeometryQuery.</li>
+				</ul>
+				<li>Changed:</li>
+				<ul>
+					<li>It is illegal to call resetFiltering() on a PxShape or PxParticleBase if they have not been added to a scene.</li>
+					<li>It is illegal to call addForce/addTorque/clearForce/clearTorque() on a PxRigidBody that has not been added to a scene.</li>
+					<li>
+						The sleep behavior of dynamic rigid bodies has changed significantly (for details on the current behavior see the API documentation of isSleeping(), wakeUp(), putToSleep(), setKinematicTarget(), PxRigidBodyFlag::eKINEMATIC, ...). Among the changes are:
+						<ul>
+							<li>The methods setWakeCounter/getWakeCounter() have been added for PxRigidDynamic and PxArticulation objects (see the corresponding API documentation for details).</li>
+							<li>The wakeUp() method of PxRigidDynamic and PxArticulation has lost the wake counter parameter. Use setWakeCounter() instead to set a specific value.</li>
+							<li>It is illegal to call wakeUp/putToSleep/isSleeping() on a PxRigidDynamic or PxArticulation that has not been added to a scene.</li>
+							<li>Putting a dynamic rigid actor to sleep will clear any pending force updates.</li>
+							<li>Switching a dynamic actor to kinematic will put the actor to sleep immediately.</li>
+							<li>Switching a kinematic actor back to dynamic will not affect the sleep state (previously the actor was woken up).</li>
+							<li>Calling wakeUp/putToSleep() on a kinematically controlled dynamic actor is not valid any longer. The sleep state of a kinematic actor is solely defined based on whether a target pose has been set (see API documentation of isSleeping() for details).</li>
+							<li>A call to PxRigidBody::setCMassLocalPose() does not wake up the actor anymore. Note: this also affects related methods in PhysXExtensions like PxRigidBodyExt::updateMassAndInertia() etc.</li>
+							<li>If a non-zero velocity or force is set through PxRigidBody::setLinearVelocity(), ::setAngularVelocity(), ::addForce() or ::addTorque(), the actor will get woken up automatically even if the autowake parameter is false.</li>
+							<li>PxRigidBody::clearForce() and ::clearTorque() do not have the autowake parameter, to optionally wake the actor up, anymore. These methods will not change the sleep state any longer. Call ::wakeUp() subsequently to get the old default behavior.</li>
+							<li>Adding or removing a PxConstraint/PxJoint to/from the scene does not wake the connected actors up automatically anymore.</li>
+							<li>It is now possible to avoid automatic wake up of previously touching objects on scene removal. See the additional parameter wakeOnLostTouch in PxScene::removeActor(), ::removeArticulation(), ::removeAggregte(), PxRigidActor::detachShape().</li>
+						</ul>
+					</li>
+					<li>PxJointLimit and PxJointLimitPair are now PxJointLinearLimit, PxJointLinearLimitPair, PxJointAngularLimitPair, depending on whether the limit is linear or angular.</li>
+					<li>Joints now solve for the entire position error rather than a ratio of 0.7 of it. The flag PxConstraintFlag::eDEPRECATED_32_COMPATIBILITY can be used to restore this behavior</li>
+					<li>PxConstraintFlag::Type has been renamed to PxConstraintFlag::Enum
+					<li>
+					<li>The spring constant parameter in joints and articulations that was previously 'spring' is now 'stiffness'.</li>
+					<li>The tangential spring constant parameter in articulations that was previously 'tangentialSpring' is now 'tangentialStiffness'.</li>
+					<li>Constraints do not respect PxDominanceGroup settings. Use PxJoint::setInvMassScale and setInvInertiaScale</li>
+					<li>Shapes are reference counted. PxShape::release() now decrements the reference count on a shape, and its use is deprecated for detaching a shape from its actor - use detachShape() instead.</li>
+					<li>Shape creation methods do not take a local transform parameter anymore. Instead PxShapeFlags can be specified. Triangle meshes, height fields and plane geometry shapes cannot be combined with non-kinematic PxRigidDynmic actors if PxShapeFlag::eSIMULATION_SHAPE is specified. Corresponding calls to PxRigidActor::createShape() or PxRigidActor::attachShape() are not supported.</li>
+					<li>PxShape::getActor() now returns a pointer, which is NULL if the shape is shareable.</li>
+					<li>PxShape::getWorldBounds() has been replaced with PxShapeExt::getWorldBounds().</li>
+					<li>PxContactPoint has been renamed PxFeatureContact.</li>
+					<li>The internal format for contact storage has been modified; applications directly accessing the internal contact representation rather than PxContactPair::extractContacts should be modified accordingly.</li>
+					<li>Friction mode flags eENABLE_ONE_DIRECTIONAL_FRICTION and eENABLE_TWO_DIRECTIONAL_FRICTION have been replaced by PxFrictionType::Enum PxSceneDesc::frictionType.</li>
+					<li>PxSceneDesc::contactCorrelationDistance has been deprecated.</li>
+					<li>PxSceneDesc::contactCorrelationDistance no longer has an influence on how many friction anchors are created in a single frame, only on when they are removed in later frames.  This may cause a very minor change in friction behavior.</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>Rigid bodies now properly accumulate the forces/torques that are applied with addForce/addTorque while scene insertion is still pending.  This affects bodies added to the scene while the scene is simulating and then given forces/torques with addForce/addTorque while the scene is simulating.  These accumulated forces and torques are applied during the next simulate() call.  Prior to this fix the forces/torques that accumulated while scene insertion was pending were lost and never applied.</li>
+					<li>Its now possible to serialize collections with jointed actors without including the corresponding joints in the collection. The deserialized actors will not be jointed anymore.</li>
+					<li>Joint drive force limits are actual force limits rather than impulse limits. Set the flag PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES to false to support legacy behavior</li>
+					<li>Angular drive constraints for revolute joints were wrongly computed, resulting in a negative angular velocity when a positive drive target was set.</li>
+					<li>Scene addActors will now correctly remove all actors that were passed to add, if some insert failed.</li>
+					<li>Contact modification now enforces max impulse field correctly. Previously, it only enforced it if max impulse was set to 0.</li>
+					<li>Contact modification now supports target velocity in all directions. Previously, it only enforced the target velocity components that were perpendicular to the contact normal.</li>
+					<li>Jittering of small spheres & capsules on very large triangles has been fixed.</li>
+					<li>Setting sleep threshold to 0 now guarantees that bodies won't fall asleep even if their kinetic energy reaches exactly 0.</li>
+				</ul>
+				<li>Deprecated:</li>
+				<ul>
+					<li>PxShape::release() now decrements the reference count on a shape, and its use is deprecated for detaching a shape from its actor - use detachShape() instead.</li>
+					<li>PxJoint::getType() is deprecated - use getConcreteType() instead.</li>
+					<li>PxConstraintFlag::eREPORTING is deprecated - constraints always generate force reports</li>
+					<li>PxConstraintDominance is deprecated - use PxDominanceGroupPair instead.</li>
+				</ul>
+			</ul>
 
-	<h4>Rigid Bodies</h4>
-	<ul>
-	    <li>Forces applied to bodies (with PxRigidBody::addForce) that go to sleep in the subsequent update now have their applied forces cleared when the body is set to sleep to avoid them being applied in a later update when the body is once more awake.  This bug broke the rule that forces applied with PxRigidBody::addForce do not persist beyond the next scene update.</li>
-		<li>Jittering of small spheres & capsules on very large triangles has been fixed.</li>
-    </ul>
+			<h3>Scene queries</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>PxVolumeCache, a volumetric cache for local collision geometry.</li>
+					<li>Parameter inflation was added to some PxGeometryQuery functions.</li>
+					<li>Flag eMESH_BOTH_SIDES can be now used to control triangle mesh culling for raycasts and sweeps.</li>
+					<li>Added PxBatchQueryMemory as a part of PxBatchQueryDesc, to allow memory buffers to be set before execution.</li>
+					<li>PxBatchQueryDesc() constructor now requires 3 parameters at initialization - see migration guide for more details.</li>
+					<li>PxBatchQuery::raycast, sweep, overlap calls will now issue a warning and discard the query when over maximum allowed queries.</li>
+					<li>There is a new flag to allow manually updating the scene query representation, see: PxSceneFlags::eENABLE_MANUAL_SQ_UPDATE and PxScene::flushQueryChanges().</li>
+					<li>Added PxScene::forceDynamicTreeRebuild() function to immediately rebuild the scene query structures.</li>
+					<li>Added bool PxSweepHit::hadInitialOverlap() returning true if a sweep hit occurred early due to initial overlap at distance=0.</li>
+				</ul>
+				<li>Removed:</li>
+				<ul>
+					<li>PxBatchQuery::linearCompoundGeometrySweepSingle and PxBatchQuery::linearCompoundGeometrySweepMultiple functions are no longer supported.</li>
+					<li>Globals PxPS3ConfigParam::eSPU_OVERLAP, eSPU_RAYCAST, eSPU_SWEEP that were previous set via setSceneParamInt call are replaced with PxBatchQueryDesc::runOnSpu. See migration guide for more details.</li>
+				</ul>
+				<li>Changed:</li>
+				<ul>
+					<li>Scene Query raycastAny/Single/Multiple APIs were merged into a single raycast() call (same for overlaps and sweeps). Please refer to user and migration guides for details.</li>
+					<li>Scene Query raycast() API now uses a PxRaycastBuffer or a PxRaycastCallback parameter for reporting hits. Blocking hits are now reported separately from toching and PxRaycastCallback class supports reporting an unbounded number of results (same for overlaps and sweeps).</li>
+					<li>A const templated PxRaycastBufferN<int> object was added to allow convenient creation of fixed size scene query touch buffers. Same for overlaps and sweeps.</li>
+					<li>Support for compound sweeps was moved out from core SDK to extensions.</li>
+					<li>Support for compound sweeps was moved from PxScene to extensions (see PxRigidBodyExt::linearSweepSingle, PxRigidBodyExt::linearSweepMultiple).</li>
+					<li>PxQueryFilterCallback::preFilter now passes an actor pointer as well as a shape pointer.</li>
+					<li>PxSceneQueryFlag::eINITIAL_OVERLAP and PxSceneQueryFlag::eINITIAL_OVERLAP_KEEP have been replaced with PxHitFlag::eINITIAL_OVERLAP_DISABLE and PxLocationHit::hadInitialOverlap(). Note that checking for initial overlap is now the defaut for sweeps.</li>
+					<li>Sweeps in 3.3 execute using a new faster code path, in some cases with reduced precision. If you encounter precision issues not previously experienced in earlier versions of PhysX, use ePRECISE_SWEEP flag to enable the backwards compatible more accurate sweep code.</li>
+					<li>The default behavior for overlap queries with query filters returning eBLOCK has changed to only return one of eBLOCK hits. Please refer to the migration guide for details.</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>Scene Query performance was significantly improved in a variety of scenarios.</li>
+					<li>Fixed a bug in capsule/mesh overlap code that occasionally caused unreported and misreported faces.</li>
+					<li>Fixed a crash in raycastMultiple when a convex was hit by the ray and eMESH_MULTIPLE flag was specified as a query flag.</li>
+					<li>Fixed a rare crash in heightfield raycast code.</li>
+					<li>Internal limit of 65536 results has been removed.</li>
+					<li>Accuracy in sphere/capsule-vs-triangle overlap and sweep tests has been improved.</li>
+				</ul>
+			</ul>
 
-	<h4>Cooking</h4>
-	<ul>
-		<li>PxCookingParams constructor is now marked as deprecated. PxToleranceScale is needed for PxCookingParams in order to perform additional triangle check during cooking. This triangle check will trigger warning if too huge triangles are used. This check will ensure better simulation stability.</li>
-	</ul>
+			<h3>Cooking</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>Added support for convex hull creation with limited output vertices count.</li>
+					<li>Added support for convex hull creation directly from polygons rather than triangles.</li>
+					<li>Added support function computeHullPolygons in cooking, that creates hull polygons from given vertices and triangles. The resulting polygons can be used to create the convex hull directly.</li>
+				</ul>
+				<li>Changed:</li>
+				<ul>
+					<li>Changed convex hull volume integrals.</li>
+					<li>PxCookingParams constructor requires now PxTolerancesScale as an additional parameter. This enables us to perform further checks on the triangles during cooking. A warning will be emitted to the error stream if too huge triangles were found. This will ensure better simulation stability.</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>Optimized heightfield load code for no-endian conversion case.</li>
+				</ul>
+			</ul>
 
-	<h4>Scene Queries</h4>
-	<ul>
-		<li>Added PxScene::forceDynamicTreeRebuild() function to immediately rebuild the scene query structures.</li>
-		<li>Accuracy in sphere/capsule-vs-triangle overlap and sweep tests has been improved.</li>
-	</ul>
+			<h3>Triangle meshes</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>Added PxTriangleMeshFlag::eHAS_ADJACENCY_INFO flag for adjacency information checks.</li>
+				</ul>
+				<li>Removed:</li>
+				<ul>
+					<li>Removed has16BitTriangleIndices(), replaced by triangleMesh->getTriangleMeshFlags() & PxTriangleMeshFlag::eHAS_16BIT_TRIANGLE_INDICES.</li>
+				</ul>
+			</ul>
 
-	<h4>Broad Phase</h4>
-	<ul>
-		<li>Fixed assert in debug mode that wrongly asserted when an overlap pair was removed with perfect equality between the min of one bound and the max of the other along at least one axis.</li>
-	</ul>
+			<h3>Particles</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>Direct read access to CUDA particle data has been added for graphics interoperability, see PxParticleBase::lockParticleReadData(PxDataAccessFlags flags) and PxParticleFluid::lockParticleFluidReadData(PxDataAccessFlags flags)</li>
+					<li>Added PxRegisterParticles to reduce binary size by stipping unused code on platforms where static linking is used.</li>
+					<li>Added setExplicitCudaFlushCountHint to allow early flushing of the cuda push buffer.</li>
+					<li>Added caching of triangle meshes. setTriangleMeshCacheSizeHint is supported on Kepler and above GPUs.</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>Creating and immediately setting the position of particles failed and possibly crashed with GPU acceleration enabled. This would only happen after one or more simulation updates.</li>
+					<li>Creating many spread out particles might have crashed the GPU pipeline.</li>
+				</ul>
+			</ul>
 
-	<h4>Character controller</h4>
-	<ul>
-		<li>PxControllerFilterCallback has been added, to make CCT-vs-CCT filtering more flexible.</li>
-		<li>Fixed a bug where PxControllerBehaviorCallback was not called when a PxUserControllerHitReport was not defined.</li>
-		<li>PxObstacle notifications has been added to handle touched obstacles.</li>
-		<li>Touched PxObstacle has been replaced by touched ObstacleHandle.</li>
-		<li>PxObstacleContext::getObstacleByHandle has been added.</li>
-		<li>Touched actors scene is checked before ride on shape, to ensure valid touched shape.</li>
-		<li>Touched shape scene query flag is checked before ride on shape, to ensure valid touched shape.</li>
-		<li>Touched shape user CCT filtering is triggered before ride on shape, to ensure valid touched shape.</li>
-		<li>Box & capsule controllers' resize function now properly take the up direction into account</li>
-	</ul>
+			<h3>Broad Phase</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>The SDK now supports multiple broad-phase algorithms (SAP & MBP). See Release Highlights, PxBroadPhaseType and PxBroadPhaseDesc.</li>
+					<li>PxVisualizationParameter::eMBP_REGIONS has been added to visualize MBP regions</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>The sap broadphase now gracefully handles PxShape instances whose axis-aligned bounding boxes have min/max limits with value  +/- PX_MAX_F32 or QNAN or INF. Such bounds values can occur if a PxShape is given a global transform that is either illegal or close to the upper limit of the floating point range. Prior to this release, the sap broadphase could crash when the axis-aligned bounds of shapes had values that weren't within the floating point range.  This has now been fixed.  The overlap pairs reported by the broadphase for bounds with such values is undefined.</li>
+				</ul>
+			</ul>
 
-	<h4>Vehicles</h4>
-	<ul>
-		<li>Documented potential problem with PxVehicleWheelsDynData::getTireDrivableSurfaceType() and PxVehicleWheelsDynData::getTireDrivableSurfaceShape() whereby the pointer returned may reference a PxShape or PxMaterial that has been released in-between storing the pointer in PxVehicleUpdates and any subsequent query.</li>
-		<li>PxVehicleWheelsSimData::disableWheel has been documented in more detail.  PxVehicleWheelsSimData::enableWheel has been added.</li>
-		<li>Fixed a bug where the denominator of the longitudinal slip calculation didn't take into account the value of PxTolerancesScale::length.  This will only have an impact if PxTolerancesScale::length != 1.0f.</li>
-		<li>Fixed a bug where the engine torque would be incorrectly applied if PxTolerancesScale::length != 1.0f.  This will only have an impact if PxTolerancesScale::length != 1.0f.</li>
-	</ul>
+			<h3>Vehicles</h3>
+			<ul>
+				<li>Added:</li>
+				<ul>
+					<li>Vehicles now support serialization. A PxSerializationRegistry instance may be passed into PxInitVehicleSdk and PxCloseVehicleSdk in order to enable support.</li>
+					<li>Vehicle telemetry graphs can now be queried per channel for the most recent value with the function PxVehicleGraph::getLatestValue.</li>
+					<li>New vehicle PxVehicleDriveNW type has been introduced.  This class makes use of a new differential type PxVehicleDifferentialNW, which allows specified wheels to be equally coupled to the differential, and allows all for some or all of the N wheels to be driven. </li>
+					<li>Support for camber angles has been added to the PxVehicleSuspensionData class.</li>
+					<li>Moment of inertia parameter has been added to the PxVehicleEngineData class. Prior to this a value of 1.0 was assumed internally.  A default value of 1.0 has been used in the constructor for backwards compatability.</li>
+					<li>Vehicle manual contains new section describing the conversion of default vehicle parameter values from SI units to any system of units with particular reference to the use of centimetres instead of metres .</li>
+					<li>The SI units of each parameter in PxVehicleComponents.h has been documented. </li>
+					<li>Vehicle manual contains updated troubleshooting section.</li>
+					<li>The requirements for disabled wheels (PxVehicleWheelsSimData::disableWheel) have been documented to make it clear that disabled wheels must be no longer associated with a PxShape, must have zero rotation speed, and must be decoupled from the differential.  This is also now discussed in the guide.</li>
+					<li>Wheel raycasts documentation has been improved to clarify the start and end points of each raycast.</li>
+					<li>Suspension line raycasts do not need to be performed for each vehicle prior to update with PxVehicleUpdates.  This feature is implemented with a boolean array passed as an extra function argument to PxVehicleSuspensionRaycasts.  This feature is useful for vehicles that require only low level of detail.</li>
+					<li>The clutch model now supports two accuracy modes (PxVehicleClutchAccuracyMode::eESTIMATE and PxVehicleClutchAccuracyMode::eBEST_POSSIBLE).  If the estimate mode is chosen the computational cost and accuracy of the clutch can be tuned with PxVehicleClutchData::mEstimateIterations.</li>
+					<li>PxVehicleSuspensionData now has a function setMassAndPreserveNaturalFrequency.  This modifies the mass and stiffness of the spring in a way that preserves the spring natural frequency.</li>
+					<li>A new helper function PxVehicleCopyDynamicsData has been added that allows dynamics data such as engine rotation speed, wheel rotation speed, gear etc to be copied from one vehicle to another of the same type.  This is particularly useful if a vehicle has a number of different versions where each represents a different level of detail.</li>
+					<li>A new function PxVehicleWheelsSimData::copy has been added to allow per wheel dynamics data to be copied from one vehicle to another.</li>
+					<li>The vehicle manual contains a new section "Level of Detail" describing the available options for vehicles that require only a low level of detail.</li>
+					<li>PxVehicleTireLoadFilterData now requires that mMinNormalisedLoad is greater than or equal to zero.</li>
+					<li>PxVehicleTireLoadFilterData now has a new member variable mMinFilteredNormalisedLoad.  This value describes the filtered normalised load that occurs when the normalised is less than or equal to mMinNormalisedLoad.</li>
+					<li>PxVehicleWheelsSimData now has a new function setMinLongSlipDenominator.  This can be used to tune stability issues that can arise when the vehicle slows down in the absence of brake and drive torques.</li>
+					<li>A new section "PxVehicleAutoBoxData" has been added to the vehicle tuning guide to describe operation of the automatic gearbox.</li>
+					<li>A new section "The Vehicle Under-steers Then Over-steers" has been added to the vehicle troubleshooting guide to describe steps to avoid twitchy handling on bumpy surfaces.</li>
+					</li>A new section "The Vehicle Never Goes Beyond First Gear" has been added to the vehicle troubleshooting guide to describe a common scenario that occurs when the automatic gearbox is given a latency time that is shorter than the time taken to complete a gear change.</li>
+					<li>A new section "The Vehicle Slows Down Unnaturally" has been added to the vehicle troubleshooting guide to describe the steps that can be taken to help the vehicle slow down more smoothly.</li>
+					<li>A number of vehicle snippets have been added.</li>
+				</ul>
+				<li>Changed:</li>
+				<ul>
+					<li>Minor api change for consistency: PxVehicleDrive4WWheelOrder has been introduced to replace the enum beginning with PxVehicleDrive4W::eFRONT_LEFT_WHEEL.</li>
+					<li>Minor api change for consistency: PxVehicleDrive4WControl has been introduced to replace the enum beginning with PxVehicleDrive4WControl::eANALOG_INPUT_ACCEL.</li>
+					<li>Minor api change for consistency: PxVehicleDriveTankWheelOrder has been introduced to replace the enum beginning with PxVehicleDriveTankWheelOrder::eFRONT_LEFT.</li>
+					<li>Minor api change for consistency: PxVehicleDriveTankControl has been introduced to replace the enum beginning with PxVehicleDriveTank::eANALOG_INPUT_ACCEL.</li>
+					<li>Minor api change for consistency: PxVehicleDriveTankControlModel has been introduced to replace the enum beginning with PxVehicleDriveTank::eDRIVE_MODEL_STANDARD.</li>
+					<li>Minor api change for consistency: PxVehicleTypes has been introduced to replace the enum beginning with eVEHICLE_TYPE_DRIVE4W.</li>
+					<li>Minor api change for consistency: PxVehicleWheelGraphChannel has been introduced to replace the enum beginning with PxVehicleGraph::eCHANNEL_JOUNCE.</li>
+					<li>Minor api change for consistency: PxVehicleDriveGraphChannel has been introduced to replace the enum beginning with PxVehicleGraph::eCHANNEL_ENGINE_REVS.</li>
+					<li>Minor api change for consistency: PxVehicleGraphType has been introduced to replace the enum beginning with PxVehicleGraph::eGRAPH_TYPE_WHEEL.</li>
+					<li>PxVehicleUpdates now checks in checked and debug config that the wheel positioning of the driven wheels in a PxVehicleDrive4W obey the wheel ordering specified in PxVehicleDrive4WWheelOrder.  Vehicles that are back-to-front or right-to-left are also allowed. A warning is issued if the check fails.</li>
+					<li>PxVehicleUpdates now checks in checked and debug config that the odd wheels of a PxVehicleDriveTank are either all to the left or all to the right of their even-wheeled complement, as specified in PxVehicleDriveTankWheelOrder.  A warning is issued if the check fails.</li>
+					<li>To improve api consistency the arguments of the function PxVehicleDriveDynData::setAnalogInput(const PxReal analogVal, const PxU32 type) have been swapped so that it is now of the form PxVehicleDriveDynData::setAnalogInput(const PxU32 type, const PxReal analogVal).</li>
+					<li>Non-persistent wheel dynamics data (slip, friction, suspension force, hit data etc) has been moved out of the PxVehicleWheelsDynData class and is now recorded in a PxWheelQueryResult buffer that is passed as a function argument to the PxVehicleUpdates function.</li>
+					<li>PxVehicleWheels::isInAir() has been replaced with PxVehicleIsInAir(const PxVehicleWheelQueryResult& vehWheelQueryResults) to reflect the change to non-persistent data storage.</li>
+					<li>The origin of vehicles can now be shifted to stay in sync when the origin of the underlying PxScene is shifted. See PxVehicleShiftOrigin() for details.</li>
+					<li>PxVehicleWheels::setWheelShapeMapping and PxVehicleWheels::getWheelShapeMapping have been moved to PxVehicleWheelsSimData::setWheelShapeMapping and PxVehicleWheelsSimData::getWheelShapeMapping</li>
+					<li>PxVehicleWheels::setSceneQueryFilterData and PxVehicleWheels::getSceneQueryFilterData have been moved to PxVehicleWheelsSimData::setSceneQueryFilterData and PxVehicleWheelsSimData::getSceneQueryFilterData</li>
+					<li>PxVehicle4WEnable3WTadpoleMode and PxVehicle4WEnable3WDeltaMode now take an extra function argument: a non-const reference to a PxVehicleWheelsDynData.</li>
+					<li>The section "SI Units" in the vehicle guide has been updated to include the new functon PxVehicleWheelsSimData::setMinLongSlipDenominator.</li>
+					<li>PxVehicleTireData::mCamberStiffness has been replaced with PxVehicleTireData::mCamberStiffnessPerUnitGravity.  PxVehicleTireData::mCamberStiffnessPerUnitGravity should be set so that it is equivalent to the old value of PxVehicleTireData::mCamberStiffness divided by the magnitude of gravitational acceleration.</li>
+					<li>PxVehicleComputeTireForceDefault has been removed from the public vehicle api. Custom tire shaders that call PxVehicleComputeTireForceDefault are best implemented by taking a copy of PxVehicleComputeTireForceDefault and calling the copy instead.</li>
+				</ul>
+				<li>Fixed:</li>
+				<ul>
+					<li>Sticky tire friction is now activated in the tire's lateral direction at the tire force application point when the velocity at the base of the tire has low longitudinal and low lateral components. Longitudinal sticky tire friction is unaffected and is still activated when the vehicle has low forward speed. This fixes a minor bug where vehicles positioned on a slope perpendicular to the slope's downwards direction can slowly drift down the slope.</li>
+					<li>Bugs in the suspension force and tire load computation have been fixed that affected handling when the car was upside down.</li>
+					<li>The tire load passed to the tire force computation is now clamped so that it never falls below zero.</li>
+					<li>A bug in the tank damping forces has now been fixed.  Tanks now slow down more aggressively from engine and wheel damping forces.</li>
+				</ul>
+			</ul>
 
-	<h4>Particles</h4>
-	<ul>
-		<li>Creating and immediately setting the position of particles failed and possibly crashed with GPU acceleration enabled. This would only happen after one or more simulation updates.</li>
-	</ul>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<br>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-</blockquote>
 
-<h2><br><a name="platforms324">Supported Platforms</a></h2>
+			<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2.4</h1>
 
-<blockquote>
-	Unchanged from <a href="#platforms323"> from 3.2.3</a> except:
-    <h4>Development</h4>
-    <ul>
-		<li>Upgraded to Xcode 4.6</li>
-    </ul>
-</blockquote>
+			<h2 style="TEXT-ALIGN: center">April 2013</h2>
 
-<h2><br>
-<a name="knownissues324">Known Issues And Limitations</a></h2>
+			<h2>What's New In NVIDIA PhysX 3.2.4</h2>
 
-<blockquote>
-	Unchanged from <a href="#knownissues323"> from 3.2.3</a>.
-</blockquote>
-<br>
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li><i>Note: PS3 platform specific changes can be found in the PS3 readme file.</i></li>
+					<li>Fixed a bug which caused actors to return wrong world bounds if the bounds minimum was above 10000 on any axis.</li>
+					<li>Reporting allocation names can now be enabled or disabled (see PxFoundation::setReportAllocationNames). When enabled, some platforms allocate memory through 'malloc'.</li>
+					<li>eEXCEPTION_ON_STARTUP is removed from PxErrorCode and it is no longer needed.</li>
+					<li>Added boilerplate.txt to the Tools folder. SpuShaderDump.exe and clang.exe require it.</li>
+					<li>PxWindowsDelayLoadHook.h has been moved from Include/foundation/windows to Include/common/windows.</li>
+					<li>PxScene::saveToDesc now reports the bounceThresholdVelocity value.</li>
+					<li>Fixed a bug in PxDefaultSimulationFilterShader: the value of the second filter constant in the collision filtering equation was ignored and instead the value of the first filter constant was used.</li>
+					<li>Fixed a crash bug in PCM collision.</li>
+				</ul>
 
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>Forces applied to bodies (with PxRigidBody::addForce) that go to sleep in the subsequent update now have their applied forces cleared when the body is set to sleep to avoid them being applied in a later update when the body is once more awake.  This bug broke the rule that forces applied with PxRigidBody::addForce do not persist beyond the next scene update.</li>
+					<li>Jittering of small spheres & capsules on very large triangles has been fixed.</li>
+				</ul>
 
+				<h4>Cooking</h4>
+				<ul>
+					<li>PxCookingParams constructor is now marked as deprecated. PxToleranceScale is needed for PxCookingParams in order to perform additional triangle check during cooking. This triangle check will trigger warning if too huge triangles are used. This check will ensure better simulation stability.</li>
+				</ul>
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2.3</h1>
+				<h4>Scene Queries</h4>
+				<ul>
+					<li>Added PxScene::forceDynamicTreeRebuild() function to immediately rebuild the scene query structures.</li>
+					<li>Accuracy in sphere/capsule-vs-triangle overlap and sweep tests has been improved.</li>
+				</ul>
 
-<h2 style="TEXT-ALIGN: center">November 2012</h2>
+				<h4>Broad Phase</h4>
+				<ul>
+					<li>Fixed assert in debug mode that wrongly asserted when an overlap pair was removed with perfect equality between the min of one bound and the max of the other along at least one axis.</li>
+				</ul>
 
-<h2>What's New In NVIDIA PhysX 3.2.3</h2>
+				<h4>Character controller</h4>
+				<ul>
+					<li>PxControllerFilterCallback has been added, to make CCT-vs-CCT filtering more flexible.</li>
+					<li>Fixed a bug where PxControllerBehaviorCallback was not called when a PxUserControllerHitReport was not defined.</li>
+					<li>PxObstacle notifications has been added to handle touched obstacles.</li>
+					<li>Touched PxObstacle has been replaced by touched ObstacleHandle.</li>
+					<li>PxObstacleContext::getObstacleByHandle has been added.</li>
+					<li>Touched actors scene is checked before ride on shape, to ensure valid touched shape.</li>
+					<li>Touched shape scene query flag is checked before ride on shape, to ensure valid touched shape.</li>
+					<li>Touched shape user CCT filtering is triggered before ride on shape, to ensure valid touched shape.</li>
+					<li>Box & capsule controllers' resize function now properly take the up direction into account</li>
+				</ul>
 
-<blockquote>
-	<h4>General</h4>
-	<ul>
-		<li><i>Note: PS3 platform specific changes can be found in the PS3 readme file.</i></li>
-		<li>Quaternions passed through the API are now considered valid if their magnitude is between 0.99 and 1.01.</li>
-		<li>Fixed crash when running out of memory on creation of a triangle mesh.</li>
-		<li>For applications using floating point exceptions, the SDK will now mask or avoid exceptions arising from invalid floating point operations (inexact and underflow exceptions may still be generated).</li>
-		<li>Fixed a bug with recursive use of the PxScene read/write lock.</li>
-		<li>Fixed a shutdown bug with very short lived threads on Linux platforms.</li>
-		<li>PhysX version number in error messages now printed in hex for easier reading.</li>
-		<li>Fixed memory barrier and prefetch implementation for all posix based platforms (android, ios, osx, linux).</li>
-	</ul>
+				<h4>Vehicles</h4>
+				<ul>
+					<li>Documented potential problem with PxVehicleWheelsDynData::getTireDrivableSurfaceType() and PxVehicleWheelsDynData::getTireDrivableSurfaceShape() whereby the pointer returned may reference a PxShape or PxMaterial that has been released in-between storing the pointer in PxVehicleUpdates and any subsequent query.</li>
+					<li>PxVehicleWheelsSimData::disableWheel has been documented in more detail.  PxVehicleWheelsSimData::enableWheel has been added.</li>
+					<li>Fixed a bug where the denominator of the longitudinal slip calculation didn't take into account the value of PxTolerancesScale::length.  This will only have an impact if PxTolerancesScale::length != 1.0f.</li>
+					<li>Fixed a bug where the engine torque would be incorrectly applied if PxTolerancesScale::length != 1.0f.  This will only have an impact if PxTolerancesScale::length != 1.0f.</li>
+				</ul>
 
-	<h4>Broad Phase</h4>
-	<ul>
-		<li>Fixed a broad phase crash bug that occurred when deleting shapes with bounds very far from the origin.</li>
-	</ul>
+				<h4>Particles</h4>
+				<ul>
+					<li>Creating and immediately setting the position of particles failed and possibly crashed with GPU acceleration enabled. This would only happen after one or more simulation updates.</li>
+				</ul>
 
+			</blockquote>
 
-	<h4>Collision Detection</h4>
-	<ul>
-		<li>Documentation of limits of PxShape counts has been added for affected platforms.</li>
-		<li>Made kinematics interact better with CCD.</li>
-		<li>Adding support for disabled contact response in CCD by respecting the dominance setting.  In this case, CCD will emit events but will not alter the motion of the bodies.</li>
-		<li>Fixed potential crash in eENABLE_PCM codepath.</li>
-	</ul>
+			<h2><br><a name="platforms324">Supported Platforms</a></h2>
 
+			<blockquote>
+				Unchanged from <a href="#platforms323"> from 3.2.3</a> except:
+				<h4>Development</h4>
+				<ul>
+					<li>Upgraded to Xcode 4.6</li>
+				</ul>
+			</blockquote>
 
-	<h4>Rigid Bodies</h4>
-	<ul>
-		<li>Fixed bug in force based contact reports. An assert could occur when PxPairFlag::eNOTIFY_THRESHOLD_FORCE_PERSISTS was set and PxPairFlag::eNOTIFY_THRESHOLD_FORCE_FOUND was not set.</li>
-		<li>Twist Limit range is documented for revolute and D6 joints, and validated.</li>
-		<li>Reduced the number of SDK allocations when using CCD.</li>
-	</ul>
+			<h2><br><a name="knownissues324">Known Issues And Limitations</a></h2>
 
+			<blockquote>
+				Unchanged from <a href="#knownissues323"> from 3.2.3</a>.
+			</blockquote>
+			<br>
 
-	<h4>Scene Queries</h4>
-	<ul>
-        <li>Raycasts against heighfields now return correct actual mesh index, which can be used for getTriangle().</li>
-        <li>Fixed bug that caused scene queries to miss hits for static rigid bodies that got moved around (after having been added to the scene).</li>
-        <li>Fixed rebuilding the dynamic structure properly when used for static rigid bodies.</li>
-        <li>Fixed a rare crash in heightfield raycast code.</li>
-	</ul>
 
-	<h4>Character controller</h4>
-	<ul>
-		<li>A new runtime tessellation feature has been added, that can help reducing FPU accuracy issues in the sweep tests.</li>
-	</ul>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<br>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-	<h4>Convex hulls</h4>
-	<ul>
-		<li>Zero-sized convex hull data double delete detection fix.</li>
-	</ul>
 
-	<h4>Vehicles</h4>
-	<ul>
-		<li>Vehicles with rigid body actors that are asleep and also have no acceleration or steer inputs are treated as though they are asleep too; that is, they are bypassed completely in the PxVehicleUpdates function.  Vehicles with sleeping rigid body actors but with non-zero acceleration or steer inputs are processed as normal in PxVehicleUpdates and will automatically have their rigid body actor woken up.</li>
-		<li>New function PxVehicleSetUpdateMode to allow PxVehicleUpdates to select between applying accelerations to vehicle rigid bodies or immediate updating of their velocity.</li>
-	</ul>
+			<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2.3</h1>
 
-	<h4>Particles</h4>
-	<ul>
-		<li>Fixed a non-deterministic crash appearing with rigid bodies using CCD and gpu particles in the same scene.</li>
-	</ul>
+			<h2 style="TEXT-ALIGN: center">November 2012</h2>
 
-	<h4>Physx Visual Debugger</h4>
-	<ul>
-		<li>Material release events are now correctly sent to PVD. </li>
-	</ul>
+			<h2>What's New In NVIDIA PhysX 3.2.3</h2>
 
-        <h4>RepX</h4>
-	<ul>
-		<li>Add more RepX class information in PhysXAPI document. </li>
-	</ul>
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li><i>Note: PS3 platform specific changes can be found in the PS3 readme file.</i></li>
+					<li>Quaternions passed through the API are now considered valid if their magnitude is between 0.99 and 1.01.</li>
+					<li>Fixed crash when running out of memory on creation of a triangle mesh.</li>
+					<li>For applications using floating point exceptions, the SDK will now mask or avoid exceptions arising from invalid floating point operations (inexact and underflow exceptions may still be generated).</li>
+					<li>Fixed a bug with recursive use of the PxScene read/write lock.</li>
+					<li>Fixed a shutdown bug with very short lived threads on Linux platforms.</li>
+					<li>PhysX version number in error messages now printed in hex for easier reading.</li>
+					<li>Fixed memory barrier and prefetch implementation for all posix based platforms (android, ios, osx, linux).</li>
+				</ul>
 
-</blockquote>
+				<h4>Broad Phase</h4>
+				<ul>
+					<li>Fixed a broad phase crash bug that occurred when deleting shapes with bounds very far from the origin.</li>
+				</ul>
 
-<h2><br><a name="platforms323">Supported Platforms</a></h2>
 
-<blockquote>
-    <h4>Runtime</h4>
-    <ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver ver 295.73 or later is required for GPU acceleration)</li>
-		<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
-        <li>Microsoft XBox 360</li>
-        <li>Sony Playstation 3</li>
-    </ul>
-    <h4>Development</h4>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2008, 2010</li>
-		<li>Xcode 4.5</li>
-    </ul>
-</blockquote>
+				<h4>Collision Detection</h4>
+				<ul>
+					<li>Documentation of limits of PxShape counts has been added for affected platforms.</li>
+					<li>Made kinematics interact better with CCD.</li>
+					<li>Adding support for disabled contact response in CCD by respecting the dominance setting.  In this case, CCD will emit events but will not alter the motion of the bodies.</li>
+					<li>Fixed potential crash in eENABLE_PCM codepath.</li>
+				</ul>
 
-<h2><br>
-<a name="knownissues323">Known Issues And Limitations</a></h2>
 
-<blockquote>
-	Unchanged from <a href="#knownissues322"> from 3.2.2</a>.
-</blockquote>
-<br>
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>Fixed bug in force based contact reports. An assert could occur when PxPairFlag::eNOTIFY_THRESHOLD_FORCE_PERSISTS was set and PxPairFlag::eNOTIFY_THRESHOLD_FORCE_FOUND was not set.</li>
+					<li>Twist Limit range is documented for revolute and D6 joints, and validated.</li>
+					<li>Reduced the number of SDK allocations when using CCD.</li>
+				</ul>
 
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+				<h4>Scene Queries</h4>
+				<ul>
+					<li>Raycasts against heighfields now return correct actual mesh index, which can be used for getTriangle().</li>
+					<li>Fixed bug that caused scene queries to miss hits for static rigid bodies that got moved around (after having been added to the scene).</li>
+					<li>Fixed rebuilding the dynamic structure properly when used for static rigid bodies.</li>
+					<li>Fixed a rare crash in heightfield raycast code.</li>
+				</ul>
 
+				<h4>Character controller</h4>
+				<ul>
+					<li>A new runtime tessellation feature has been added, that can help reducing FPU accuracy issues in the sweep tests.</li>
+				</ul>
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2.2</h1>
+				<h4>Convex hulls</h4>
+				<ul>
+					<li>Zero-sized convex hull data double delete detection fix.</li>
+				</ul>
 
-<h2 style="TEXT-ALIGN: center">October 2012</h2>
+				<h4>Vehicles</h4>
+				<ul>
+					<li>Vehicles with rigid body actors that are asleep and also have no acceleration or steer inputs are treated as though they are asleep too; that is, they are bypassed completely in the PxVehicleUpdates function.  Vehicles with sleeping rigid body actors but with non-zero acceleration or steer inputs are processed as normal in PxVehicleUpdates and will automatically have their rigid body actor woken up.</li>
+					<li>New function PxVehicleSetUpdateMode to allow PxVehicleUpdates to select between applying accelerations to vehicle rigid bodies or immediate updating of their velocity.</li>
+				</ul>
 
-<h2>What's New In NVIDIA PhysX 3.2.2</h2>
+				<h4>Particles</h4>
+				<ul>
+					<li>Fixed a non-deterministic crash appearing with rigid bodies using CCD and gpu particles in the same scene.</li>
+				</ul>
 
-<blockquote>
-	<h4>General</h4>
-	<ul>
-		<li><i>Note: PS3 platform specific changes can be found in the PS3 readme file.</i></li>
-
-		<li>Added Microsoft Windows RT (formerly known as Windows on ARM) support.</li>
-		<li>Suspended Sony Playstation Vita support.</li>
-		<li>PxScene now exposes methods to make multithreaded sharing of scenes easier, see PxSceneFlags::eREQUIRE_RW_LOCK for details.</li>
-		<li>Enabled Win64 DEBUG build to use SIMD enabled code path.</li>
-		<li>Fixed bug in quaternion to axis/angle routine which failed on negative w values.</li>
-		<li>Fixed crash when using ConvX on a PxCollection with external references.</li>
-		<li>Fixed a spurious overlapping read/write error report when using simulation call backs in checked builds.</li>
-		<li>The bounce threshold velocity can be set at run-time with PxScene::setBounceThresholdVelocity. Likewise, it can be queried with PxScene::getBounceThresholdVelocity</li>
-		<li>Fixed return value of Ps::atomicExchange for POSIX based platforms: Linux, Android, IOS and OSX</li>
-		<li>PxGeometryQuery::computePenetration() has been added, to compute the Minimum Translational Distance between geometry objects.</li>
-	</ul>
+				<h4>Physx Visual Debugger</h4>
+				<ul>
+					<li>Material release events are now correctly sent to PVD. </li>
+				</ul>
+
+				<h4>RepX</h4>
+				<ul>
+					<li>Add more RepX class information in PhysXAPI document. </li>
+				</ul>
+
+			</blockquote>
+
+			<h2><br><a name="platforms323">Supported Platforms</a></h2>
+
+			<blockquote>
+				<h4>Runtime</h4>
+				<ul>
+					<li>Apple iOS</li>
+					<li>Apple Mac OS X</li>
+					<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
+					<li>Linux (tested on Ubuntu)</li>
+					<li>Microsoft Windows XP or later (NVIDIA Driver ver 295.73 or later is required for GPU acceleration)</li>
+					<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
+					<li>Microsoft XBox 360</li>
+					<li>Sony Playstation 3</li>
+				</ul>
+				<h4>Development</h4>
+				<ul>
+					<li>Microsoft Windows XP or later</li>
+					<li>Microsoft Visual Studio 2008, 2010</li>
+					<li>Xcode 4.5</li>
+				</ul>
+			</blockquote>
+
+			<h2><br><a name="knownissues323">Known Issues And Limitations</a></h2>
+
+			<blockquote>
+				Unchanged from <a href="#knownissues322"> from 3.2.2</a>.
+			</blockquote>
+			<br>
+
+
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<br>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+
+
+			<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2.2</h1>
+
+			<h2 style="TEXT-ALIGN: center">October 2012</h2>
+
+			<h2>What's New In NVIDIA PhysX 3.2.2</h2>
+
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li><i>Note: PS3 platform specific changes can be found in the PS3 readme file.</i></li>
+
+					<li>Added Microsoft Windows RT (formerly known as Windows on ARM) support.</li>
+					<li>Suspended Sony Playstation Vita support.</li>
+					<li>PxScene now exposes methods to make multithreaded sharing of scenes easier, see PxSceneFlags::eREQUIRE_RW_LOCK for details.</li>
+					<li>Enabled Win64 DEBUG build to use SIMD enabled code path.</li>
+					<li>Fixed bug in quaternion to axis/angle routine which failed on negative w values.</li>
+					<li>Fixed crash when using ConvX on a PxCollection with external references.</li>
+					<li>Fixed a spurious overlapping read/write error report when using simulation call backs in checked builds.</li>
+					<li>The bounce threshold velocity can be set at run-time with PxScene::setBounceThresholdVelocity. Likewise, it can be queried with PxScene::getBounceThresholdVelocity</li>
+					<li>Fixed return value of Ps::atomicExchange for POSIX based platforms: Linux, Android, IOS and OSX</li>
+					<li>PxGeometryQuery::computePenetration() has been added, to compute the Minimum Translational Distance between geometry objects.</li>
+				</ul>
+
+				<h4>Broad Phase</h4>
+				<ul>
+					<li>Fixed a broad phase crash bug.</li>
+				</ul>
+
+				<h4>Collision Detection</h4>
+				<ul>
+					<li>Collision detection now more robust when confronted with ill-conditioned scenarios.</li>
+					<li>Fixed crash when SDK is unable to create more contact pairs.  Now a warning is emitted and the contacts are ignored.</li>
+				</ul>
+
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>Improved the numerical stability of articulations.</li>
+					<li>The target pose of a kinematically controlled dynamic actor can now get extracted through PxRigidDynamic::getKinematicTarget().</li>
+					<li>The new flag PxRigidDynamicFlag::eUSE_KINEMATIC_TARGET_FOR_SCENE_QUERIES makes scene queries use the target pose of a kinematically controlled dynamic actor instead of the actual pose.</li>
+					<li>Fixed PxConstraintFlag::eVISUALIZATION having no effect.</li>
+					<li>Fixed bug that caused sleep/wake-up notification events to get lost.</li>
+					<li>Fixed a bug where sleeping objects were not waking up properly.</li>
+					<li>Fixed potential assert when switching a rigid body between kinematic and dynamic with contact reports enabled.</li>
+					<li>Fixed a bug where CCD didn't consider the scaling transform for triangle meshes.</li>
+					<li>Fixed a potential crash bug when PxConstraintFlag::ePROJECTION gets raised after a joint (that connects to other projecting joints) has been created.</li>
+					<li>Fixed a bug that resulted in joint breakage being reported every frame for any broken joint</li>
+					<li>Added PxArticulationDriveCache for applying an impulse to an entire articulation</li>
+					<li>fixed various crash bugs when sleeping articulations were removed from the scene and re-added
+					<li>
+				</ul>
+
+				<h4>GPU Physics</h4>
+				<ul>
+					<li>Fixed a possible out of bounds array access in GPU particle collision code.</li>
+					<li>Updated the GPU PhysX Visual Indicator to allow APEX to hook into it.</li>
+					<li>Fixed sample particles crash when there is a cuda kernel error.</li>
+					<li>Upgraded GPU tech to CUDA 4.2.</li>
+				</ul>
+
+				<h4>Scene Queries</h4>
+				<ul>
+					<li>PxSceneQueryHit::faceIndex is now filled in for sweeps against convex meshes.</li>
+					<li>Added support for sphere and capsule shaped heightfield overlap queries.</li>
+					<li>Added an inflation parameter to all sweep queries in PxGeometryQuery, PxBatchQuery, and PxScene.  This can be used to maintain a minimum distance between objects when moving them using sweeps.</li>
+					<li>Made sure that raycast multiple will include the closest triangle in the results even if the output cannot hold all the triangles that are hit.</li>
+					<li>Fixed swept sphere against capsule not properly testing for initial overlap.</li>
+					<li>Fixed the normal vector returned from sweeps being sometimes negated.</li>
+					<li>Fixed a scenario where raycasting could miss an actor after the user has moved it using setGlobalPose().</li>
+					<li>Fixed swept convex against plane sometimes reporting false hits</li>
+					<li>Fixed swept/overlap/raycast with convexes that don't have identity scale rotation.</li>
+					<li>Fixed a culling bug for box-triangle mesh sweep.</li>
+				</ul>
+
+				<h4>Convex hulls</h4>
+				<ul>
+					<li>Convex hull is rejected if it has less than 4 polygons.</li>
+					<li>Additional convex hull check has been added to detect open volumes.</li>
+				</ul>
+
+				<h4>Triangle meshes</h4>
+				<ul>
+					<li>Added triangle mesh flags for 16bit indices and adjacency information.</li>
+					<li>Fixed adjacency information order for getTriangle with triangle meshes to respect the vertex order.</li>
+				</ul>
+
+				<h4>HeightFields</h4>
+				<ul>
+					<li>Fixed bug where capsules would bounce as they roll across heightfield edges.</li>
+					<li>Fixed bug where spheres would bounce as they roll across heightfield vertices.</li>
+					<li>Added adjacency information for getTriangle with height fields.</li>
+				</ul>
+
+				<h4>Particles</h4>
+				<ul>
+					<li>Fixed triangle mesh shapes with ePARTICLE_DRAIN colliding with particles.</li>
+					<li>Fixed crash with GPU particles when a lot of grid cells get occupied and vacated within one time step.</li>
+				</ul>
+
+				<h4>Character Controller</h4>
+				<ul>
+					<li>The PxControllerBehaviorFlag::eCCT_USER_DEFINED_RIDE flag has been added.</li>
+					<li>Fixed character controller not walking up steps properly if they are not exactly 90 degrees vertical.</li>
+					<li>Fixed a bug where the character controller was not able to move up slopes when using a small step offset setting.</li>
+					<li>Fixed a bug where the character controller could rise off the ground when blocked by a step.</li>
+					<li>Fixed a bug where the character controller could rise off the ground when hitting another character controller.</li>
+					<li>Fixed a bug where releasing a shape of a touching actor and then releasing the character controller would crash. Releasing shapes of actors in touch may still lead to crashes in other situations. PxController::invalidateCache() can be used to work around these situations.</li>
+				</ul>
+				<h4> CCD </h4>
+				<ul>
+					<li>Fixed culling bug in CCD sweeps with meshes with transforms that caused contacts to be missed </li>
+				</ul>
+
+				<h4>Vehicles</h4>
+				<ul>
+					<li>The vehicle sdk used to make the assumption that wheels 0 and 1 (the front wheels) of a PxVehicleDrive4W responded positively to the input steering wheel angle, while wheels 2 and 3 (the rear wheels) responded in the opposite direction to that of the input steering wheel angle.  A consequence of this assumed behaviour was the restriction that PxVehicleWheelData::mMaxSteer was limited to positive values.  This restriction has now been relaxed with the caveat that PxVehicleWheelData::mMaxSteer must be in range (-Pi/2, Pi/2).  It is now possible to turn each wheel positively or negatively relative to the input steering wheel angle by choosing positive or negative values for PxVehicleWheelData::mMaxSteer.  Ackermann steer correction might result in the front and rear wheels competing against each other if the rear and front all steer in the same direction relative to the input steering wheel angle. If this is the case it will be necessary to set the Ackermann accuracy to zero.</li>
+					<li>It is now possible to set the engine rotation speed (PxVehicleDriveDynData::setEngineRotationSpeed), the rotation speed of each wheel (PxVehicleWheelsDynData::setWheelRotationSpeed) and the rotation angle of each wheel (PxVehicleWheelsDynData::setWheelRotationAngle). The default values for each of these properties remains zero.</li>
+					<li>Wheel contact reporting has been improved with the addition of a number of query functions to the PxVehicleWheelsDynData class.  These are getTireDrivableSurfaceContactPoint, getTireDrivableSurfaceContactNormal, getTireLongitudinalDir, getTireLateralDir, getSuspensionForce, getTireDrivableSurfaceShape.</li>
+					<li>It is now possible to store a userData pointer per wheel.  This allows, for example, each wheel to be associated with a game object.  The relevant functions are PxVehicleWheelsDynData::setUserData and PxVehicleWheelsDynData::getUserData.</li>
+					<li>The default behavior of PxVehicleWheels::setWheelShapeMapping has changed. Previously, default values were automatically assigned to each wheel at construction so that the ith wheel was mapped to the ith body shape.  This, however, made the assumption that there was a wheel shape for each wheel, which is not always true.  As a consequence, the default value is now -1, meaning any mapping between body shape and wheel has to be explictily made by calling setWheelShapeMapping.</li>
+					<li>It is now possible to query the mapping between wheel and body shape with PxVehicleWheels::getWheelShapeMapping.</li>
+					<li>It is now possible to query the tire shader data that has been applied to each wheel with PxVehicleWheelsDynData::getTireForceShaderData.</li>
+					<li>The helper function PxVehicleComputeSprungMasses has been added to aid the setup of the suspension sprung masses from the rigid body centre of mass and wheel center coordinates.</li>
+					<li>The scene query filter data applied to the suspension raycasts was previously taken from the filter data of the associated body shape.  This makes the assumption of a body shape per wheel, which is not always true.  As a consequence, the filter data must be explictly set per wheel by calling PxVehicleWheels::setSceneQueryFilterData.  The filter data can be queried with PxVehicleWheels::getSceneQueryFilterData.</li>
+					<li>Sub-stepping of the vehicle update can now be applied per vehicle with PxVehicleWheelsSimData::setSubStepCount.</li>
+					<li>PxVehicleDrivableSurfaceToTireFrictionPairs has been modified so that the dictionary of material pointers can be updated without the necessity of further allocation.  The create function has been replaced with separate allocate and setup functions.</li>
+					<li>A new vehicle type PxVehicleNoDrive has been added to provide a close approximation to backwards compatibility with the api of the 2.8.x NxWheelShape.</li>
+				</ul>
+
+
+				<h4>Visual Remote Debugger</h4>
+				<ul>
+					<li>Added PVD compatible profile zones for batched queries.</li>
+					<li>Added the ability to capture and inspect scene queries in PVD.</li>
+					<li>SDK will now flush the pvd connection stream immediately after cloth or cloth fabric is created or released.</li>
+					<li>Fixed the PVD support for articulations.</li>
+					<li>Fixed PVD rendering wrong constraint limits.</li>
+				</ul>
+
+
+				<h4>Documentation</h4>
+				<ul>
+					<li>Wrong statement in PxRigidStatic::release() has been corrected. Static rigid bodies do wake up touching dynamic rigid bodies on release.</li>
+					<li>Wrong statement in PxShape::setFlag() has been corrected. It is a valid operation to clear all flags.</li>
+					<li>Retroactively added more detail about changes to 3.2.1 release notes below.</li>
+				</ul>
+			</blockquote>
 
-	<h4>Broad Phase</h4>
-	<ul>
-		<li>Fixed a broad phase crash bug.</li>
-	</ul>
+			<h2><br><a name="platforms322">Supported Platforms</a></h2>
 
-	<h4>Collision Detection</h4>
-	<ul>
-		<li>Collision detection now more robust when confronted with ill-conditioned scenarios.</li>
-		<li>Fixed crash when SDK is unable to create more contact pairs.  Now a warning is emitted and the contacts are ignored.</li>
-	</ul>
+			<blockquote>
+				<h4>Runtime</h4>
+				<ul>
+					<li>Apple iOS</li>
+					<li>Apple Mac OS X</li>
+					<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
+					<li>Linux (tested on Ubuntu)</li>
+					<li>Microsoft Windows XP or later (NVIDIA Driver ver 295.73 or later is required for GPU acceleration)</li>
+					<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
+					<li>Microsoft XBox 360</li>
+					<li>Sony Playstation 3</li>
+				</ul>
+				<h4>Development</h4>
+				<ul>
+					<li>Microsoft Windows XP or later</li>
+					<li>Microsoft Visual Studio 2008, 2010</li>
+					<li>Xcode 4.2</li>
+				</ul>
+			</blockquote>
 
-	<h4>Rigid Bodies</h4>
-	<ul>
-		<li>Improved the numerical stability of articulations.</li>
-		<li>The target pose of a kinematically controlled dynamic actor can now get extracted through PxRigidDynamic::getKinematicTarget().</li>
-		<li>The new flag PxRigidDynamicFlag::eUSE_KINEMATIC_TARGET_FOR_SCENE_QUERIES makes scene queries use the target pose of a kinematically controlled dynamic actor instead of the actual pose.</li>
-		<li>Fixed PxConstraintFlag::eVISUALIZATION having no effect.</li>
-		<li>Fixed bug that caused sleep/wake-up notification events to get lost.</li>
-		<li>Fixed a bug where sleeping objects were not waking up properly.</li>
-		<li>Fixed potential assert when switching a rigid body between kinematic and dynamic with contact reports enabled.</li>
-		<li>Fixed a bug where CCD didn't consider the scaling transform for triangle meshes.</li>
-		<li>Fixed a potential crash bug when PxConstraintFlag::ePROJECTION gets raised after a joint (that connects to other projecting joints) has been created.</li>
-		<li>Fixed a bug that resulted in joint breakage being reported every frame for any broken joint</li>
-		<li>Added PxArticulationDriveCache for applying an impulse to an entire articulation</li>
-		<li>fixed various crash bugs when sleeping articulations were removed from the scene and re-added<li>
-	</ul>
 
-	<h4>GPU Physics</h4>
-	<ul>
-		<li>Fixed a possible out of bounds array access in GPU particle collision code.</li>
-		<li>Updated the GPU PhysX Visual Indicator to allow APEX to hook into it.</li>
-		<li>Fixed sample particles crash when there is a cuda kernel error.</li>
-			<li>Upgraded GPU tech to CUDA 4.2.</li>
-	</ul>
 
-	<h4>Scene Queries</h4>
-	<ul>
-		<li>PxSceneQueryHit::faceIndex is now filled in for sweeps against convex meshes.</li>
-		<li>Added support for sphere and capsule shaped heightfield overlap queries.</li>
-		<li>Added an inflation parameter to all sweep queries in PxGeometryQuery, PxBatchQuery, and PxScene.  This can be used to maintain a minimum distance between objects when moving them using sweeps.</li>
-		<li>Made sure that raycast multiple will include the closest triangle in the results even if the output cannot hold all the triangles that are hit.</li>
-		<li>Fixed swept sphere against capsule not properly testing for initial overlap.</li>
-		<li>Fixed the normal vector returned from sweeps being sometimes negated.</li>
-		<li>Fixed a scenario where raycasting could miss an actor after the user has moved it using setGlobalPose().</li>
-		<li>Fixed swept convex against plane sometimes reporting false hits</li>
-		<li>Fixed swept/overlap/raycast with convexes that don't have identity scale rotation.</li>
-		<li>Fixed a culling bug for box-triangle mesh sweep.</li>
-	</ul>
 
-	<h4>Convex hulls</h4>
-	<ul>
-		<li>Convex hull is rejected if it has less than 4 polygons.</li>
-		<li>Additional convex hull check has been added to detect open volumes.</li>
-	</ul>
+			<h2><br><a name="knownissues322">Known Issues And Limitations</a></h2>
 
-	<h4>Triangle meshes</h4>
-	<ul>
-		<li>Added triangle mesh flags for 16bit indices and adjacency information.</li>
-		<li>Fixed adjacency information order for getTriangle with triangle meshes to respect the vertex order.</li>
-	</ul>
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li>Memory leaks might get reported when using the debug versions of malloc and free together with the debug version of PhysX on Microsoft Windows platforms with Visual Studio 2010. This is caused by a bug in the Visual Studio 2010 runtime libraries. If such a leak appears immediately after releasing PhysX and all its components, please contact us for information about workarounds.</li>
+					<li>Use of articulations may require an increase of 64K in the stack size for threads making API calls and engine worker threads</li>
+				</ul>
+			</blockquote>
 
-	<h4>HeightFields</h4>
-	<ul>
-		<li>Fixed bug where capsules would bounce as they roll across heightfield edges.</li>
-		<li>Fixed bug where spheres would bounce as they roll across heightfield vertices.</li>
-		<li>Added adjacency information for getTriangle with height fields.</li>
-	</ul>
+			<blockquote>
+				Please also see the previous lists <a href="#knownissues321"> from 3.2.1</a> and earlier.
+			</blockquote>
 
-	<h4>Particles</h4>
-	<ul>
-		<li>Fixed triangle mesh shapes with ePARTICLE_DRAIN colliding with particles.</li>
-		<li>Fixed crash with GPU particles when a lot of grid cells get occupied and vacated within one time step.</li>
-	</ul>
 
-	<h4>Character Controller</h4>
-	<ul>
-		<li>The PxControllerBehaviorFlag::eCCT_USER_DEFINED_RIDE flag has been added.</li>
-		<li>Fixed character controller not walking up steps properly if they are not exactly 90 degrees vertical.</li>
-		<li>Fixed a bug where the character controller was not able to move up slopes when using a small step offset setting.</li>
-		<li>Fixed a bug where the character controller could rise off the ground when blocked by a step.</li>
-		<li>Fixed a bug where the character controller could rise off the ground when hitting another character controller.</li>
-		<li>Fixed a bug where releasing a shape of a touching actor and then releasing the character controller would crash. Releasing shapes of actors in touch may still lead to crashes in other situations. PxController::invalidateCache() can be used to work around these situations.</li>
-	</ul>
-	<h4> CCD </h4>
-	<ul>
-		<li>Fixed culling bug in CCD sweeps with meshes with transforms that caused contacts to be missed </li>
-	</ul>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<br>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-	<h4>Vehicles</h4>
-	<ul>
-		<li>The vehicle sdk used to make the assumption that wheels 0 and 1 (the front wheels) of a PxVehicleDrive4W responded positively to the input steering wheel angle, while wheels 2 and 3 (the rear wheels) responded in the opposite direction to that of the input steering wheel angle.  A consequence of this assumed behaviour was the restriction that PxVehicleWheelData::mMaxSteer was limited to positive values.  This restriction has now been relaxed with the caveat that PxVehicleWheelData::mMaxSteer must be in range (-Pi/2, Pi/2).  It is now possible to turn each wheel positively or negatively relative to the input steering wheel angle by choosing positive or negative values for PxVehicleWheelData::mMaxSteer.  Ackermann steer correction might result in the front and rear wheels competing against each other if the rear and front all steer in the same direction relative to the input steering wheel angle. If this is the case it will be necessary to set the Ackermann accuracy to zero.</li>
-		<li>It is now possible to set the engine rotation speed (PxVehicleDriveDynData::setEngineRotationSpeed), the rotation speed of each wheel (PxVehicleWheelsDynData::setWheelRotationSpeed) and the rotation angle of each wheel (PxVehicleWheelsDynData::setWheelRotationAngle). The default values for each of these properties remains zero.</li>
-		<li>Wheel contact reporting has been improved with the addition of a number of query functions to the PxVehicleWheelsDynData class.  These are getTireDrivableSurfaceContactPoint, getTireDrivableSurfaceContactNormal, getTireLongitudinalDir, getTireLateralDir, getSuspensionForce, getTireDrivableSurfaceShape.</li>
-		<li>It is now possible to store a userData pointer per wheel.  This allows, for example, each wheel to be associated with a game object.  The relevant functions are PxVehicleWheelsDynData::setUserData and PxVehicleWheelsDynData::getUserData.</li>
-		<li>The default behavior of PxVehicleWheels::setWheelShapeMapping has changed. Previously, default values were automatically assigned to each wheel at construction so that the ith wheel was mapped to the ith body shape.  This, however, made the assumption that there was a wheel shape for each wheel, which is not always true.  As a consequence, the default value is now -1, meaning any mapping between body shape and wheel has to be explictily made by calling setWheelShapeMapping.</li>
-		<li>It is now possible to query the mapping between wheel and body shape with PxVehicleWheels::getWheelShapeMapping.</li>
-		<li>It is now possible to query the tire shader data that has been applied to each wheel with PxVehicleWheelsDynData::getTireForceShaderData.</li>
-		<li>The helper function PxVehicleComputeSprungMasses has been added to aid the setup of the suspension sprung masses from the rigid body centre of mass and wheel center coordinates.</li>
-		<li>The scene query filter data applied to the suspension raycasts was previously taken from the filter data of the associated body shape.  This makes the assumption of a body shape per wheel, which is not always true.  As a consequence, the filter data must be explictly set per wheel by calling PxVehicleWheels::setSceneQueryFilterData.  The filter data can be queried with PxVehicleWheels::getSceneQueryFilterData.</li>
-		<li>Sub-stepping of the vehicle update can now be applied per vehicle with PxVehicleWheelsSimData::setSubStepCount.</li>
-		<li>PxVehicleDrivableSurfaceToTireFrictionPairs has been modified so that the dictionary of material pointers can be updated without the necessity of further allocation.  The create function has been replaced with separate allocate and setup functions.</li>
-		<li>A new vehicle type PxVehicleNoDrive has been added to provide a close approximation to backwards compatibility with the api of the 2.8.x NxWheelShape.</li>
-	</ul>
 
+			<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2.1</h1>
 
-	<h4>Visual Remote Debugger</h4>
-	<ul>
-		<li>Added PVD compatible profile zones for batched queries.</li>
-		<li>Added the ability to capture and inspect scene queries in PVD.</li>
-		<li>SDK will now flush the pvd connection stream immediately after cloth or cloth fabric is created or released.</li>
-        <li>Fixed the PVD support for articulations.</li>
-        <li>Fixed PVD rendering wrong constraint limits.</li>
-	</ul>
+			<h2 style="TEXT-ALIGN: center">June 2012</h2>
 
+			<h2>What's New In NVIDIA PhysX 3.2.1</h2>
 
-	<h4>Documentation</h4>
-	<ul>
-		<li>Wrong statement in PxRigidStatic::release() has been corrected. Static rigid bodies do wake up touching dynamic rigid bodies on release.</li>
-		<li>Wrong statement in PxShape::setFlag() has been corrected. It is a valid operation to clear all flags.</li>
-		<li>Retroactively added more detail about changes to 3.2.1 release notes below.</li>
-	</ul>
-</blockquote>
-
-<h2><br><a name="platforms322">Supported Platforms</a></h2>
-
-<blockquote>
-    <h4>Runtime</h4>
-    <ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver ver 295.73 or later is required for GPU acceleration)</li>
-		<li>Microsoft Windows RT (formerly known as Windows on ARM) (SDK only, no samples yet)</li>
-        <li>Microsoft XBox 360</li>
-        <li>Sony Playstation 3</li>
-    </ul>
-    <h4>Development</h4>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2008, 2010</li>
-		<li>Xcode 4.2</li>
-    </ul>
-</blockquote>
-
-
-
-
-<h2><br>
-<a name="knownissues322">Known Issues And Limitations</a></h2>
-
-<blockquote>
-	<h4>General</h4>
-	<ul>
-	    <li>Memory leaks might get reported when using the debug versions of malloc and free together with the debug version of PhysX on Microsoft Windows platforms with Visual Studio 2010. This is caused by a bug in the Visual Studio 2010 runtime libraries. If such a leak appears immediately after releasing PhysX and all its components, please contact us for information about workarounds.</li>
-		<li>Use of articulations may require an increase of 64K in the stack size for threads making API calls and engine worker threads</li>
-	</ul>
-</blockquote>
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li><i>Note: PS3 platform specific changes can be found in the PS3 readme file.</i></li>
+					<li>Added GRB hooks for APEX 1.2.1.</li>
+					<li>Some incorrect usages of __restrict have been fixed.</li>
+					<li>A crash when the user's code had FPU exceptions enabled has been fixed.</li>
+					<li>A rare crash on Win64 has been fixed.</li>
+					<li>Removed no longer needed RapidXML library from distribution.</li>
+					<li>Binary serialization can now save the names of actors and shapes.</li>
+					<li>Removed a RepXUtility.h reinterpret_cast compile warning that happened on some platforms.</li>
+					<li>Fixed a spurious overlapping read/write error report when using simulation call backs in checked builds.</li>
+					<li>Fixed a bug in PxBinaryConverter when processing PxConvexMesh and PxMaterial assets. </li>
+					<li>Fixed bug in implementations of array accessors (PxPhysics::getScenes(), GuMeshFactory::getTriangleMeshes(), GuMeshFactory::getConvexMeshes(), GuMeshFactory::getHeightFields(), PxAggregate::getActors(), PxScene::getAggregates(), PxScene::getArticulations(), PxScene::getConstraints()  ) when startIndex > bufferSize.</li>
+					<li>Reduced the number of libraries provided for game consoles. The following core libraries are included in the PhysX3 library and are not available separately anymore: LowLevel, LowLevelCloth, PhysX3Common, PhysXProfileSDK, PhysXVisualDebuggerSDK, PvdRuntime, PxTask, SceneQuery, SimulationController.</li>
+				</ul>
 
-<blockquote>
-	Please also see the previous lists <a href="#knownissues321"> from 3.2.1</a> and earlier.
-</blockquote>
+				<h4>Documentation</h4>
+				<ul>
+					<li>Clarified documentation regarding use of eSEND_SLEEP_NOTIFIES flag.</li>
+					<li>Clarified documentation regarding using different CRT libraries.</li>
+					<li>Removed some confusing statements about meta data from the documentation.</li>
+					<li>Updated PsPool, PsSort so they can use the user allocator.</li>
+				</ul>
 
+				<h4>Mesh Cooking</h4>
+				<ul>
+					<li>A warning message about negative volumes in the convex mesh cooker could cause a crash.</li>
+					<li>Fixed failure to create valid convex hull in cooking when using PxConvexFlag::eINFLATE_CONVEX.</li>
+					<li>The convex mesh cooking has been made more robust and properly outputs some error messages that were previously missing.</li>
+					<li>Fixed crash bug in x64 version of ClothEdgeQuadifier.</li>
+					<li>Adjacency information option for TriangleMeshes.  This is controlled using the PxCookingParams::buildTriangleAdjacencies flag, and returned if available by PxMeshQuery::getTriangle().</li>
+				</ul>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+				<h4>Broad Phase</h4>
+				<ul>
+					<li>The sdk gracefully handles the case of more than 65536 broadphase pairs and reports a warning that some contacts will be dropped in the event that this limit is exceeded.  This affects all platforms except win32/win64/linux/linux64, which support a maximum number of 4294967296 pairs.</li>
+				</ul>
 
+				<h4>Collision Detection</h4>
+				<ul>
+					<li>Fixed a memory corruption bug in heightfield code.</li>
+					<li>Fixed a bug in sphere vs mesh contact generation that could result in bad normals.</li>
+					<li>Added a flag to enable or disable contact caching, to permit users to make a performance vs memory tradeoff.</li>
+					<li>Fixed a crash bug in ContactReport cleanup code.</li>
+				</ul>
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2.1</h1>
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>The simultaneous raising of both the PxShapeFlag::eSIMULATION_SHAPE and PxShapeFlag::eTRIGGER_SHAPE flags is now explicitly forbidden by the sdk.  If any of the two is raised then any attempt to raise the other is rejected by the sdk and an error is passed to the error stream. </li>
+				</ul>
 
-<h2 style="TEXT-ALIGN: center">June 2012</h2>
+				<h4>Articulations</h4>
+				<ul>
+					<li>The API stubbed in 3.2.0 for applying an impulse to an entire articulation is now implemented.</li>
+				</ul>
 
-<h2>What's New In NVIDIA PhysX 3.2.1</h2>
+				<h4>GPU Physics</h4>
+				<ul>
+					<li>Much more specific error messages for CUDA compute capability related failures.</li>
+					<li>Added SM35 support for GK110 GPUs.</li>
+					<li>The CUDA contexts provided by the gpu dispatcher can now be shared across scenes.</li>
+					<li>Fixed a possible out of bounds array access in GPU particle collision code.</li>
+				</ul>
 
-<blockquote>
-	<h4>General</h4>
-	<ul>
-		<li><i>Note: PS3 platform specific changes can be found in the PS3 readme file.</i></li>
-		<li>Added GRB hooks for APEX 1.2.1.</li>
-		<li>Some incorrect usages of __restrict have been fixed.</li>
-		<li>A crash when the user's code had FPU exceptions enabled has been fixed.</li>
-		<li>A rare crash on Win64 has been fixed.</li>
-		<li>Removed no longer needed RapidXML library from distribution.</li>
-		<li>Binary serialization can now save the names of actors and shapes.</li>
-		<li>Removed a RepXUtility.h reinterpret_cast compile warning that happened on some platforms.</li>
-		<li>Fixed a spurious overlapping read/write error report when using simulation call backs in checked builds.</li>
-		<li>Fixed a bug in PxBinaryConverter when processing PxConvexMesh and PxMaterial assets. </li>
-		<li>Fixed bug in implementations of array accessors (PxPhysics::getScenes(), GuMeshFactory::getTriangleMeshes(), GuMeshFactory::getConvexMeshes(), GuMeshFactory::getHeightFields(), PxAggregate::getActors(), PxScene::getAggregates(), PxScene::getArticulations(), PxScene::getConstraints()  ) when startIndex > bufferSize.</li>
-		<li>Reduced the number of libraries provided for game consoles. The following core libraries are included in the PhysX3 library and are not available separately anymore: LowLevel, LowLevelCloth, PhysX3Common, PhysXProfileSDK, PhysXVisualDebuggerSDK, PvdRuntime, PxTask, SceneQuery, SimulationController.</li>
-	</ul>
 
-	<h4>Documentation</h4>
-	<ul>
-		<li>Clarified documentation regarding use of eSEND_SLEEP_NOTIFIES flag.</li>
-		<li>Clarified documentation regarding using different CRT libraries.</li>
-		<li>Removed some confusing statements about meta data from the documentation.</li>
-		<li>Updated PsPool, PsSort so they can use the user allocator.</li>
-	</ul>
+				<h4>Scene Queries</h4>
+				<ul>
+					<li>Resolved poor GJK sweep convergence.</li>
+					<li>Batched queries accept sphere geometry for sweeps.</li>
+					<li>Optimized performance of raycasts.</li>
+					<li>An internal limit of 65536 objects in the scene-query subsytem has been lifted.</li>
+					<li>Fixed a bug where raycasts that sliced through two adjoining heightfields did not return correct results.</li>
+					<li>Fixed a crash bug in PxFindOverlapTriangleMeshUtil when doing a sphere overlap query against a heightfield.</li>
+				</ul>
 
-	<h4>Mesh Cooking</h4>
-	<ul>
-		<li>A warning message about negative volumes in the convex mesh cooker could cause a crash.</li>
-		<li>Fixed failure to create valid convex hull in cooking when using PxConvexFlag::eINFLATE_CONVEX.</li>
-		<li>The convex mesh cooking has been made more robust and properly outputs some error messages that were previously missing.</li>
-		<li>Fixed crash bug in x64 version of ClothEdgeQuadifier.</li>
-		<li>Adjacency information option for TriangleMeshes.  This is controlled using the PxCookingParams::buildTriangleAdjacencies flag, and returned if available by PxMeshQuery::getTriangle().</li>
-	</ul>
 
-	<h4>Broad Phase</h4>
-	<ul>
-		<li>The sdk gracefully handles the case of more than 65536 broadphase pairs and reports a warning that some contacts will be dropped in the event that this limit is exceeded.  This affects all platforms except win32/win64/linux/linux64, which support a maximum number of 4294967296 pairs.</li>
-	</ul>
+				<h4>Cloth</h4>
+				<ul>
+					<li>PxCloth::setMotionConstraints() now works with NULL parameter.</li>
+				</ul>
 
-	<h4>Collision Detection</h4>
-	<ul>
-		<li>Fixed a memory corruption bug in heightfield code.</li>
-		<li>Fixed a bug in sphere vs mesh contact generation that could result in bad normals.</li>
-		<li>Added a flag to enable or disable contact caching, to permit users to make a performance vs memory tradeoff.</li>
-		<li>Fixed a crash bug in ContactReport cleanup code.</li>
-	</ul>
+				<h4>Character Controller</h4>
+				<ul>
+					<li>PhysX CCT code no longer sets PxShape::userData.  </li>
+					<li>Intersection of pairs of CCTs now uses the supplied filter data and the supplied callback prefilter.  The callback postfilter is not yet hooked up.</li>
+					<li>A bug has been fixed whereby the filterData was ignored in one of the scene queries initiated by the PhysX CCT code.</li>
+					<li>Introduced a more automatic mechanism for invelidating the character controller's scene cache.  As part of this, PxController::reportSceneChanged() was replaced with PxController::invalidateCache().</li>
+					<li>Added helper functions PxController::get/setFootPosition() to let user specify the bottom point of the character controller, rather than the center.</li>
+					<li>A new helper function, PxController::resize(), has been added to facilitate character controller resizing.</li>
+					<li>PxControllerBehaviorFlag::eCCT_CAN_RIDE_ON_OBJECT is not the default behavior anymore.</li>
+					<li>The slope limit is only observed when walking on static convex meshes, static triangle meshes, static boxes and static heightfields.  The slope limit is not observed when walking on dynamic or kinematic rigid bodies or static capsules or static spheres.  This partially fixes a bug where the slope limit was inadvertently considered for shapes attached to dynamic rigid bodies and inadvertently ignored for boxes attached to static shapes.</li>
+				</ul>
 
-	<h4>Rigid Bodies</h4>
-	<ul>
-		<li>The simultaneous raising of both the PxShapeFlag::eSIMULATION_SHAPE and PxShapeFlag::eTRIGGER_SHAPE flags is now explicitly forbidden by the sdk.  If any of the two is raised then any attempt to raise the other is rejected by the sdk and an error is passed to the error stream. </li>
-	</ul>
+				<h4>Vehicles</h4>
+				<ul>
+					<li>The vehicle sdk now reports an error to the error stream and exits from PxVehicleUpates if PxInitVehicleSdk has not been called in advance.  This avoids a divide-by-zero error that can arise if the vehicle sdk has not been initialised correctly.</li>
+				</ul>
 
-	<h4>Articulations</h4>
-	<ul>
-		<li>The API stubbed in 3.2.0 for applying an impulse to an entire articulation is now implemented.</li>
-	</ul>
+				<h4>Visual Debugger</h4>
+				<ul>
+					<li>Releasing of cloth fabrics is reported to the VRD.</li>
+					<li>Ext::Joint::setActors() call now reported to PVD.</li>
+					<li>Fixed crash bug when removing an aggregate containing a PxArticulation, while PVD is running.</li>
+				</ul>
+			</blockquote>
 
-	<h4>GPU Physics</h4>
-	<ul>
-		<li>Much more specific error messages for CUDA compute capability related failures.</li>
-		<li>Added SM35 support for GK110 GPUs.</li>
-		<li>The CUDA contexts provided by the gpu dispatcher can now be shared across scenes.</li>
-		<li>Fixed a possible out of bounds array access in GPU particle collision code.</li>
-	</ul>
+			<h2><br>Supported Platforms</h2>
 
+			<blockquote>
+				Unchanged from <a href="#platforms320"> from 3.2</a>.
+			</blockquote>
 
-	<h4>Scene Queries</h4>
-	<ul>
-		<li>Resolved poor GJK sweep convergence.</li>
-		<li>Batched queries accept sphere geometry for sweeps.</li>
-		<li>Optimized performance of raycasts.</li>
-		<li>An internal limit of 65536 objects in the scene-query subsytem has been lifted.</li>
-		<li>Fixed a bug where raycasts that sliced through two adjoining heightfields did not return correct results.</li>
-		<li>Fixed a crash bug in PxFindOverlapTriangleMeshUtil when doing a sphere overlap query against a heightfield.</li>
-	</ul>
+			<h2><br><a name="knownissues321">Known Issues And Limitations</a></h2>
 
+			<blockquote>
+				Unchanged from <a href="#knownissues320"> from 3.2</a>.
+			</blockquote>
 
-	<h4>Cloth</h4>
-	<ul>
-		<li>PxCloth::setMotionConstraints() now works with NULL parameter.</li>
-	</ul>
 
-	<h4>Character Controller</h4>
-	<ul>
-		<li>PhysX CCT code no longer sets PxShape::userData.  </li>
-		<li>Intersection of pairs of CCTs now uses the supplied filter data and the supplied callback prefilter.  The callback postfilter is not yet hooked up.</li>
-		<li>A bug has been fixed whereby the filterData was ignored in one of the scene queries initiated by the PhysX CCT code.</li>
-		<li>Introduced a more automatic mechanism for invelidating the character controller's scene cache.  As part of this, PxController::reportSceneChanged() was replaced with PxController::invalidateCache().</li>
-		<li>Added helper functions PxController::get/setFootPosition() to let user specify the bottom point of the character controller, rather than the center.</li>
-		<li>A new helper function, PxController::resize(), has been added to facilitate character controller resizing.</li>
-		<li>PxControllerBehaviorFlag::eCCT_CAN_RIDE_ON_OBJECT is not the default behavior anymore.</li>
-		<li>The slope limit is only observed when walking on static convex meshes, static triangle meshes, static boxes and static heightfields.  The slope limit is not observed when walking on dynamic or kinematic rigid bodies or static capsules or static spheres.  This partially fixes a bug where the slope limit was inadvertently considered for shapes attached to dynamic rigid bodies and inadvertently ignored for boxes attached to static shapes.</li>
-	</ul>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<br>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-	<h4>Vehicles</h4>
-	<ul>
-		<li>The vehicle sdk now reports an error to the error stream and exits from PxVehicleUpates if PxInitVehicleSdk has not been called in advance.  This avoids a divide-by-zero error that can arise if the vehicle sdk has not been initialised correctly.</li>
-	</ul>
 
-	<h4>Visual Debugger</h4>
-	<ul>
-		<li>Releasing of cloth fabrics is reported to the VRD.</li>
-		<li>Ext::Joint::setActors() call now reported to PVD.</li>
-		<li>Fixed crash bug when removing an aggregate containing a PxArticulation, while PVD is running.</li>
-	</ul>
-</blockquote>
+			<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2</h1>
 
-<h2><br>Supported Platforms</h2>
+			<h2 style="TEXT-ALIGN: center">December 2011</h2>
 
-<blockquote>
-	Unchanged from <a href="#platforms320"> from 3.2</a>.
-</blockquote>
+			<h2>What's New In NVIDIA PhysX 3.2</h2>
 
-<h2><br>
-<a name="knownissues321">Known Issues And Limitations</a></h2>
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li>Three new sample applications:  SampleCharacterCloth (character with cloth cape and cloth flags), SampleBridges (character controller walking on dynamic bridges and moving platforms), SampleCustomGravity (character controller with arbitrary up vector).</li>
+					<li>On Windows, the PxFoundation instance is now a process wide singleton and part of the new PhysX3Common.dll library</li>
+					<li>PxCreatePhysics() does not create a PxFoundation instance any longer. The PxFoundation instance has to be created in advance through PxCreateFoundation().</li>
+					<li>Calls to PxCreatePhysics() are not valid anymore if a PxPhysics instance already exists.</li>
+					<li>If profiling information should be sent to the PhysX Visual Debugger, a PxProfileZoneManager instance has to be provided when creating the PxPhysics instance.</li>
+					<li>The version number constant PX_PUBLIC_FOUNDATION_VERSION has been replaced with PX_PHYSICS_VERSION. Both PxFoundation and PxPhysics use the same version number now.</li>
+					<li>The API now distinguishes between input and output stream types.</li>
+					<li>Added mechanism to reduce code size by not linking optional components.  See PxCreateBasePhysics() and the PxRegister*() functions.</li>
+					<li>Added getConcreteTypeName() to API classes to provide run time type information.</li>
+					<li>Added PxScene::getTimestamp() to retrieve the simulation counter.</li>
+					<li>PxGetFoundation has been moved to PxGetFoundation.h</li>
+					<li>Changed the functions PxPhysics::releaseUserReferences(), releaseCollection(), addCollection() and releaseCollected() to now take a reference rather than a pointer.</li>
+					<li>The signature of PxCreatePhysics has changed:  The Foundation SDK instance must be passed in explicitly.  One can also hook profiling information by passing a PxProfileZoneManager.</li>
+					<li>Platform conversion for serialized data has been moved from the ConvX command line tool to the PxBinaryConverter interface in the cooking library</li>
+					<li>contact data block allocation now provides statistics on usage and max usage</li>
+					<li>On all platforms except PS3, contact data blocks can be progressively allocated</li>
+					<li>PxExtensionVisualDebugger has been renamed to PxVisualDebuggerExt, PxExtensionsConnectionType renamed to PxVisualDebuggerConnectionFlag</li>
+					<li>Default implementations of memory and file streams added in PxDefaultStreams.h</li>
+					<li>Renamed PxPhysics::getMetaData() to ::PxGetSDKMetaData().</li>
+					<li>Scene::simulate() now takes a memory block which is used for allocation of temporary data during simulation</li>
+					<li>On Windows, CudaContextManagerDesc support appGUID now. It only works on release build. If your application employs PhysX modules that use CUDA you need to use a GUID so that patches for new architectures can be released for your game.You can obtain a GUID for your application from Nvidia.</li>
+				</ul>
 
-<blockquote>
-	Unchanged from <a href="#knownissues320"> from 3.2</a>.
-</blockquote>
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>Introduced a new contact generation mode, see eENABLE_PCM.  Note that this is an experimental feature that still shows simulation artifacts in some scenarios.</li>
+					<li>Introduced two new friction simulation modes, see eENABLE_ONE_DIRECTIONAL_FRICTION and eENABLE_TWO_DIRECTIONAL_FRICTION.</li>
+					<li>Introduced a new scene query flag PxSceneQueryFlag::eINITIAL_OVERLAP_KEEP to control how initial overhaps are treated in scene queries.</li>
+					<li>Per-triangle materials have been implemented.</li>
+					<li>Changes to material properties are automatically reflected in contact resolution.</li>
+					<li>New helper methods to compute mass properties for a dynamic rigid body taking per shape density/mass values into account (see documentation on PxRigidBodyExt for details).</li>
+					<li>A new set of methods for overlap, sweep and raycast tests based on PxGeometry objects has been introduced. See documentation on PxMeshQuery and PxGeometryQuery for details).</li>
+					<li>
+						The contact report API has changed (for details see the documentation on PxSimulationEventCallback::onContact()). Among the changes are:
+						<ul>
+							<li>Reports only get sent for shape pairs which request them. Previously, reports were sent for an actor pair even if the requested shape pair event was not triggered (for example because other shapes of the same actors started colliding etc.)</li>
+							<li>The following PxPairFlags have been removed eNOTIFY_CONTACT_FORCES, eNOTIFY_CONTACT_FORCE_PER_POINT, eNOTIFY_CONTACT_FEATURE_INDICES_PER_POINT. Forces and feature indices are now always provided if applicable.</li>
+							<li>It is much easier now to skip shape pairs or contact point information when traversing the contact report data.</li>
+							<li>The memory footprint of contact reports has been reduced.</li>
+						</ul>
+					</li>
+					<li>The members featureIndex0/1 of PxContactPoint have been renamed to internalFaceIndex0/1 for consistency.</li>
+					<li>For trigger reports, the eNOTIFY_TOUCH_PERSISTS event has been deprecated and will be removed in the next release. For performance and flexibility reasons it is recommended to use eNOTIFY_TOUCH_FOUND and eNOTIFY_TOUCH_LOST only and manage the persistent state separately.</li>
+					<li>Added PxConstraintVisualizer interface and code to visualize joint frames and limits.</li>
+					<li>Improved PxBatchQuery API.</li>
+					<li>PxPhysics::getProfileZoneManager() now returns a pointer rather than a reference.</li>
+					<li>PxRigidDynamic::moveKinematic() has been renamed to setKinematicTarget() to underline its precise semantics.</li>
+					<li>Added new function PxShape::getGeometry and class PxGeometryHolder to improve Geometry APIs.</li>
+					<li>PxCreatePlane now takes a PxPlane equation as a parameter. Note that the interpretation of the distance value is negated relative to 3.1</li>
+					<li>Added new actor creation helpers PxCloneStatic, PxCloneDynamic, PxScaleActor.</li>
+					<li>Added new functions PxTransformFromSegment, PxTransformFromPlaneEquation to simplify creation of planes and capsules.</li>
+					<li>added PxJoint::getConstraint() to access the underlying constraint object, from which the constraint force can be read</li>
+					<li>
+						Some methods of PxAggregate have been renamed for consistency or replaced for extended functionality.
+						<ul>
+							<li>getMaxSize() is now called getMaxNbActors().</li>
+							<li>getCurrentSize() is now called getNbActors().</li>
+							<li>getActor() has been replaced by getActors() which copies the actor pointers to a user buffer.</li>
+						</ul>
+					</li>
+					<li>Added support for kinematic triangle meshes, planes and heighfields.</li>
+				</ul>
 
+				<h4>Scene queries</h4>
+				<ul>
+					<li>Dynamic AABBTree has been set as the default dynamic pruning structure.</li>
+				</ul>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+				<h4>Particles</h4>
+				<ul>
+					<li>Removed descriptors from particle API: The properties maxParticles and PxParticleBaseFlag::ePER_PARTICLE_REST_OFFSET need to be specified when calling PxPhysics::createParticleSystem() and createParticleFluid(). All other properties can be adjusted after creation through set methods.</li>
+				</ul>
 
+				<h4>Cloth</h4>
+				<ul>
+					<li>Added convex collision shapes, see PxCloth::addCollisionConvex()</li>
+					<li>Added friction support, see PxCloth::setFrictionCoefficient()</li>
+					<li>Added angle based bending constraints, see PxClothPhaseSolverConfig::SolverType::eBENDING</li>
+					<li>Added separation constraints, a spherical volume that particles should stay outside of, see PxCloth::setSeparationConstraints()</li>
+					<li>Added drag, see PxCloth::setDragCoefficient()</li>
+					<li>Added inertia scaling, controls how much movement due to PxCloth::setTargetPose() will affect the cloth</li>
+					<li>Added support for setting particle previous positions, see PxCloth::setParticles()</li>
+					<li>Added controls for scaling particle mass during collision, this can help reduce edge stretching around joints on characters, see PxCloth::setCollisionMassScale()</li>
+					<li>Particle data is now copied asynchronously from the GPU after simulate (rather than on demand)</li>
+					<li>Improved fabric layout, you can now share fabric data across multiple phases to reduce memory usage, see PxClothFabric</li>
+					<li>Fixed bug in collision when capsules are tapered at a slope > 1</li>
+				</ul>
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.2</h1>
+				<h4>Vehicles</h4>
+				<ul>
+					<li>Added PxVehicleDriveTank, a vehicle class that enables tank behaviors.</li>
+					<li>Support for vehicles with more than 4 wheels, see PxVehicleDrive4W, PxVehicleDriveTank.</li>
+					<li>Significant refactor of vehicle api to allow further types of vehicle to be added.</li>
+					<li>Americal English spelling used in vehicle api.</li>
+					<li>PxVehicle4W replaced with PxVehicleDrive4W, PxVehicle4WSimulationData replaced with PxVehicleDriveSimData4W.</li>
+					<li>Removal of scene query helper functions and structs: PxVehicle4WSceneQueryData, PxVehicle4WSetUpSceneQuery, PxWheelRaycastPreFilter, PxSetupDrivableShapeQueryFilterData, PxSetupNonDrivableShapeQueryFilterData, PxSetupVehicleShapeQueryFilterData. See SampleVehicle_SceneQuery.h for their implementation in SampleVehicle.</li>
+					<li>PxVehicle4WSimpleSetup and PxCreateVehicle4WSimulationData have been removed and replaced with default values in vehicle components, see PxVehicleComponents.h.</li>
+					<li>PxVehicle4WTelemetryData has been replaced with PxVehicleTelemetryData, a class that supports vehicles with any number of wheels, see PxVehicleTelemetryData</li>
+					<li>PxVehicleDrivableSurfaceType no longer stored in PxMaterial::userData.  A hash table of PxMaterial pointers is instead used to associate each PxMaterial with a PxVehicleDrivableSurfaceType, see PxVehicleDrivableSurfaceToTireFrictionPairs.</li>
+					<li>PxVehicleTyreData::mLongitudinalStiffness has been replaced with PxVehicleTireData::mLongitudinalStiffnessPerUnitGravity, see PxVehicleTireData.</li>
+					<li>Tire forces now computed from a shader to allow user-specified tire force functions, see PxVehicleTireForceCalculator.</li>
+					<li>Added helper functions to quickly configure 3-wheeled cars, see PxVehicle4WEnable3WTadpoleMode, PxVehicle4WEnable3WDeltaMode.</li>
+				</ul>
 
-<h2 style="TEXT-ALIGN: center">December 2011</h2>
+				<h4>Serialization</h4>
+				<ul>
+					<li>Changed the functions PxPhysics::releaseUserReferences(), releaseCollection(), addCollection() and releaseCollected() to now take a reference rather than a pointer.</li>
+					<li>Platform conversion for serialized data has been moved from the ConvX command line tool to the PxBinaryConverter interface in the cooking library.</li>
+					<li>Changed some functions in RepXUtility.h and RepX.h to take a reference rather than a pointer.</li>
+				</ul>
 
-<h2>What's New In NVIDIA PhysX 3.2</h2>
 
-<blockquote>
-	<h4>General</h4>
-	<ul>
-		<li>Three new sample applications:  SampleCharacterCloth (character with cloth cape and cloth flags), SampleBridges (character controller walking on dynamic bridges and moving platforms), SampleCustomGravity (character controller with arbitrary up vector).</li>
-		<li>On Windows, the PxFoundation instance is now a process wide singleton and part of the new PhysX3Common.dll library</li>
-		<li>PxCreatePhysics() does not create a PxFoundation instance any longer. The PxFoundation instance has to be created in advance through PxCreateFoundation().</li>
-		<li>Calls to PxCreatePhysics() are not valid anymore if a PxPhysics instance already exists.</li>
-		<li>If profiling information should be sent to the PhysX Visual Debugger, a PxProfileZoneManager instance has to be provided when creating the PxPhysics instance.</li>
-		<li>The version number constant PX_PUBLIC_FOUNDATION_VERSION has been replaced with PX_PHYSICS_VERSION. Both PxFoundation and PxPhysics use the same version number now.</li>
-		<li>The API now distinguishes between input and output stream types.</li>
-		<li>Added mechanism to reduce code size by not linking optional components.  See PxCreateBasePhysics() and the PxRegister*() functions.</li>
-		<li>Added getConcreteTypeName() to API classes to provide run time type information.</li>
-		<li>Added PxScene::getTimestamp() to retrieve the simulation counter.</li>
-		<li>PxGetFoundation has been moved to PxGetFoundation.h</li>
-		<li>Changed the functions PxPhysics::releaseUserReferences(), releaseCollection(), addCollection() and releaseCollected() to now take a reference rather than a pointer.</li>
-		<li>The signature of PxCreatePhysics has changed:  The Foundation SDK instance must be passed in explicitly.  One can also hook profiling information by passing a PxProfileZoneManager.</li>
-		<li>Platform conversion for serialized data has been moved from the ConvX command line tool to the PxBinaryConverter interface in the cooking library</li>
-		<li>contact data block allocation now provides statistics on usage and max usage</li>
-		<li>On all platforms except PS3, contact data blocks can be progressively allocated</li>
-		<li>PxExtensionVisualDebugger has been renamed to PxVisualDebuggerExt, PxExtensionsConnectionType renamed to PxVisualDebuggerConnectionFlag</li>
-		<li>Default implementations of memory and file streams added in PxDefaultStreams.h</li>
-		<li>Renamed PxPhysics::getMetaData() to ::PxGetSDKMetaData().</li>
-		<li>Scene::simulate() now takes a memory block which is used for allocation of temporary data during simulation</li>
-		<li>On Windows, CudaContextManagerDesc support appGUID now. It only works on release build. If your application employs PhysX modules that use CUDA you need to use a GUID so that patches for new architectures can be released for your game.You can obtain a GUID for your application from Nvidia.</li>
-	</ul>
+				<h4>What we removed:</h4>
+				<ul>
+					<li>Deformables have been removed. Use the optimized solution for clothing simulation instead (see documentation on PxCloth for details).</li>
+					<li>PxSweepCache was replaced with PxVolumeCache.</li>
+					<li>PVD is no longer enabled in the release build.</li>
+					<li>Removed anisotropic friction.</li>
+					<li>Removed the CCD mode eSWEPT_CONTACT_PAIRS.</li>
+					<li>PxActorDesc has been removed.</li>
+					<li>The ConvX tool has been removed.</li>
+					<li>Removed empty default implementations of functions in PxSimulationEventCallback for consistency and because it can create bugs in user code if function prototypes change between releases.  Users must now supply (eventually blank) implementations for all functions.</li>
+					<li>Octree and quadtree pruning structures have been removed.</li>
+				</ul>
 
-	<h4>Rigid Bodies</h4>
-	<ul>
-		<li>Introduced a new contact generation mode, see eENABLE_PCM.  Note that this is an experimental feature that still shows simulation artifacts in some scenarios.</li>
-		<li>Introduced two new friction simulation modes, see eENABLE_ONE_DIRECTIONAL_FRICTION and eENABLE_TWO_DIRECTIONAL_FRICTION.</li>
-		<li>Introduced a new scene query flag PxSceneQueryFlag::eINITIAL_OVERLAP_KEEP to control how initial overhaps are treated in scene queries.</li>
-		<li>Per-triangle materials have been implemented.</li>
-		<li>Changes to material properties are automatically reflected in contact resolution.</li>
-		<li>New helper methods to compute mass properties for a dynamic rigid body taking per shape density/mass values into account (see documentation on PxRigidBodyExt for details).</li>
-		<li>A new set of methods for overlap, sweep and raycast tests based on PxGeometry objects has been introduced. See documentation on PxMeshQuery and PxGeometryQuery for details).</li>
-		<li>The contact report API has changed (for details see the documentation on PxSimulationEventCallback::onContact()). Among the changes are:
-		<ul>
-			<li>Reports only get sent for shape pairs which request them. Previously, reports were sent for an actor pair even if the requested shape pair event was not triggered (for example because other shapes of the same actors started colliding etc.)</li>
-			<li>The following PxPairFlags have been removed eNOTIFY_CONTACT_FORCES, eNOTIFY_CONTACT_FORCE_PER_POINT, eNOTIFY_CONTACT_FEATURE_INDICES_PER_POINT. Forces and feature indices are now always provided if applicable.</li>
-			<li>It is much easier now to skip shape pairs or contact point information when traversing the contact report data.</li>
-			<li>The memory footprint of contact reports has been reduced.</li>
-		</ul>
-		</li>
-		<li>The members featureIndex0/1 of PxContactPoint have been renamed to internalFaceIndex0/1 for consistency.</li>
-		<li>For trigger reports, the eNOTIFY_TOUCH_PERSISTS event has been deprecated and will be removed in the next release. For performance and flexibility reasons it is recommended to use eNOTIFY_TOUCH_FOUND and eNOTIFY_TOUCH_LOST only and manage the persistent state separately.</li>
-		<li>Added PxConstraintVisualizer interface and code to visualize joint frames and limits.</li>
-		<li>Improved PxBatchQuery API.</li>
-		<li>PxPhysics::getProfileZoneManager() now returns a pointer rather than a reference.</li>
-		<li>PxRigidDynamic::moveKinematic() has been renamed to setKinematicTarget() to underline its precise semantics.</li>
-		<li>Added new function PxShape::getGeometry and class PxGeometryHolder to improve Geometry APIs.</li>
-		<li>PxCreatePlane now takes a PxPlane equation as a parameter. Note that the interpretation of the distance value is negated relative to 3.1</li>
-		<li>Added new actor creation helpers PxCloneStatic, PxCloneDynamic, PxScaleActor.</li>
-		<li>Added new functions PxTransformFromSegment, PxTransformFromPlaneEquation to simplify creation of planes and capsules.</li>
-		<li>added PxJoint::getConstraint() to access the underlying constraint object, from which the constraint force can be read</li>
-		<li>Some methods of PxAggregate have been renamed for consistency or replaced for extended functionality.
-		<ul>
-			<li>getMaxSize() is now called getMaxNbActors().</li>
-			<li>getCurrentSize() is now called getNbActors().</li>
-			<li>getActor() has been replaced by getActors() which copies the actor pointers to a user buffer.</li>
-		</ul>
-		</li>
-		<li>Added support for kinematic triangle meshes, planes and heighfields.</li>
-	</ul>
+				<h4>Fixed Bugs</h4>
+				<ul>
+					<li>PxScene::getActors() might not work properly when the startIndex parameter is used.</li>
+					<li>Improved the doc-comment of PxConvexMesh::getMassInformation().</li>
+					<li>RepX instantiation can lose all internal references when addOriginalIdsToObjectMap is false.</li>
+					<li>PxSetGroup crashed when used on a compound.</li>
+					<li>PhysXCommon.dll can be delay loaded.</li>
+					<li>ContactReportStream can now handle huge report numbers and size (resize-able flag) can be set in PxSceneDesc.h.</li>
+					<li>Fixed assert in sweep tests.</li>
+					<li>Concurrent read/write operations during a PxScene::fetchResults() call were not detected properly and no warning message got sent in checked builds. Forbidden write operations during callbacks triggered by PxScene::fetchResults() (contact/trigger reports etc.) were not covered either.</li>
+					<li>Fixed crash bug that occurred during collision detection when more than 16K of contact data was generated.  Contacts that generate more than 16K of contact data are now fully supported.</li>
+					<li>Fixed crash bug when PhysX Visual Debugger is connected and an object gets modified and then released while the simulation is running.</li>
+				</ul>
 
-	<h4>Scene queries</h4>
-	<ul>
-		<li>Dynamic AABBTree has been set as the default dynamic pruning structure.</li>
-	</ul>
+			</blockquote>
 
-	<h4>Particles</h4>
-	<ul>
-		<li>Removed descriptors from particle API: The properties maxParticles and PxParticleBaseFlag::ePER_PARTICLE_REST_OFFSET need to be specified when calling PxPhysics::createParticleSystem() and createParticleFluid(). All other properties can be adjusted after creation through set methods.</li>
-	</ul>
+			<h2><br><a name="platforms320">Supported Platforms</a></h2>
 
-	<h4>Cloth</h4>
-	<ul>
-		<li>Added convex collision shapes, see PxCloth::addCollisionConvex()</li>
-		<li>Added friction support, see PxCloth::setFrictionCoefficient()</li>
-		<li>Added angle based bending constraints, see PxClothPhaseSolverConfig::SolverType::eBENDING</li>
-		<li>Added separation constraints, a spherical volume that particles should stay outside of, see PxCloth::setSeparationConstraints()</li>
-		<li>Added drag, see PxCloth::setDragCoefficient()</li>
-		<li>Added inertia scaling, controls how much movement due to PxCloth::setTargetPose() will affect the cloth</li>
-		<li>Added support for setting particle previous positions, see PxCloth::setParticles()</li>
-		<li>Added controls for scaling particle mass during collision, this can help reduce edge stretching around joints on characters, see PxCloth::setCollisionMassScale()</li>
-		<li>Particle data is now copied asynchronously from the GPU after simulate (rather than on demand)</li>
-		<li>Improved fabric layout, you can now share fabric data across multiple phases to reduce memory usage, see PxClothFabric</li>
-		<li>Fixed bug in collision when capsules are tapered at a slope > 1</li>
-	</ul>
+			<blockquote>
+				<h4>Runtime</h4>
+				<ul>
+					<li>Apple iOS</li>
+					<li>Apple Mac OS X</li>
+					<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
+					<li>Linux (tested on Ubuntu)</li>
+					<li>Microsoft Windows XP or later (NVIDIA Driver ver 295.73 or later is required for GPU acceleration)</li>
+					<li>Microsoft XBox 360</li>
+					<li>Sony Playstation 3</li>
+					<li>Sony Playstation Vita</li>
+				</ul>
+				<h4>Development</h4>
+				<ul>
+					<li>Microsoft Windows XP or later</li>
+					<li>Microsoft Visual Studio 2008, 2010</li>
+					<li>Xcode 4.2</li>
+				</ul>
+			</blockquote>
 
-	<h4>Vehicles</h4>
-	<ul>
-		<li>Added PxVehicleDriveTank, a vehicle class that enables tank behaviors.</li>
-		<li>Support for vehicles with more than 4 wheels, see PxVehicleDrive4W, PxVehicleDriveTank.</li>
-		<li>Significant refactor of vehicle api to allow further types of vehicle to be added.</li>
-		<li>Americal English spelling used in vehicle api.</li>
-		<li>PxVehicle4W replaced with PxVehicleDrive4W, PxVehicle4WSimulationData replaced with PxVehicleDriveSimData4W.</li>
-		<li>Removal of scene query helper functions and structs: PxVehicle4WSceneQueryData, PxVehicle4WSetUpSceneQuery, PxWheelRaycastPreFilter, PxSetupDrivableShapeQueryFilterData, PxSetupNonDrivableShapeQueryFilterData, PxSetupVehicleShapeQueryFilterData. See SampleVehicle_SceneQuery.h for their implementation in SampleVehicle.</li>
-		<li>PxVehicle4WSimpleSetup and PxCreateVehicle4WSimulationData have been removed and replaced with default values in vehicle components, see PxVehicleComponents.h.</li>
-		<li>PxVehicle4WTelemetryData has been replaced with PxVehicleTelemetryData, a class that supports vehicles with any number of wheels, see PxVehicleTelemetryData</li>
-		<li>PxVehicleDrivableSurfaceType no longer stored in PxMaterial::userData.  A hash table of PxMaterial pointers is instead used to associate each PxMaterial with a PxVehicleDrivableSurfaceType, see PxVehicleDrivableSurfaceToTireFrictionPairs.</li>
-		<li>PxVehicleTyreData::mLongitudinalStiffness has been replaced with PxVehicleTireData::mLongitudinalStiffnessPerUnitGravity, see PxVehicleTireData.</li>
-		<li>Tire forces now computed from a shader to allow user-specified tire force functions, see PxVehicleTireForceCalculator.</li>
-		<li>Added helper functions to quickly configure 3-wheeled cars, see PxVehicle4WEnable3WTadpoleMode, PxVehicle4WEnable3WDeltaMode.</li>
-	</ul>
+			<h2><br><a name="knownissues320">Known Issues And Limitations</a></h2>
 
-    <h4>Serialization</h4>
-    <ul>
-        <li>Changed the functions PxPhysics::releaseUserReferences(), releaseCollection(), addCollection() and releaseCollected() to now take a reference rather than a pointer.</li>
-		<li>Platform conversion for serialized data has been moved from the ConvX command line tool to the PxBinaryConverter interface in the cooking library.</li>
-		<li>Changed some functions in RepXUtility.h and RepX.h to take a reference rather than a pointer.</li>
-    </ul>
+			<blockquote>
 
+				<h4>Binary Serialization</h4>
+				<ul>
+					<li>Meta data generation for PxParticleFluid and PxParticleSystem serializables currently fails.</li>
+					<li>For collections that contain jointed rigid bodies all corresponding joints need to be added as well, otherwise deserialization will fail.
+				</ul>
+				<h4>Rigid Body Simulation</h4>
+				<ul>
+					<li>Capsules and spheres can struggle to come to rest on even perfectly flat surfaces. To ensure these objects come to rest, it is necessary to increase angular damping on rigid bodies made of these shapes. In addition, flagging the capsule/sphere's material with physx::PxMaterialFlag::eDISABLE_STRONG_FRICTION can help bring these shapes to rest.</li>
+				</ul>
+				<h4>Character Cloth Sample</h4>
+				<ul>
+					<li>An NVIDIA GPU with Compute Capability 2.0 or higher is required for GPU accelerated simulation in the SampleCharacterCloth sample, if no such device is present then simulation will be performed on the CPU.</li>
+					<li>Note that this is not a general SDK requirement, the clothing SDK supports cards with Compute Capability < 2.0 but with limitations on mesh size.</li>
+				</ul>
+				<h4>Character Controller</h4>
+				<ul>
+					<li>Releasing shapes of actors that are in touch with a character controller may lead to crashes. Releasing whole actors doesn't lead to the same problems. PxController::invalidateCache() can be used to work around these issues.</li>
+				</ul>
 
-	<h4>What we removed:</h4>
-	<ul>
-		<li>Deformables have been removed. Use the optimized solution for clothing simulation instead (see documentation on PxCloth for details).</li>
-		<li>PxSweepCache was replaced with PxVolumeCache.</li>
-		<li>PVD is no longer enabled in the release build.</li>
-		<li>Removed anisotropic friction.</li>
-		<li>Removed the CCD mode eSWEPT_CONTACT_PAIRS.</li>
-		<li>PxActorDesc has been removed.</li>
-		<li>The ConvX tool has been removed.</li>
-		<li>Removed empty default implementations of functions in PxSimulationEventCallback for consistency and because it can create bugs in user code if function prototypes change between releases.  Users must now supply (eventually blank) implementations for all functions.</li>
-		<li>Octree and quadtree pruning structures have been removed.</li>
-	</ul>
+			</blockquote>
 
-	<h4>Fixed Bugs</h4>
-	<ul>
-		<li>PxScene::getActors() might not work properly when the startIndex parameter is used.</li>
-		<li>Improved the doc-comment of PxConvexMesh::getMassInformation().</li>
-        <li>RepX instantiation can lose all internal references when addOriginalIdsToObjectMap is false.</li>
-        <li>PxSetGroup crashed when used on a compound.</li>
-        <li>PhysXCommon.dll can be delay loaded.</li>
-        <li>ContactReportStream can now handle huge report numbers and size (resize-able flag) can be set in PxSceneDesc.h.</li>
-        <li>Fixed assert in sweep tests.</li>
-		<li>Concurrent read/write operations during a PxScene::fetchResults() call were not detected properly and no warning message got sent in checked builds. Forbidden write operations during callbacks triggered by PxScene::fetchResults() (contact/trigger reports etc.) were not covered either.</li>
-        <li>Fixed crash bug that occurred during collision detection when more than 16K of contact data was generated.  Contacts that generate more than 16K of contact data are now fully supported.</li>
-		<li>Fixed crash bug when PhysX Visual Debugger is connected and an object gets modified and then released while the simulation is running.</li>
-	</ul>
+			<blockquote>
+				Please also see the previous lists <a href="#knownissues311"> from 3.1.1</a> and earlier.
+			</blockquote>
 
-</blockquote>
-
-<h2><br><a name="platforms320">Supported Platforms</a></h2>
-
-<blockquote>
-    <h4>Runtime</h4>
-    <ul>
-		<li>Apple iOS</li>
-		<li>Apple Mac OS X</li>
-		<li>Google Android (version 2.2 or later for SDK, 2.3 or later required for samples)</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Microsoft Windows XP or later (NVIDIA Driver ver 295.73 or later is required for GPU acceleration)</li>
-        <li>Microsoft XBox 360</li>
-        <li>Sony Playstation 3</li>
-		<li>Sony Playstation Vita</li>
-    </ul>
-    <h4>Development</h4>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2008, 2010</li>
-		<li>Xcode 4.2</li>
-    </ul>
-</blockquote>
-
-<h2><br>
-<a name="knownissues320">Known Issues And Limitations</a></h2>
-
-<blockquote>
-
-	<h4>Binary Serialization</h4>
-	<ul>
-	    <li>Meta data generation for PxParticleFluid and PxParticleSystem serializables currently fails.</li>
-		<li>For collections that contain jointed rigid bodies all corresponding joints need to be added as well, otherwise deserialization will fail.
-	</ul>
-	<h4>Rigid Body Simulation</h4>
-	<ul>
-	    <li>Capsules and spheres can struggle to come to rest on even perfectly flat surfaces. To ensure these objects come to rest, it is necessary to increase angular damping on rigid bodies made of these shapes. In addition, flagging the capsule/sphere's material with physx::PxMaterialFlag::eDISABLE_STRONG_FRICTION can help bring these shapes to rest.</li>
-	</ul>
-	<h4>Character Cloth Sample</h4>
-	<ul>
-	    <li>An NVIDIA GPU with Compute Capability 2.0 or higher is required for GPU accelerated simulation in the SampleCharacterCloth sample, if no such device is present then simulation will be performed on the CPU.</li>
-		<li>Note that this is not a general SDK requirement, the clothing SDK supports cards with Compute Capability < 2.0 but with limitations on mesh size.</li>
-	</ul>
-	<h4>Character Controller</h4>
-	<ul>
-	    <li>Releasing shapes of actors that are in touch with a character controller may lead to crashes. Releasing whole actors doesn't lead to the same problems. PxController::invalidateCache() can be used to work around these issues.</li>
-	</ul>
 
-</blockquote>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<br>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<blockquote>
-	Please also see the previous lists <a href="#knownissues311"> from 3.1.1</a> and earlier.
-</blockquote>
 
+			<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.1.2</h1>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<h2 style="TEXT-ALIGN: center">December 2011</h2>
 
+			<h2>What's New In NVIDIA PhysX 3.1.2</h2>
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.1.2</h1>
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li>Fixed wrong write/read clash checks.</li>
+					<li>Removed some compiler warnings from public header files.</li>
+					<li>Fixed PxScene::getActors() returning wrong actors when a start index is specified.</li>
+				</ul>
 
-<h2 style="TEXT-ALIGN: center">December 2011</h2>
 
-<h2>What's New In NVIDIA PhysX 3.1.2</h2>
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>Fixed broken joint projection in connection with kinematics.</li>
+					<li>Fixed inaccurate normals returned from height field scene queries.</li>
+					<li>Fixed a crash when the geometry of a shape changes and then the actor gets removed from the scene while the simulation is running.</li>
+					<li>Fixed a crash when re-adding scene-query shape actors to scene.</li>
+				</ul>
 
-<blockquote>
-	<h4>General</h4>
-	<ul>
-		<li>Fixed wrong write/read clash checks.</li>
-		<li>Removed some compiler warnings from public header files.</li>
-		<li>Fixed PxScene::getActors() returning wrong actors when a start index is specified.</li>
-	</ul>
 
+				<h4>Particles</h4>
+				<ul>
+					<li>Fixed crash bug in particle simulation code on GPU.</li>
+				</ul>
 
-	<h4>Rigid Bodies</h4>
-	<ul>
-		<li>Fixed broken joint projection in connection with kinematics.</li>
-		<li>Fixed inaccurate normals returned from height field scene queries.</li>
-		<li>Fixed a crash when the geometry of a shape changes and then the actor gets removed from the scene while the simulation is running.</li>
-		<li>Fixed a crash when re-adding scene-query shape actors to scene.</li>
-	</ul>
 
+				<h4>Cloth</h4>
+				<ul>
+					<li>Fixed a crash when GPU fabrics are shared between cloths.</li>
+					<li>Fixed a hang in cloth fiber cooker when handed non-manifold geometry.</li>
+				</ul>
 
-	<h4>Particles</h4>
-	<ul>
-		<li>Fixed crash bug in particle simulation code on GPU.</li>
-	</ul>
+				<h4>Samples</h4>
+				<ul>
+					<li>Fixed SampleVehicles doing an invalid write.</li>
+					<li>Fixed SampleVehicle jitter in profile build.</li>
+				</ul>
 
+			</blockquote>
 
-	<h4>Cloth</h4>
-	<ul>
-		<li>Fixed a crash when GPU fabrics are shared between cloths.</li>
-		<li>Fixed a hang in cloth fiber cooker when handed non-manifold geometry.</li>
-	</ul>
+			<h2><br>Supported Platforms (available in separate packages)</h2>
 
-	<h4>Samples</h4>
-	<ul>
-		<li>Fixed SampleVehicles doing an invalid write.</li>
-		<li>Fixed SampleVehicle jitter in profile build.</li>
-	</ul>
+			<blockquote>
+				Unchanged from <a href="#platforms311"> from 3.1.1</a>.
+			</blockquote>
 
-</blockquote>
+			<h2><br>Known Issues And Limitations</h2>
 
-<h2><br>Supported Platforms (available in separate packages)</h2>
+			<blockquote>
+				Unchanged from <a href="#knownissues310"> from 3.1</a>.
+			</blockquote>
 
-<blockquote>
-	Unchanged from <a href="#platforms311"> from 3.1.1</a>.
-</blockquote>
 
-<h2><br>Known Issues And Limitations</h2>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<br>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<blockquote>
-	Unchanged from <a href="#knownissues310"> from 3.1</a>.
-</blockquote>
 
+			<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.1.1</h1>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<h2 style="TEXT-ALIGN: center">November 2011</h2>
 
+			<h2>What's New In NVIDIA PhysX 3.1.1</h2>
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.1.1</h1>
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li>Ported samples to Linux.</li>
+					<li>Fixed crash bug in ConvX.</li>
+					<li>Fixed crash bug in the allocator code of PXC_NP_MEM_BLOCK_EXTENSIBLE.</li>
+					<li>Fixed crash bug when connected to PVD on various platforms.</li>
+					<li>Fixed bogus asserts due to overly strict validation of quaternions.</li>
+					<li>Fixed one frame lag in PVD scene statistics.</li>
+					<li>Fixed a number of OSX PVD sockets issues.</li>
+					<li>Fixed SampleSubmarine code that violated concurrent read/writes restriction.</li>
+					<li>Added warnings about read/write hazards to the checked build.</li>
+					<li>Fixed RepX not reading joint properties.</li>
+					<li>Fixed support for concurrent scene queries.</li>
+					<li>Fixed PhysX GPU Visual Indicator support.</li>
+					<li>Made it more clear in documentation that simulate(0) is not allowed.</li>
+				</ul>
 
-<h2 style="TEXT-ALIGN: center">November 2011</h2>
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>eNOTIFY_TOUCH_LOST trigger events do now get reported if one of the objects in contact gets deleted (see documentation of PxTriggerPair for details).</li>
+					<li>Dynamic rigid bodies with trigger shapes only do not wake up other touching bodies anymore.</li>
+					<li>Added lost touch events for trigger reports when objects get deleted.</li>
+					<li>Fixed dynamic triggers waking up actors they are triggered by.</li>
+					<li>Removed an inapropriate assert from articulation code.</li>
+					<li>Fixed problem with the angular momentum conservation of articulations.</li>
+					<li>Fixed articulation sleep problems.</li>
+					<li>Fixed a linear velocity related bug in CCD.</li>
+					<li>Fixed crash bug CCD.</li>
+					<li>Optimized performance of joint information being sent to PVD.</li>
+				</ul>
 
-<h2>What's New In NVIDIA PhysX 3.1.1</h2>
+			</blockquote>
 
-<blockquote>
-	<h4>General</h4>
-	<ul>
-		<li>Ported samples to Linux.</li>
-		<li>Fixed crash bug in ConvX.</li>
-		<li>Fixed crash bug in the allocator code of PXC_NP_MEM_BLOCK_EXTENSIBLE.</li>
-		<li>Fixed crash bug when connected to PVD on various platforms.</li>
-		<li>Fixed bogus asserts due to overly strict validation of quaternions.</li>
-		<li>Fixed one frame lag in PVD scene statistics.</li>
-		<li>Fixed a number of OSX PVD sockets issues.</li>
-		<li>Fixed SampleSubmarine code that violated concurrent read/writes restriction.</li>
-		<li>Added warnings about read/write hazards to the checked build.</li>
-		<li>Fixed RepX not reading joint properties.</li>
-		<li>Fixed support for concurrent scene queries.</li>
-		<li>Fixed PhysX GPU Visual Indicator support.</li>
-		<li>Made it more clear in documentation that simulate(0) is not allowed.</li>
-	</ul>
+			<h2><br><a name="platforms311">Supported Platforms (available in separate packages)</a></h2>
 
-	<h4>Rigid Bodies</h4>
-	<ul>
-		<li>eNOTIFY_TOUCH_LOST trigger events do now get reported if one of the objects in contact gets deleted (see documentation of PxTriggerPair for details).</li>
-		<li>Dynamic rigid bodies with trigger shapes only do not wake up other touching bodies anymore.</li>
-		<li>Added lost touch events for trigger reports when objects get deleted.</li>
-		<li>Fixed dynamic triggers waking up actors they are triggered by.</li>
-		<li>Removed an inapropriate assert from articulation code.</li>
-		<li>Fixed problem with the angular momentum conservation of articulations.</li>
-		<li>Fixed articulation sleep problems.</li>
-		<li>Fixed a linear velocity related bug in CCD.</li>
-		<li>Fixed crash bug CCD.</li>
-		<li>Optimized performance of joint information being sent to PVD.</li>
-	</ul>
+			<blockquote>
+				<h4>Runtime</h4>
+				<ul>
+					<li>Microsoft Windows XP or later</li>
+					<li>Microsoft XBox 360</li>
+					<li>Sony Playstation 3</li>
+					<li>Android 2.2 or later for SDK, 2.3 or later required for samples</li>
+					<li>Linux (tested on Ubuntu)</li>
+					<li>Mac OS X</li>
+				</ul>
+				<h4>Development</h4>
+				<ul>
+					<li>Microsoft Windows XP or later</li>
+					<li>Microsoft Visual Studio 2008, 2010</li>
+					<li>Xcode 3</li>
+				</ul>
+			</blockquote>
 
-</blockquote>
+			<h2><br><a name="knownissues311">Known Issues And Limitations</a></h2>
 
-<h2><br><a name="platforms311">Supported Platforms (available in separate packages)</a></h2>
+			<blockquote>
+				Unchanged from <a href="#knownissues310"> from 3.1</a>.
+			</blockquote>
 
-<blockquote>
-    <h4>Runtime</h4>
-    <ul>
-		<li>Microsoft Windows XP or later</li>
-        <li>Microsoft XBox 360</li>
-        <li>Sony Playstation 3</li>
-		<li>Android 2.2 or later for SDK, 2.3 or later required for samples</li>
-		<li>Linux (tested on Ubuntu)</li>
-		<li>Mac OS X</li>
-    </ul>
-    <h4>Development</h4>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2008, 2010</li>
-		<li>Xcode 3</li>
-    </ul>
-</blockquote>
 
-<h2><br><a name="knownissues311">Known Issues And Limitations</a></h2>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<br>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<blockquote>
-	Unchanged from <a href="#knownissues310"> from 3.1</a>.
-</blockquote>
 
+			<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.1</h1>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<h2 style="TEXT-ALIGN: center">September 2011</h2>
 
+			<h2>What's New In NVIDIA PhysX 3.1</h2>
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.1</h1>
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li>VC10 support has been introduced.</li>
+					<li>VC8 support has been discontinued.</li>
+					<li>Namespaces cleaned up.</li>
+					<li>Extensions, Character Controller and Vehicle source code made available in binary distribution.</li>
+					<li>Added x86,x64 suffix to PxTaskCUDA.dll</li>
+					<li>Removed boolean return value from PxScene::addActor(...), and similar API calls.</li>
+					<li>Added MacOS, Android and Linux to the list of supported platforms.  See Supported Platforms below for details.</li>
+					<li>Upgraded GPU tech to CUDA 4.</li>
+					<li>Cleaned up a large number of warnings at C++ warning level 4, and set SDK to compile with warnings as errors.</li>
+					<li>Removed individual sample executables in favor of one Samples executable from PC and console builds.</li>
+					<li>Fixed alpha blending in samples.</li>
+					<li>Simplified some code in samples.</li>
+					<li>Improved ambient lighting in samples.</li>
+					<li>Made samples work with older graphics cards.</li>
+					<li>Improved and added more content the user's guide.</li>
+					<li>No longer passing NULL pointers to user allocator to deallocate.</li>
+					<li>Various improvements to Foundation and classes shared with APEX.</li>
+				</ul>
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>Rigid Body: High performance alternative convex narrow phase code available to source licensees. See PERSISTENT_CONTACT_MANIFOLD in the code.</li>
+					<li>Significant advancements in the continuous collision detection algorithm.</li>
+					<li>Optimizations and robustness improvements for articulations.</li>
+					<li>Added some helper code to the API.</li>
+					<li>Added sleep code for articulations.</li>
+					<li>Added support for vehicles with more than one chassis shape.</li>
+					<li>Solver iteration count for articulations.</li>
+					<li>Articulation limit padding configurable.</li>
+					<li>The reference count of meshes does now take the application's reference into acount as well and thus has increased by 1 (it used to count the number of objects referencing the mesh only). Note that a mesh does only get destroyed and removed from the list of meshes once the reference count reaches 0.</li>
+					<li>Fixed autowake parameter sometimes being ignored.</li>
+					<li>Constraint solver optimizations.</li>
+					<li>Improved behavior of character controller on steep slopes.</li>
+					<li>Binary serialization now saves names.</li>
+					<li>Removed some descriptors from API.</li>
+					<li>Removed the angular velocity term in the joint positional drive error formula.</li>
+					<li>Fixed bug in capsule sweep versus mesh.</li>
+					<li>Fixed a crash bug in the tire model.</li>
+					<li>Fixed crashing of single link articulations.</li>
+					<li>Fixed bug related to removing elements of an aggregate.</li>
+					<li>Fixed swapped wheel graphs in sample vehicle.</li>
+					<li>Fixed some slow moving bodies falling asleep in midair.</li>
+					<li>Fixed missing collisions after a call to resetFiltering.</li>
+					<li>Fixed broken autowake option in setAngularVelocity.</li>
+					<li>Fixed D6 joint linear limits being uninitialized.</li>
+					<li>A large number of misc. bug fixes and optimizations.</li>
+					<li>Improved documentation and error messages associated with running out of narrow phase buffer blocks.</li>
+					<li>Added articulation documentation.</li>
+					<li>Expanded manual sections on joints.</li>
+					<li>Improved reference doc for PxSceneQueryHitType.</li>
+					<li>Added reference doc for PxSerializable.</li>
+				</ul>
 
-<h2 style="TEXT-ALIGN: center">September 2011</h2>
+				<h4>Particles</h4>
+				<ul>
+					<li>Particle index allocation removed from SDK. Added index allocation pool to extensions.</li>
+					<li>Replaced GPU specific side band API PxPhysicsGpu and PxPhysics::getPhysicsGpu() with PxParticleGpu.</li>
+					<li>
+						Memory optimizations on all platforms and options to reduce memory usage according to use case with new per particle system flags:
+						<ul>
+							<li>PxParticleBaseFlag::eCOLLISION_WITH_DYNAMIC_ACTORS</li>
+							<li>PxParticleBaseFlag::ePER_PARTICLE_COLLISION_CACHE_HINT</li>
+						</ul>
+					</li>
+					<li>Fixed rare crash appearing with multi-threaded non-GPU particle systems and rigid bodies.</li>
+					<li>Fixed particles leaking through triangle mesh geometry on GPU.</li>
+					<li>Fixed fast particles tunneling through scene geometry in some cases.</li>
+					<li>Fixed erroneous collision of particles with teleporting rigid shapes (setGlobalPose).</li>
+					<li>Fixed particle sample behavior with some older GPU models.</li>
+					<li>Fixed a GPU particle crash bug.</li>
+				</ul>
 
-<h2>What's New In NVIDIA PhysX 3.1</h2>
 
-<blockquote>
-	<h4>General</h4>
-	<ul>
-		<li>VC10 support has been introduced.</li>
-		<li>VC8 support has been discontinued.</li>
-		<li>Namespaces cleaned up.</li>
-		<li>Extensions, Character Controller and Vehicle source code made available in binary distribution.</li>
-		<li>Added x86,x64 suffix to PxTaskCUDA.dll</li>
-		<li>Removed boolean return value from PxScene::addActor(...), and similar API calls.</li>
-		<li>Added MacOS, Android and Linux to the list of supported platforms.  See Supported Platforms below for details.</li>
-		<li>Upgraded GPU tech to CUDA 4.</li>
-		<li>Cleaned up a large number of warnings at C++ warning level 4, and set SDK to compile with warnings as errors.</li>
-		<li>Removed individual sample executables in favor of one Samples executable from PC and console builds.</li>
-		<li>Fixed alpha blending in samples.</li>
-		<li>Simplified some code in samples.</li>
-		<li>Improved ambient lighting in samples.</li>
-		<li>Made samples work with older graphics cards.</li>
-		<li>Improved and added more content the user's guide.</li>
-		<li>No longer passing NULL pointers to user allocator to deallocate.</li>
-		<li>Various improvements to Foundation and classes shared with APEX.</li>
-	</ul>
-	<h4>Rigid Bodies</h4>
-	<ul>
-		<li>Rigid Body: High performance alternative convex narrow phase code available to source licensees. See PERSISTENT_CONTACT_MANIFOLD in the code.</li>
-		<li>Significant advancements in the continuous collision detection algorithm.</li>
-		<li>Optimizations and robustness improvements for articulations.</li>
-		<li>Added some helper code to the API.</li>
-		<li>Added sleep code for articulations.</li>
-		<li>Added support for vehicles with more than one chassis shape.</li>
-		<li>Solver iteration count for articulations.</li>
-		<li>Articulation limit padding configurable.</li>
-		<li>The reference count of meshes does now take the application's reference into acount as well and thus has increased by 1 (it used to count the number of objects referencing the mesh only). Note that a mesh does only get destroyed and removed from the list of meshes once the reference count reaches 0.</li>
-		<li>Fixed autowake parameter sometimes being ignored.</li>
-		<li>Constraint solver optimizations.</li>
-		<li>Improved behavior of character controller on steep slopes.</li>
-		<li>Binary serialization now saves names.</li>
-		<li>Removed some descriptors from API.</li>
-		<li>Removed the angular velocity term in the joint positional drive error formula.</li>
-		<li>Fixed bug in capsule sweep versus mesh.</li>
-		<li>Fixed a crash bug in the tire model.</li>
-		<li>Fixed crashing of single link articulations.</li>
-		<li>Fixed bug related to removing elements of an aggregate.</li>
-		<li>Fixed swapped wheel graphs in sample vehicle.</li>
-		<li>Fixed some slow moving bodies falling asleep in midair.</li>
-		<li>Fixed missing collisions after a call to resetFiltering.</li>
-		<li>Fixed broken autowake option in setAngularVelocity.</li>
-		<li>Fixed D6 joint linear limits being uninitialized.</li>
-		<li>A large number of misc. bug fixes and optimizations.</li>
-		<li>Improved documentation and error messages associated with running out of narrow phase buffer blocks.</li>
-		<li>Added articulation documentation.</li>
-		<li>Expanded manual sections on joints.</li>
-		<li>Improved reference doc for PxSceneQueryHitType.</li>
-		<li>Added reference doc for PxSerializable.</li>
-	</ul>
+				<h4>Cloth</h4>
+				<ul>
+					<li>A new solution for simulating cloth and clothing.</li>
+				</ul>
 
-	<h4>Particles</h4>
-	<ul>
-	   <li>Particle index allocation removed from SDK. Added index allocation pool to extensions.</li>
-	   <li>Replaced GPU specific side band API PxPhysicsGpu and PxPhysics::getPhysicsGpu() with PxParticleGpu.</li>
-	   <li>Memory optimizations on all platforms and options to reduce memory usage according to use case with new per particle system flags:
-	   <ul>
-		   <li>PxParticleBaseFlag::eCOLLISION_WITH_DYNAMIC_ACTORS</li>
-		   <li>PxParticleBaseFlag::ePER_PARTICLE_COLLISION_CACHE_HINT</li>
-	   </ul></li>
-	   <li>Fixed rare crash appearing with multi-threaded non-GPU particle systems and rigid bodies.</li>
-	   <li>Fixed particles leaking through triangle mesh geometry on GPU.</li>
-	   <li>Fixed fast particles tunneling through scene geometry in some cases.</li>
-	   <li>Fixed erroneous collision of particles with teleporting rigid shapes (setGlobalPose).</li>
-	   <li>Fixed particle sample behavior with some older GPU models.</li>
-		<li>Fixed a GPU particle crash bug.</li>
-	</ul>
 
+				<h4>Deformables</h4>
+				<ul>
+					<li>Deformables are deprecated and will be removed in the next release. There is a new optimized solution for clothing simulation (see documentation on PxCloth for details).</li>
+				</ul>
+			</blockquote>
 
-	<h4>Cloth</h4>
-	<ul>
-		<li>A new solution for simulating cloth and clothing.</li>
-	</ul>
+			<h2><br>Supported Platforms (available in separate packages)</h2>
 
+			<blockquote>
+				<h4>Runtime</h4>
+				<ul>
+					<li>Microsoft Windows XP or later</li>
+					<li>Microsoft XBox 360</li>
+					<li>Sony Playstation 3</li>
+					<li>Android 2.2 or later for SDK, 2.3 or later required for samples</li>
+					<li>Linux (SDK tested on Ubuntu, samples not yet ported) </li>
+					<li>Mac OS X</li>
+				</ul>
+				<h4>Development</h4>
+				<ul>
+					<li>Microsoft Windows XP or later</li>
+					<li>Microsoft Visual Studio 2008, 2010</li>
+					<li>Xcode 3</li>
+				</ul>
+			</blockquote>
 
-	<h4>Deformables</h4>
-	<ul>
-		<li>Deformables are deprecated and will be removed in the next release. There is a new optimized solution for clothing simulation (see documentation on PxCloth for details).</li>
-	</ul>
-</blockquote>
-
-<h2><br>Supported Platforms (available in separate packages)</h2>
-
-<blockquote>
-    <h4>Runtime</h4>
-    <ul>
-		<li>Microsoft Windows XP or later</li>
-        <li>Microsoft XBox 360</li>
-        <li>Sony Playstation 3</li>
-		<li>Android 2.2 or later for SDK, 2.3 or later required for samples</li>
-		<li>Linux (SDK tested on Ubuntu, samples not yet ported) </li>
-		<li>Mac OS X</li>
-    </ul>
-    <h4>Development</h4>
-    <ul>
-        <li>Microsoft Windows XP or later</li>
-        <li>Microsoft Visual Studio 2008, 2010</li>
-		<li>Xcode 3</li>
-    </ul>
-</blockquote>
-
-<h2><br>
-<a name="knownissues310">Known Issues And Limitations</a></h2>
-
-<blockquote>
-	<h4>General</h4>
-	<ul>
-	    <li>Under VC10, you may get warnings due to conflicting build configuration flags.  Workaround: Clear the "Inherit from parent or project defaults" flag for all projects in Project->Properties->C/C++->Command Line.  We plan to fix this for the 3.1 general availability release.</li>
-	</ul>
-	<h4>Scene Query</h4>
-	<ul>
-	    <li>Querying the scene (e.g. using raycastSingle()) from multiple threads simultaneously is not safe.</li>
-	</ul>
-	<h4>Cloth</h4>
-	<ul>
-	    <li>Even simple parameters of a PxCloth can not be set or accessed while the simulation is running.</li>
-	</ul>
-	<h4>RepX</h4>
-	<ul>
-	    <li>RepX fails to load elements of aggregate joint parameters (PxD6JointDrive etc.)</li>
-	</ul>
-</blockquote>
+			<h2><br><a name="knownissues310">Known Issues And Limitations</a></h2>
 
-<blockquote>
-	Please also see the previous lists <a href="#knownissues300"> from 3.0</a>.
-</blockquote>
+			<blockquote>
+				<h4>General</h4>
+				<ul>
+					<li>Under VC10, you may get warnings due to conflicting build configuration flags.  Workaround: Clear the "Inherit from parent or project defaults" flag for all projects in Project->Properties->C/C++->Command Line.  We plan to fix this for the 3.1 general availability release.</li>
+				</ul>
+				<h4>Scene Query</h4>
+				<ul>
+					<li>Querying the scene (e.g. using raycastSingle()) from multiple threads simultaneously is not safe.</li>
+				</ul>
+				<h4>Cloth</h4>
+				<ul>
+					<li>Even simple parameters of a PxCloth can not be set or accessed while the simulation is running.</li>
+				</ul>
+				<h4>RepX</h4>
+				<ul>
+					<li>RepX fails to load elements of aggregate joint parameters (PxD6JointDrive etc.)</li>
+				</ul>
+			</blockquote>
 
+			<blockquote>
+				Please also see the previous lists <a href="#knownissues300"> from 3.0</a>.
+			</blockquote>
 
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
-<br>
-<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
+			<br>
+			<!-- -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -->
 
-<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.0</h1>
 
-<h2 style="TEXT-ALIGN: center">February 14<sup>th</sup>2011</h2>
+			<h1 style="TEXT-ALIGN: center">Release Notes - NVIDIA<sup>&reg;</sup> PhysX<sup>&reg;</sup> SDK 3.0</h1>
 
-<h2>What's New In NVIDIA PhysX 3.0</h2>
+			<h2 style="TEXT-ALIGN: center">February 14<sup>th</sup>2011</h2>
 
-<blockquote>
-	<h4>General</h4>
+			<h2>What's New In NVIDIA PhysX 3.0</h2>
 
-	This third major version of the SDK is a significant rewrite of the entire technology.  We did away with a large amount of legacy clutter and replaced them with a wealth of new features and improvements.
-	Because even the API changes are so significant, it is easier to see it as a whole new product rather than a long list of changes.
+			<blockquote>
+				<h4>General</h4>
 
-	<h4>What we removed:</h4>
-	<ul>
-		<li>The dedicated NVIDIA PhysX PPU hardware is not supported any more.</li>
-		<li>Scene compartments are not supported anymore. All created physical objects are now part of one and the same compartment.</li>
-		<li>Force fields are not part of the NVIDIA PhysX SDK anymore.</li>
-		<li>Splitting a simulation timestep into multiple substeps is not a functionality of the NVIDIA PhysX SDK any longer and has to be implemented above the SDK.</li>
-	</ul>
+				This third major version of the SDK is a significant rewrite of the entire technology.  We did away with a large amount of legacy clutter and replaced them with a wealth of new features and improvements.
+				Because even the API changes are so significant, it is easier to see it as a whole new product rather than a long list of changes.
 
-	<h4>Key new features:</h4>
-	<ul>
-		<li>Articulations: A way to create very stiff joint assemblies.</li>
-		<li>Serialization: Save objects in a binary format and load them back quickly!</li>
-		<li>Broad Phase Clustering: Put objects that belong together into a single broadphase volume.</li>
-		<li>Driverless Model: No need to worry about system software on PC anymore.</li>
-		<li>Dynamic Character Controller: A character controller that can robustly walk on dynamic objects.</li>
-		<li>Vehicle Library: A toolkit to make vehicles, including an all new tire model.</li>
-		<li>Non-Simulation Objects: A staging are outside of the simulation from where you can add things into the simulation at high speed.</li>
-		<li>Simulation Task Manager: Take control of the management of simulation tasks.</li>
-		<li>Stable Depenetration: Large penetrations can be gracefully recovered from.</li>
-		<li>Double Buffering: You can read from and write to the simulated objects while the simulation is running on another thread.</li>
-		<li>Mesh Scaling: Create different nonuniformly scaled instances of your meshes and convexes without duplicating the memory.</li>
-		<li>Distance Based Collision Detection: Have the simulation create contacts before objects touch, and do away with unsightly overlaps.</li>
-		<li>Fast Continuous Collision Detection: Have small and high speed objects collide correctly without sacrificing performance.</li>
-		<li>Significantly increased performance and memory footprint, especially on consoles.</li>
-		<li>Unified solver for deformables and rigid bodies for great interaction.</li>
-		<li>Triangle collision detection with deformables.</li>
-		<li>Support for our new Physics Visual Debugger, including integrated profiling.</li>
-	</ul>
+				<h4>What we removed:</h4>
+				<ul>
+					<li>The dedicated NVIDIA PhysX PPU hardware is not supported any more.</li>
+					<li>Scene compartments are not supported anymore. All created physical objects are now part of one and the same compartment.</li>
+					<li>Force fields are not part of the NVIDIA PhysX SDK anymore.</li>
+					<li>Splitting a simulation timestep into multiple substeps is not a functionality of the NVIDIA PhysX SDK any longer and has to be implemented above the SDK.</li>
+				</ul>
 
+				<h4>Key new features:</h4>
+				<ul>
+					<li>Articulations: A way to create very stiff joint assemblies.</li>
+					<li>Serialization: Save objects in a binary format and load them back quickly!</li>
+					<li>Broad Phase Clustering: Put objects that belong together into a single broadphase volume.</li>
+					<li>Driverless Model: No need to worry about system software on PC anymore.</li>
+					<li>Dynamic Character Controller: A character controller that can robustly walk on dynamic objects.</li>
+					<li>Vehicle Library: A toolkit to make vehicles, including an all new tire model.</li>
+					<li>Non-Simulation Objects: A staging are outside of the simulation from where you can add things into the simulation at high speed.</li>
+					<li>Simulation Task Manager: Take control of the management of simulation tasks.</li>
+					<li>Stable Depenetration: Large penetrations can be gracefully recovered from.</li>
+					<li>Double Buffering: You can read from and write to the simulated objects while the simulation is running on another thread.</li>
+					<li>Mesh Scaling: Create different nonuniformly scaled instances of your meshes and convexes without duplicating the memory.</li>
+					<li>Distance Based Collision Detection: Have the simulation create contacts before objects touch, and do away with unsightly overlaps.</li>
+					<li>Fast Continuous Collision Detection: Have small and high speed objects collide correctly without sacrificing performance.</li>
+					<li>Significantly increased performance and memory footprint, especially on consoles.</li>
+					<li>Unified solver for deformables and rigid bodies for great interaction.</li>
+					<li>Triangle collision detection with deformables.</li>
+					<li>Support for our new Physics Visual Debugger, including integrated profiling.</li>
+				</ul>
 
-	<h4>Math classes</h4>
-	<ul>
-		<li>Matrix based transforms have been replaced by quaternions.</li>
-		<li>All angles are now expressed in radians. IN PARTICULAR the PxQuat constructor from
-			axis and angle as well as the getAngleAxis and fromAngleAxis methods now use radians rather than degrees.</li>
-	</ul>
 
-	<h4>Rigid Bodies</h4>
-	<ul>
-		<li>Capsules are now defined to extend along the x rather than the y axis.</li>
-		<li>Triangle meshes do not support heightfield functionality anymore. Use the dedicated PxHeightField class instead.</li>
-		<li>Dynamic triangle mesh actors are not supported any longer. However, you can decompose your mesh into convex parts and create a dynamic actor consisting of these convex parts.</li>
-		<li>The deprecated heightfield property NxHeightFieldDesc::verticalExtent is not supported any longer. Please use the PxHeightFieldDesc::thickness parameter instead.</li>
-		<li>NxSpringAndDamperEffector is not supported anymore. Use PxDistanceJoint instead.</li>
-		<li>Joints are now part of the PhysX extensions library (PhysXExtensions).</li>
-		<li>Wheel shapes have been replaced by the more flexible entity PxWheel. A default wheel implementation, encapsulating most of the old wheel functionality, can be found in the PhysX extensions library (see PxDefaultWheel).</li>
-		<li>The NxUtilLib library has been removed. Sweep/overlap/raycast tests and other helper methods can be found in the new GeomUtils library.</li>
-		<li>Materials can no longer be accessed through indices. Per triangle material meshes need a material table which can be specified per shape (see PxShape::setMaterials() for details).</li>
-		<li>The default material is not available anymore.</li>
-	</ul>
+				<h4>Math classes</h4>
+				<ul>
+					<li>Matrix based transforms have been replaced by quaternions.</li>
+					<li>
+						All angles are now expressed in radians. IN PARTICULAR the PxQuat constructor from
+						axis and angle as well as the getAngleAxis and fromAngleAxis methods now use radians rather than degrees.
+					</li>
+				</ul>
 
-	<h4>Particle Systems, Particle Fluids</h4>
-	<ul>
-		<li>The NxFluid class has been replaced with two classes for separation of functionality and ease of use.
-			<ul>
-				<li>PxParticleSystem: Particles colliding against the scene.</li>
-				<li>PxParticleFluid: Particles simulating a fluid (sph).</li>
-			</ul>
-		</li>
-		<li>Simplified parameterization for particle systems.
-			<ul>
-				<li>Using absolute distances instead of relative multipliers to rest spacing</li>
-				<li>Simplified sph parameters</li>
-				<li>Unified collision parameters with deformable and rigid body features</li>
-			</ul>
-		</li>
-		<li>Creating and releasing particles is now fully controlled by the application.
-			<ul>
-				<li>Particle lifetime management isn't provided through the SDK anymore.</li>
-				<li>Emitters have been removed from the SDK.</li>
-				<li>Drain shapes don't cause particles to be deleted directly, but to be flagged instead.</li>
-				<li>Initial particle creation from the particle system descriptor isn't supported anymore.</li>
-			</ul>
-		</li>
-		<li>Particle data buffer handling has been moved to the SDK.</li>
-		<li>Per particle collision rest offset.</li>
-		<li>GPU accelerated particle systems.
-			<ul>
-				<li>Application controllable mesh mirroring to device memory.</li>
-				<li>Runtime switching between software and GPU accelerated implementation.</li>
-			</ul>
-		</li>
-	</ul>
-</blockquote>
-
-<h2><br>Supported Platforms (available in separate packages)</h2>
-
-<blockquote>
-    <h4>Runtime</h4>
-    <ul>
-		<li>Microsoft Windows XP or and later</li>
-        <li>Microsoft XBox360</li>
-        <li>Sony Playstation 3</li>
-    </ul>
-    <h4>Development</h4>
-    <ul>
-        <li>Microsoft Windows XP or and later</li>
-        <li>Microsoft Visual Studio 2008</li>
-    </ul>
-</blockquote>
-
-<h2><br>
-<a name="knownissues300">Known Issues And Limitations</a></h2>
-
-<blockquote>
-	<h4>Rigid Bodies</h4>
-	<ul>
-	    <li>Adding or removing a PxAggregate object to the scene is not possible while the simulation is running.</li>
-	</ul>
-	<h4>Particle Systems</h4>
-	<ul>
-	    <li>Releasing the Physics SDK may result in a crash when using GPU accelerated particle systems. </br>
-		This can be avoided by doing the following before releasing the Physics SDK:
-			<ul>
-				<li>Releasing the PxScene objects that contain the GPU accelerated particle systems.</li>
-				<li>Releasing application mirrored meshes by calling PxPhysicsGpu::releaseTriangleMeshMirror(...), PxPhysicsGpu::releaseHeightFieldMirror(...) or PxPhysicsGpu::releaseConvexMeshMirror(...).</li>
-			</ul>
-		</li>
-	</ul>
-</blockquote>
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>Capsules are now defined to extend along the x rather than the y axis.</li>
+					<li>Triangle meshes do not support heightfield functionality anymore. Use the dedicated PxHeightField class instead.</li>
+					<li>Dynamic triangle mesh actors are not supported any longer. However, you can decompose your mesh into convex parts and create a dynamic actor consisting of these convex parts.</li>
+					<li>The deprecated heightfield property NxHeightFieldDesc::verticalExtent is not supported any longer. Please use the PxHeightFieldDesc::thickness parameter instead.</li>
+					<li>NxSpringAndDamperEffector is not supported anymore. Use PxDistanceJoint instead.</li>
+					<li>Joints are now part of the PhysX extensions library (PhysXExtensions).</li>
+					<li>Wheel shapes have been replaced by the more flexible entity PxWheel. A default wheel implementation, encapsulating most of the old wheel functionality, can be found in the PhysX extensions library (see PxDefaultWheel).</li>
+					<li>The NxUtilLib library has been removed. Sweep/overlap/raycast tests and other helper methods can be found in the new GeomUtils library.</li>
+					<li>Materials can no longer be accessed through indices. Per triangle material meshes need a material table which can be specified per shape (see PxShape::setMaterials() for details).</li>
+					<li>The default material is not available anymore.</li>
+				</ul>
 
+				<h4>Particle Systems, Particle Fluids</h4>
+				<ul>
+					<li>
+						The NxFluid class has been replaced with two classes for separation of functionality and ease of use.
+						<ul>
+							<li>PxParticleSystem: Particles colliding against the scene.</li>
+							<li>PxParticleFluid: Particles simulating a fluid (sph).</li>
+						</ul>
+					</li>
+					<li>
+						Simplified parameterization for particle systems.
+						<ul>
+							<li>Using absolute distances instead of relative multipliers to rest spacing</li>
+							<li>Simplified sph parameters</li>
+							<li>Unified collision parameters with deformable and rigid body features</li>
+						</ul>
+					</li>
+					<li>
+						Creating and releasing particles is now fully controlled by the application.
+						<ul>
+							<li>Particle lifetime management isn't provided through the SDK anymore.</li>
+							<li>Emitters have been removed from the SDK.</li>
+							<li>Drain shapes don't cause particles to be deleted directly, but to be flagged instead.</li>
+							<li>Initial particle creation from the particle system descriptor isn't supported anymore.</li>
+						</ul>
+					</li>
+					<li>Particle data buffer handling has been moved to the SDK.</li>
+					<li>Per particle collision rest offset.</li>
+					<li>
+						GPU accelerated particle systems.
+						<ul>
+							<li>Application controllable mesh mirroring to device memory.</li>
+							<li>Runtime switching between software and GPU accelerated implementation.</li>
+						</ul>
+					</li>
+				</ul>
+			</blockquote>
 
+			<h2><br>Supported Platforms (available in separate packages)</h2>
 
+			<blockquote>
+				<h4>Runtime</h4>
+				<ul>
+					<li>Microsoft Windows XP or and later</li>
+					<li>Microsoft XBox360</li>
+					<li>Sony Playstation 3</li>
+				</ul>
+				<h4>Development</h4>
+				<ul>
+					<li>Microsoft Windows XP or and later</li>
+					<li>Microsoft Visual Studio 2008</li>
+				</ul>
+			</blockquote>
 
+			<h2><br><a name="knownissues300">Known Issues And Limitations</a></h2>
 
+			<blockquote>
+				<h4>Rigid Bodies</h4>
+				<ul>
+					<li>Adding or removing a PxAggregate object to the scene is not possible while the simulation is running.</li>
+				</ul>
+				<h4>Particle Systems</h4>
+				<ul>
+					<li>
+						Releasing the Physics SDK may result in a crash when using GPU accelerated particle systems. </br>
+						This can be avoided by doing the following before releasing the Physics SDK:
+						<ul>
+							<li>Releasing the PxScene objects that contain the GPU accelerated particle systems.</li>
+							<li>Releasing application mirrored meshes by calling PxPhysicsGpu::releaseTriangleMeshMirror(...), PxPhysicsGpu::releaseHeightFieldMirror(...) or PxPhysicsGpu::releaseConvexMeshMirror(...).</li>
+						</ul>
+					</li>
+				</ul>
+			</blockquote>
 
-<p><br>
-Copyright (C) 2008-2018 NVIDIA Corporation, 2701 San Thomas Expressway, Santa Clara, CA 95050 U.S.A. All rights reserved. <A href="http://www.nvidia.com">www.nvidia.com</A>
-</p>
+			<p>
+				<br>
+				Copyright (C) 2008-2018 NVIDIA Corporation, 2701 San Thomas Expressway, Santa Clara, CA 95050 U.S.A. All rights reserved. <A href="http://www.nvidia.com">www.nvidia.com</A>
+			</p>
 </body>
 </html>
diff --git a/physx/samples/pxtoolkit/include/PxTkRandom.h b/physx/samples/pxtoolkit/include/PxTkRandom.h
index 652db894e..ce083dffe 100644
--- a/physx/samples/pxtoolkit/include/PxTkRandom.h
+++ b/physx/samples/pxtoolkit/include/PxTkRandom.h
@@ -50,17 +50,6 @@ namespace PxToolkit
 		PX_FORCE_INLINE	PxU32		rand()						{ return randomize() & 0xffff;							}
 		PX_FORCE_INLINE	PxU32		rand32()					{ return randomize() & 0xffffffff;						}
 
-						PxF32		randLegacy(PxF32 a, PxF32 b)
-									{
-										const PxF32 r = static_cast<PxF32>(rand())/(static_cast<PxF32>(0x7fff)+1.0f);
-										return r*(b-a) + a;
-									}
-
-						PxI32		randLegacy(PxI32 a, PxI32 b)
-									{
-										return a + static_cast<PxI32>(rand()%(b-a));
-									}
-
 						PxF32		rand(PxF32 a, PxF32 b)
 									{
 										const PxF32 r = rand32()/(static_cast<PxF32>(0xffffffff));
diff --git a/physx/samples/sampleframework/platform/include/windows/WindowsSamplePlatform.h b/physx/samples/sampleframework/platform/include/windows/WindowsSamplePlatform.h
index 74de1280b..95b48535d 100644
--- a/physx/samples/sampleframework/platform/include/windows/WindowsSamplePlatform.h
+++ b/physx/samples/sampleframework/platform/include/windows/WindowsSamplePlatform.h
@@ -132,6 +132,7 @@ namespace SampleFramework
 		physx::PxVec2						getMouseCursorPos() const { return m_mouseCursorPos; }
 		void								setMouseCursorPos(const physx::PxVec2& pos) { m_mouseCursorPos = pos; }
 		void								recenterMouseCursor(bool generateEvent);
+		void								setWorkaroundMouseMoved() { m_workaroundMouseMoved = true;  }
 	
 	protected:	
 		IDirect3D9*							m_d3d;
@@ -154,6 +155,7 @@ namespace SampleFramework
 		char								m_platformName[256];
 		WindowsSampleUserInput				m_windowsSampleUserInput;
 		physx::PxVec2						m_mouseCursorPos;
+		bool								m_workaroundMouseMoved;
 		bool								m_recenterMouseCursor;
 		bool								m_vsync;
 	};
diff --git a/physx/samples/sampleframework/platform/src/windows/WindowsSamplePlatform.cpp b/physx/samples/sampleframework/platform/src/windows/WindowsSamplePlatform.cpp
index c5512d8a6..0b6b6a338 100644
--- a/physx/samples/sampleframework/platform/src/windows/WindowsSamplePlatform.cpp
+++ b/physx/samples/sampleframework/platform/src/windows/WindowsSamplePlatform.cpp
@@ -119,7 +119,7 @@ static void handleMouseEvent(UINT msg, LPARAM lParam, HWND hwnd, RendererWindow*
 	WindowsPlatform& platform = *((WindowsPlatform*)window->getPlatform());
 	PxVec2 diff = current - platform.getMouseCursorPos();
 	platform.setMouseCursorPos(current);
-
+	
 	switch (msg)
 	{
 	case WM_MOUSEMOVE:
@@ -127,6 +127,11 @@ static void handleMouseEvent(UINT msg, LPARAM lParam, HWND hwnd, RendererWindow*
 		{
 			platform.getWindowsSampleUserInput().doOnMouseMove(x, y, diff.x, diff.y, MOUSE_MOVE);
 		}
+		else
+		{
+			//needed in WindowsPlatform::recenterMouseCursor to filter out erroneous cursor difference, even though mouse is not moving.
+			((WindowsPlatform*)window->getPlatform())->setWorkaroundMouseMoved();
+		}
 		break;
 	case WM_LBUTTONDOWN:
 		SetCapture(hwnd);
@@ -428,6 +433,7 @@ m_vsync(false)
 
 	m_mouseCursorPos = PxVec2(0);
 	m_recenterMouseCursor = false;
+	m_workaroundMouseMoved = false;
 }
 
 WindowsPlatform::~WindowsPlatform()
@@ -542,7 +548,7 @@ void WindowsPlatform::recenterMouseCursor(bool generateEvent)
 
 		POINT winPos;
 		GetCursorPos(&winPos);
-		PxVec2 current(static_cast<PxReal>(winPos.x), winHeight - static_cast<PxReal>(winPos.y));
+		PxVec2 currentCoords(static_cast<PxReal>(winPos.x), winHeight - static_cast<PxReal>(winPos.y));
 
 		PxI32 winCenterX = 0;
 		PxI32 winCenterY = 0;
@@ -551,14 +557,18 @@ void WindowsPlatform::recenterMouseCursor(bool generateEvent)
 			winCenterY = rect.top + PxI32(winHeight>>1);
 		}
 		SetCursorPos(winCenterX, winCenterY);
+		PxVec2 centerCoords(static_cast<PxReal>(winCenterX), winHeight - static_cast<PxReal>(winCenterY));
+		PxVec2 diff = currentCoords - centerCoords;
 
-		if (generateEvent)
+		//sometimes GetCursorPos [frame N+1] and SetCursorPos [frame N] don't agree by a few pixels, even if the mouse is not moving.
+		//the windows event queue however knows that the mouse is not moving -> m_workaroundMouseMoved
+		if (generateEvent && m_workaroundMouseMoved)
 		{
-			PxVec2 diff = current - PxVec2(static_cast<PxReal>(winCenterX), winHeight - static_cast<PxReal>(winCenterY));
 			getWindowsSampleUserInput().doOnMouseMove(winCenterX, winHeight - winCenterY, diff.x, diff.y, MOUSE_MOVE);
 		}
 	
-		m_mouseCursorPos = current;
+		m_mouseCursorPos = currentCoords;
+		m_workaroundMouseMoved = false;
 	}
 }
 
diff --git a/physx/source/immediatemode/src/NpImmediateMode.cpp b/physx/source/immediatemode/src/NpImmediateMode.cpp
index 7b0cb0860..36383ccd1 100644
--- a/physx/source/immediatemode/src/NpImmediateMode.cpp
+++ b/physx/source/immediatemode/src/NpImmediateMode.cpp
@@ -234,14 +234,14 @@ namespace physx
 			const PxRigidBodyData& rigidData = inRigidData[a];
 			PxVec3 lv = rigidData.linearVelocity, av = rigidData.angularVelocity;
 			Dy::bodyCoreComputeUnconstrainedVelocity(gravity, dt, rigidData.linearDamping, rigidData.angularDamping, 1.f, rigidData.maxLinearVelocitySq, rigidData.maxAngularVelocitySq, lv, av, false);
-			Dy::copyToSolverBodyData(lv, av, rigidData.invMass, rigidData.invInertia, rigidData.body2World, -rigidData.maxDepenetrationVelocity, rigidData.maxContactImpulse, IG_INVALID_NODE, PX_MAX_F32, outSolverBodyData[a], 0, false);
+			Dy::copyToSolverBodyData(lv, av, rigidData.invMass, rigidData.invInertia, rigidData.body2World, -rigidData.maxDepenetrationVelocity, rigidData.maxContactImpulse, IG_INVALID_NODE, PX_MAX_F32, outSolverBodyData[a], 0);
 		}
 	}
 
 	void immediate::PxConstructStaticSolverBody(const PxTransform& globalPose, PxSolverBodyData& solverBodyData)
 	{
 		PxVec3 zero(0.f);
-		Dy::copyToSolverBodyData(zero, zero, 0.f, zero, globalPose, -PX_MAX_F32, PX_MAX_F32, IG_INVALID_NODE, PX_MAX_F32, solverBodyData, 0, false);
+		Dy::copyToSolverBodyData(zero, zero, 0.f, zero, globalPose, -PX_MAX_F32, PX_MAX_F32, IG_INVALID_NODE, PX_MAX_F32, solverBodyData, 0);
 	}
 
 
diff --git a/physx/source/lowlevel/common/include/pipeline/PxcRigidBody.h b/physx/source/lowlevel/common/include/pipeline/PxcRigidBody.h
index 86730507d..2f757d5d0 100644
--- a/physx/source/lowlevel/common/include/pipeline/PxcRigidBody.h
+++ b/physx/source/lowlevel/common/include/pipeline/PxcRigidBody.h
@@ -61,7 +61,6 @@ class PxcRigidBody
 		eDEACTIVATE_THIS_FRAME =		1 << 4,
 		eDISABLE_GRAVITY =				1 << 5,
 		eSPECULATIVE_CCD =				1 << 6,
-		eHAS_SURFACE_VELOCITY =			1 << 7,
 		//KS - copied here for GPU simulation to avoid needing to pass another set of flags around.
 		eLOCK_LINEAR_X =				1 << (PX_INTERNAL_LOCK_FLAG_START),
 		eLOCK_LINEAR_Y =				1 << (PX_INTERNAL_LOCK_FLAG_START + 1),
diff --git a/physx/source/lowleveldynamics/include/DyVArticulation.h b/physx/source/lowleveldynamics/include/DyVArticulation.h
index dede99944..ac81ccd85 100644
--- a/physx/source/lowleveldynamics/include/DyVArticulation.h
+++ b/physx/source/lowleveldynamics/include/DyVArticulation.h
@@ -124,12 +124,7 @@ namespace physx
 				swingLimitContactDistance = 0.05f;
 				twistLimitContactDistance = 0.05f;
 
-				//ML: the old articulation system, I am not sure I will break anything if
-				//I set the driveType default to be NONE but driveType to be eNONE is required
-				//by the new articulation system.
-				//driveType = PxArticulationJointDriveType::eTARGET;
-
-				driveType = PxArticulationJointDriveType::eNONE;
+				driveType = PxArticulationJointDriveType::eTARGET;
 
 				jointType = PxArticulationJointType::eFIX;
 
@@ -154,7 +149,7 @@ namespace physx
 				childPose = childFrame;
 
 				//the old articulation is eTARGET
-				driveType = PxArticulationJointDriveType::eNONE;
+				driveType = PxArticulationJointDriveType::eTARGET;
 
 				spring = 0.0f;
 				damping = 0.0f;
diff --git a/physx/source/lowleveldynamics/src/DyContactPrep.cpp b/physx/source/lowleveldynamics/src/DyContactPrep.cpp
index 6126ce229..1332c435d 100644
--- a/physx/source/lowleveldynamics/src/DyContactPrep.cpp
+++ b/physx/source/lowleveldynamics/src/DyContactPrep.cpp
@@ -578,7 +578,7 @@ bool createFinalizeSolverContacts(
 		return true;
 	}
 
-	if (!disableStrongFriction && !contactDesc.data0->hasSurfaceVelocity && !contactDesc.data1->hasSurfaceVelocity)
+	if (!disableStrongFriction)
 	{
 		getFrictionPatches(c, contactDesc.frictionPtr, contactDesc.frictionCount, contactDesc.bodyFrame0, contactDesc.bodyFrame1, correlationDistance);
 	}
diff --git a/physx/source/lowleveldynamics/src/DyContactPrep4.cpp b/physx/source/lowleveldynamics/src/DyContactPrep4.cpp
index 6062fad08..a5641b7e7 100644
--- a/physx/source/lowleveldynamics/src/DyContactPrep4.cpp
+++ b/physx/source/lowleveldynamics/src/DyContactPrep4.cpp
@@ -1304,7 +1304,7 @@ SolverConstraintPrepState::Enum createFinalizeSolverContacts4(
 		invInertiaScale1[a] = blockDesc.mInvMassScales.angular1;
 
 		blockDesc.startFrictionPatchIndex = c.frictionPatchCount;
-		if (!(blockDesc.disableStrongFriction && !blockDesc.data0->hasSurfaceVelocity && !blockDesc.data1->hasSurfaceVelocity))
+		if (!(blockDesc.disableStrongFriction))
 		{
 			bool valid = getFrictionPatches(c, blockDesc.frictionPtr, blockDesc.frictionCount,
 				blockDesc.bodyFrame0, blockDesc.bodyFrame1, correlationDistance);
diff --git a/physx/source/lowleveldynamics/src/DyDynamics.cpp b/physx/source/lowleveldynamics/src/DyDynamics.cpp
index 95e4724d9..26fe049b1 100644
--- a/physx/source/lowleveldynamics/src/DyDynamics.cpp
+++ b/physx/source/lowleveldynamics/src/DyDynamics.cpp
@@ -206,7 +206,6 @@ DynamicsContext::DynamicsContext(	PxcNpMemBlockPool* memBlockPool,
 	mWorldSolverBodyData.linearVelocity = mWorldSolverBodyData.angularVelocity = PxVec3(0.f);
 	mWorldSolverBodyData.body2World = PxTransform(PxIdentity);
 	mWorldSolverBodyData.lockFlags = 0;
-	mWorldSolverBodyData.hasSurfaceVelocity = 0;
 	mSolverCore[PxFrictionType::ePATCH] = SolverCoreGeneral::create(frictionEveryIteration);
 	mSolverCore[PxFrictionType::eONE_DIRECTIONAL] = SolverCoreGeneralPF::create();
 	mSolverCore[PxFrictionType::eTWO_DIRECTIONAL] = SolverCoreGeneralPF::create();
@@ -2215,7 +2214,7 @@ class KinematicCopyTask : public Cm::Task
 			PxsRigidBody* rigidBody = mIslandSim.getRigidBody(mKinematicIndices[i]);
 			const PxsBodyCore& core = rigidBody->getCore();
 			copyToSolverBodyData(core.linearVelocity, core.angularVelocity, core.inverseMass, core.inverseInertia, core.body2World, core.maxPenBias,
-				core.maxContactImpulse, mKinematicIndices[i].index(), core.contactReportThreshold, mBodyData[i + 1], core.lockFlags, !!(rigidBody->mInternalFlags & PxsRigidBody::eHAS_SURFACE_VELOCITY));
+				core.maxContactImpulse, mKinematicIndices[i].index(), core.contactReportThreshold, mBodyData[i + 1], core.lockFlags);
 			rigidBody->saveLastCCDTransform();
 		}
 	}
@@ -2565,7 +2564,7 @@ static void preIntegrationParallel(
 			core.linearVelocity, core.angularVelocity, !!(rBody.mInternalFlags & PxcRigidBody::eDISABLE_GRAVITY));
 
 		copyToSolverBodyData(core.linearVelocity, core.angularVelocity, core.inverseMass, core.inverseInertia, core.body2World, core.maxPenBias, core.maxContactImpulse, nodeIndexArray[i], 
-			core.contactReportThreshold, solverBodyDataPool[i + 1], core.lockFlags, false);
+			core.contactReportThreshold, solverBodyDataPool[i + 1], core.lockFlags);
 		solverBodyPool[i].solverProgress = 0;
 		solverBodyPool[i].maxSolverNormalProgress = 0;
 		solverBodyPool[i].maxSolverFrictionProgress = 0;
@@ -2582,7 +2581,7 @@ static void preIntegrationParallel(
 		core.linearVelocity, core.angularVelocity, !!(rBody.mInternalFlags & PxcRigidBody::eDISABLE_GRAVITY));
 
 	copyToSolverBodyData(core.linearVelocity, core.angularVelocity, core.inverseMass, core.inverseInertia, core.body2World, core.maxPenBias, core.maxContactImpulse, nodeIndexArray[i], 
-		core.contactReportThreshold, solverBodyDataPool[i + 1], core.lockFlags, false);
+		core.contactReportThreshold, solverBodyDataPool[i + 1], core.lockFlags);
 	solverBodyPool[i].solverProgress = 0;
 	solverBodyPool[i].maxSolverNormalProgress = 0;
 	solverBodyPool[i].maxSolverFrictionProgress = 0;
diff --git a/physx/source/lowleveldynamics/src/DyRigidBodyToSolverBody.cpp b/physx/source/lowleveldynamics/src/DyRigidBodyToSolverBody.cpp
index d59d02074..3844fa5bd 100644
--- a/physx/source/lowleveldynamics/src/DyRigidBodyToSolverBody.cpp
+++ b/physx/source/lowleveldynamics/src/DyRigidBodyToSolverBody.cpp
@@ -36,8 +36,7 @@ using namespace physx;
 
 // PT: TODO: SIMDify all this... So far I only took care of the quat-to-matrix transform
 void Dy::copyToSolverBodyData(const PxVec3& linearVelocity, const PxVec3& angularVelocity, const PxReal invMass, const PxVec3& invInertia, const PxTransform& globalPose,
-	const PxReal maxDepenetrationVelocity, const PxReal maxContactImpulse, const PxU32 nodeIndex, const PxReal reportThreshold, PxSolverBodyData& data, PxU32 lockFlags,
-	bool hasSurfaceVelocity)
+	const PxReal maxDepenetrationVelocity, const PxReal maxContactImpulse, const PxU32 nodeIndex, const PxReal reportThreshold, PxSolverBodyData& data, PxU32 lockFlags)
 {
 	data.nodeIndex = nodeIndex;
 
@@ -88,8 +87,6 @@ void Dy::copyToSolverBodyData(const PxVec3& linearVelocity, const PxVec3& angula
 	data.maxContactImpulse = maxContactImpulse;
 	data.body2World = globalPose;
 
-	data.hasSurfaceVelocity = PxU16(hasSurfaceVelocity);
-
 	data.lockFlags = PxU16(lockFlags);
 
 	data.reportThreshold = reportThreshold;
diff --git a/physx/source/lowleveldynamics/src/DySolverBody.h b/physx/source/lowleveldynamics/src/DySolverBody.h
index d285b5721..12a9ca045 100644
--- a/physx/source/lowleveldynamics/src/DySolverBody.h
+++ b/physx/source/lowleveldynamics/src/DySolverBody.h
@@ -58,13 +58,12 @@ PX_FORCE_INLINE PxVec3 computeSafeSqrtInertia(const PxVec3& v)
 }
 
 void copyToSolverBodyData(const PxVec3& linearVelocity, const PxVec3& angularVelocity, const PxReal invMass, const PxVec3& invInertia, const PxTransform& globalPose,
-	const PxReal maxDepenetrationVelocity, const PxReal maxContactImpulse, const PxU32 nodeIndex, const PxReal reportThreshold, PxSolverBodyData& solverBodyData, PxU32 lockFlags,
-	bool hasSurfaceVelocity);
+	const PxReal maxDepenetrationVelocity, const PxReal maxContactImpulse, const PxU32 nodeIndex, const PxReal reportThreshold, PxSolverBodyData& solverBodyData, PxU32 lockFlags);
 
 // PT: TODO: using PxsBodyCore in the interface makes us write less data to the stack for passing arguments, and we can take advantage of the class layout
 // (we know what is aligned or not, we know if it is safe to V4Load vectors, etc). Note that this is what we previously had, which is why PxsBodyCore was still
 // forward-referenced above.
-//void copyToSolverBodyData(PxSolverBodyData& solverBodyData, const PxsBodyCore& core, const PxU32 nodeIndex, bool hasSurfaceVelocity);
+//void copyToSolverBodyData(PxSolverBodyData& solverBodyData, const PxsBodyCore& core, const PxU32 nodeIndex);
 
 }
 
diff --git a/physx/source/lowleveldynamics/src/DyTGSContactPrepBlock.cpp b/physx/source/lowleveldynamics/src/DyTGSContactPrepBlock.cpp
index 589f5193d..ce4340483 100644
--- a/physx/source/lowleveldynamics/src/DyTGSContactPrepBlock.cpp
+++ b/physx/source/lowleveldynamics/src/DyTGSContactPrepBlock.cpp
@@ -1478,7 +1478,7 @@ SolverConstraintPrepState::Enum createFinalizeSolverContacts4Step(
 		invInertiaScale1[a] = blockDesc.mInvMassScales.angular1;
 
 		blockDesc.startFrictionPatchIndex = c.frictionPatchCount;
-		if (!(blockDesc.disableStrongFriction && !blockDesc.body0->hasSurfaceVelocity && !blockDesc.body1->hasSurfaceVelocity))
+		if (!(blockDesc.disableStrongFriction))
 		{
 			bool valid = getFrictionPatches(c, blockDesc.frictionPtr, blockDesc.frictionCount,
 				blockDesc.bodyFrame0, blockDesc.bodyFrame1, correlationDistance);
diff --git a/physx/source/lowleveldynamics/src/DyTGSDynamics.cpp b/physx/source/lowleveldynamics/src/DyTGSDynamics.cpp
index 49e47d6a4..3b4380855 100644
--- a/physx/source/lowleveldynamics/src/DyTGSDynamics.cpp
+++ b/physx/source/lowleveldynamics/src/DyTGSDynamics.cpp
@@ -194,7 +194,6 @@ void copyToSolverBodyDataStep(const PxVec3& linearVelocity, const PxVec3& angula
 	solverVel.lockFlags = PxU16(lockFlags);
 	solverVel.isKinematic = isKinematic;
 	solverVel.maxAngVel = PxSqrt(maxAngVelSq);
-	solverVel.hasSurfaceVelocity = false; //TODO - hook up!
 	solverVel.partitionMask = 0;
 
 	
@@ -232,7 +231,6 @@ void copyToSolverBodyDataStepKinematic(const PxVec3& linearVelocity, const PxVec
 	solverVel.lockFlags = 0;
 	solverVel.isKinematic = true;
 	solverVel.maxAngVel = PxSqrt(maxAngVelSq);
-	solverVel.hasSurfaceVelocity = false; //TODO - hook up!
 	solverVel.partitionMask = 0;
 
 	solverBodyData.nodeIndex = nodeIndex;
@@ -282,7 +280,6 @@ DynamicsTGSContext::DynamicsTGSContext(PxcNpMemBlockPool* memBlockPool,
 	
 	mWorldSolverBodyVel.lockFlags = 0;
 	mWorldSolverBodyVel.isKinematic = false;
-	mWorldSolverBodyVel.hasSurfaceVelocity = false;
 	mWorldSolverBodyTxInertia.sqrtInvInertia = PxMat33(PxZero);
 	mWorldSolverBodyTxInertia.deltaBody2World = PxTransform(PxIdentity);
 	
diff --git a/physx/source/lowleveldynamics/src/DyThreadContext.h b/physx/source/lowleveldynamics/src/DyThreadContext.h
index 0c497c81d..a2327825b 100644
--- a/physx/source/lowleveldynamics/src/DyThreadContext.h
+++ b/physx/source/lowleveldynamics/src/DyThreadContext.h
@@ -67,7 +67,7 @@ namespace Dy
 		PxVec3			deltaLinDt;				//60
 		PxU16			lockFlags;				//62
 		bool			isKinematic;			//63
-		bool			hasSurfaceVelocity;		//64
+		PxU8			pad;					//64
 
 		PX_FORCE_INLINE PxReal projectVelocity(const PxVec3& lin, const PxVec3& ang)	const
 		{
diff --git a/physx/source/physx/src/NpArticulationReducedCoordinate.cpp b/physx/source/physx/src/NpArticulationReducedCoordinate.cpp
index 85428c4a8..0fad29d2e 100644
--- a/physx/source/physx/src/NpArticulationReducedCoordinate.cpp
+++ b/physx/source/physx/src/NpArticulationReducedCoordinate.cpp
@@ -66,6 +66,19 @@ namespace physx
 		mImpl.getArticulation().setArticulationFlags(flags);
 	}
 
+	void NpArticulationReducedCoordinate::setArticulationFlag(PxArticulationFlag::Enum flag, bool value)
+	{
+		NP_WRITE_CHECK(mImpl.getOwnerScene());
+		PxArticulationFlags flags = mImpl.getArticulation().getArticulationFlags();
+
+		if(value)
+			flags |= flag;
+		else
+			flags &= (~flag);
+
+		mImpl.getArticulation().setArticulationFlags(flags);
+	}
+
 	PxArticulationFlags	NpArticulationReducedCoordinate::getArticulationFlags() const
 	{
 		NP_READ_CHECK(mImpl.getOwnerScene());
diff --git a/physx/source/physx/src/NpArticulationReducedCoordinate.h b/physx/source/physx/src/NpArticulationReducedCoordinate.h
index 08250ac60..5ac8c9181 100644
--- a/physx/source/physx/src/NpArticulationReducedCoordinate.h
+++ b/physx/source/physx/src/NpArticulationReducedCoordinate.h
@@ -87,6 +87,8 @@ namespace physx
 		//---------------------------------------------------------------------------------
 		virtual		void						setArticulationFlags(PxArticulationFlags flags);
 
+		virtual		void						setArticulationFlag(PxArticulationFlag::Enum flag, bool value);
+
 		virtual		PxArticulationFlags			getArticulationFlags() const;
 
 		virtual		PxU32						getDofs() const;
diff --git a/physx/source/physx/src/NpRigidDynamic.cpp b/physx/source/physx/src/NpRigidDynamic.cpp
index 6651dde58..0be295f9a 100644
--- a/physx/source/physx/src/NpRigidDynamic.cpp
+++ b/physx/source/physx/src/NpRigidDynamic.cpp
@@ -151,39 +151,6 @@ bool NpRigidDynamic::getKinematicTarget(PxTransform& target) const
 	return false;
 }
 
-void NpRigidDynamic::setKinematicSurfaceVelocity(const PxVec3& linearVelocity, const PxVec3& angularVelocity)
-{
-	NP_WRITE_CHECK(NpActor::getOwnerScene(*this));
-
-	Scb::Body& b = getScbBodyFast();
-
-#if PX_CHECKED
-	NpScene* scene = NpActor::getAPIScene(*this);
-	PX_CHECK_AND_RETURN((b.getFlags() & PxRigidBodyFlag::eKINEMATIC), "PxRigidDynamic::setKinematicSurfaceVelocity: Body must be kinematic!");
-	PX_CHECK_AND_RETURN(scene, "PxRigidDynamic::setKinematicSurfaceVelocity: Body must be in a scene!");
-	PX_CHECK_AND_RETURN(!(b.getActorFlags() & PxActorFlag::eDISABLE_SIMULATION), "PxRigidDynamic::setKinematicSurfaceVelocity: Not allowed if PxActorFlag::eDISABLE_SIMULATION is set!");
-#endif
-
-	b.setKinematicSurfaceVelocity(linearVelocity, angularVelocity);
-}
-
-void NpRigidDynamic::clearKinematicSurfaceVelocity()
-{
-	NP_WRITE_CHECK(NpActor::getOwnerScene(*this));
-
-	Scb::Body& b = getScbBodyFast();
-
-#if PX_CHECKED
-	NpScene* scene = NpActor::getAPIScene(*this);
-	PX_CHECK_AND_RETURN((b.getFlags() & PxRigidBodyFlag::eKINEMATIC), "PxRigidDynamic::clearKinematicSurfaceVelocity: Body must be kinematic!");
-	PX_CHECK_AND_RETURN(scene, "PxRigidDynamic::clearKinematicSurfaceVelocity: Body must be in a scene!");
-	PX_CHECK_AND_RETURN(!(b.getActorFlags() & PxActorFlag::eDISABLE_SIMULATION), "PxRigidDynamic::clearKinematicSurfaceVelocity: Not allowed if PxActorFlag::eDISABLE_SIMULATION is set!");
-#endif
-
-	b.clearKinematicSurfaceVelocity();
-}
-
-
 void NpRigidDynamic::setCMassLocalPose(const PxTransform& pose)
 {
 	PX_CHECK_AND_RETURN(pose.isSane(), "PxRigidDynamic::setCMassLocalPose pose is not valid.");
diff --git a/physx/source/physx/src/NpRigidDynamic.h b/physx/source/physx/src/NpRigidDynamic.h
index e5cfd4d6e..d6714a164 100644
--- a/physx/source/physx/src/NpRigidDynamic.h
+++ b/physx/source/physx/src/NpRigidDynamic.h
@@ -89,10 +89,6 @@ class NpRigidDynamic : public NpRigidDynamicT
 	virtual		void				setKinematicTarget(const PxTransform& destination);
 	virtual		bool				getKinematicTarget(PxTransform& target)	const;
 
-	virtual		void				setKinematicSurfaceVelocity(const PxVec3& linearVelocity, const PxVec3& angularVelocity);
-
-	virtual		void				clearKinematicSurfaceVelocity();
-
 	// Center of mass pose
 	virtual		void				setCMassLocalPose(const PxTransform&);
 
diff --git a/physx/source/physx/src/buffering/ScbBody.h b/physx/source/physx/src/buffering/ScbBody.h
index 385c00c56..b63dcbee2 100644
--- a/physx/source/physx/src/buffering/ScbBody.h
+++ b/physx/source/physx/src/buffering/ScbBody.h
@@ -261,9 +261,6 @@ class Body : public Scb::RigidObject
 	PX_INLINE		void				syncState();
 	PX_INLINE		void				syncCollisionWriteThroughState();
 
-	void								setKinematicSurfaceVelocity(const PxVec3& linearVelocity, const PxVec3& angularVelocity);
-	void								clearKinematicSurfaceVelocity();
-
 	static size_t getScOffset()	{ return reinterpret_cast<size_t>(&reinterpret_cast<Body*>(0)->mBodyCore);	}
 	
 	/**
@@ -694,48 +691,6 @@ PX_INLINE void Body::setKinematicTarget(const PxTransform& p)
 #endif
 }
 
-
-PX_INLINE void Body::setKinematicSurfaceVelocity(const PxVec3& linearVelocity, const PxVec3& angularVelocity)
-{
-	Scene* scene = getScbScene();
-	PX_ASSERT(scene);  // only allowed for an object in a scene
-	PxReal wakeCounterResetValue = scene->getWakeCounterResetValue();
-
-	if (!isBuffering())
-	{
-		mBodyCore.setKinematicSurfaceVelocity(linearVelocity, angularVelocity);
-		mBodyCore.wakeUp(wakeCounterResetValue);
-		setBufferedParamsForAwake(wakeCounterResetValue);
-
-		UPDATE_PVD_PROPERTIES_OBJECT()
-	}
-	else
-	{
-		setLinearVelocity(linearVelocity);
-		setAngularVelocity(angularVelocity);
-
-		wakeUpInternal(wakeCounterResetValue);
-	}
-}
-
-PX_INLINE void Body::clearKinematicSurfaceVelocity()
-{
-	if (!isBuffering())
-	{
-		mBodyCore.invalidateSurfaceVelocity();
-		setBufferedParamsForAsleep();
-		UPDATE_PVD_PROPERTIES_OBJECT()
-	}
-	else
-	{
-		setLinearVelocity(PxVec3(0.0f));
-		setAngularVelocity(PxVec3(0.0f));
-
-		putToSleepInternal();
-	}
-}
-
-
 PX_FORCE_INLINE	void Body::onOriginShift(const PxVec3& shift)
 {
 	mBufferedBody2World.p -= shift;
@@ -870,57 +825,39 @@ PX_INLINE void Body::syncCollisionWriteThroughState()
 {
 	PxU32 bufferFlags = mBodyBufferFlags;
 
-	//- If application set kinematic surface velocity
-	if (bufferFlags & (Buf::BF_LinearVelocity | Buf::BF_AngularVelocity) && (getFlags() & PxRigidBodyFlag::eKINEMATIC))
-	{
-		if (mBufferedLinVelocity.isZero() && mBufferedAngVelocity.isZero())
-		{
-			if(mBodyCore.getSimStateData(true))
-				mBodyCore.invalidateKinematicTarget();
-		}
-		else
-		{
-			mBodyCore.setKinematicSurfaceVelocity(mBufferedLinVelocity, mBufferedAngVelocity);
-		}
-		bufferFlags &= ~(Buf::BF_AngularVelocity | Buf::BF_LinearVelocity);
-	}
+	//----
+	if ((bufferFlags & Buf::BF_LinearVelocity) == 0)
+		mBufferedLinVelocity = mBodyCore.getLinearVelocity();
 	else
 	{
+		PX_ASSERT((mBufferedIsSleeping && mBufferedLinVelocity.isZero()) ||
+			(!mBufferedIsSleeping) ||
+			(getControlState() == ControlState::eREMOVE_PENDING));
+		PX_ASSERT(mBufferedLinVelocity.isZero() ||
+			((!mBufferedLinVelocity.isZero()) && (!mBufferedIsSleeping)) ||
+			(getControlState() == ControlState::eREMOVE_PENDING));
+
+		mBodyCore.setLinearVelocity(mBufferedLinVelocity);
+		//clear the flag
+		bufferFlags &= ~Buf::BF_LinearVelocity;
+	}
 
-		//----
-		if ((bufferFlags & Buf::BF_LinearVelocity) == 0)
-			mBufferedLinVelocity = mBodyCore.getLinearVelocity();
-		else
-		{
-			PX_ASSERT((mBufferedIsSleeping && mBufferedLinVelocity.isZero()) ||
-				(!mBufferedIsSleeping) ||
-				(getControlState() == ControlState::eREMOVE_PENDING));
-			PX_ASSERT(mBufferedLinVelocity.isZero() ||
-				((!mBufferedLinVelocity.isZero()) && (!mBufferedIsSleeping)) ||
-				(getControlState() == ControlState::eREMOVE_PENDING));
-
-			mBodyCore.setLinearVelocity(mBufferedLinVelocity);
-			//clear the flag
-			bufferFlags &= ~Buf::BF_LinearVelocity;
-		}
-
-		//----
+	//----
 
-		if ((bufferFlags & Buf::BF_AngularVelocity) == 0)
-			mBufferedAngVelocity = mBodyCore.getAngularVelocity();
-		else
-		{
-			PX_ASSERT((mBufferedIsSleeping && mBufferedAngVelocity.isZero()) ||
-				(!mBufferedIsSleeping) ||
-				(getControlState() == ControlState::eREMOVE_PENDING));
-			PX_ASSERT(mBufferedAngVelocity.isZero() ||
-				((!mBufferedAngVelocity.isZero()) && (!mBufferedIsSleeping)) ||
-				(getControlState() == ControlState::eREMOVE_PENDING));
-
-			mBodyCore.setAngularVelocity(mBufferedAngVelocity);
-			//clear the flag
-			bufferFlags &= ~Buf::BF_AngularVelocity;
-		}
+	if ((bufferFlags & Buf::BF_AngularVelocity) == 0)
+		mBufferedAngVelocity = mBodyCore.getAngularVelocity();
+	else
+	{
+		PX_ASSERT((mBufferedIsSleeping && mBufferedAngVelocity.isZero()) ||
+			(!mBufferedIsSleeping) ||
+			(getControlState() == ControlState::eREMOVE_PENDING));
+		PX_ASSERT(mBufferedAngVelocity.isZero() ||
+			((!mBufferedAngVelocity.isZero()) && (!mBufferedIsSleeping)) ||
+			(getControlState() == ControlState::eREMOVE_PENDING));
+
+		mBodyCore.setAngularVelocity(mBufferedAngVelocity);
+		//clear the flag
+		bufferFlags &= ~Buf::BF_AngularVelocity;
 	}
 
 	//----
diff --git a/physx/source/physxmetadata/core/include/PxAutoGeneratedMetaDataObjects.h b/physx/source/physxmetadata/core/include/PxAutoGeneratedMetaDataObjects.h
index 9688f7a4e..39a12b32a 100644
--- a/physx/source/physxmetadata/core/include/PxAutoGeneratedMetaDataObjects.h
+++ b/physx/source/physxmetadata/core/include/PxAutoGeneratedMetaDataObjects.h
@@ -742,10 +742,8 @@ template<> struct PxEnumTraits< physx::PxRigidDynamicLockFlag::Enum > { PxEnumTr
 	};
 
 	static PxU32ToName g_physx__PxArticulationJointDriveType__EnumConversion[] = {
-		{ "eNONE", static_cast<PxU32>( physx::PxArticulationJointDriveType::eNONE ) },
 		{ "eTARGET", static_cast<PxU32>( physx::PxArticulationJointDriveType::eTARGET ) },
 		{ "eERROR", static_cast<PxU32>( physx::PxArticulationJointDriveType::eERROR ) },
-		{ "eTARGETVELOCITY", static_cast<PxU32>( physx::PxArticulationJointDriveType::eTARGETVELOCITY ) },
 		{ NULL, 0 }
 	};
 
diff --git a/physx/source/simulationcontroller/include/ScBodyCore.h b/physx/source/simulationcontroller/include/ScBodyCore.h
index 34bcab1dd..c8c4f0842 100644
--- a/physx/source/simulationcontroller/include/ScBodyCore.h
+++ b/physx/source/simulationcontroller/include/ScBodyCore.h
@@ -155,8 +155,6 @@ namespace Sc
 						bool				getHasValidKinematicTarget() const;
 						void				setKinematicTarget(Ps::Pool<SimStateData>* simStateDataPool, const PxTransform& p, PxReal wakeCounter);
 						void				invalidateKinematicTarget();
-						void				setKinematicSurfaceVelocity(const PxVec3& linearVelocity, const PxVec3& angularVelocity);
-						void				invalidateSurfaceVelocity();
 						
 
 		PX_FORCE_INLINE	PxReal				getContactReportThreshold()	const	{ return mCore.contactReportThreshold;	}
diff --git a/physx/source/simulationcontroller/src/ScBodyCore.cpp b/physx/source/simulationcontroller/src/ScBodyCore.cpp
index aed94d818..536730b26 100644
--- a/physx/source/simulationcontroller/src/ScBodyCore.cpp
+++ b/physx/source/simulationcontroller/src/ScBodyCore.cpp
@@ -519,41 +519,6 @@ void Sc::BodyCore::putToSleep()
 		sim->putToSleep();
 }
 
-void Sc::BodyCore::setKinematicSurfaceVelocity(const PxVec3& linearVelocity, const PxVec3& angularVelocity)
-{
-	if (mSimStateData)
-	{
-		invalidateKinematicTarget();
-	}
-
-	mCore.linearVelocity = linearVelocity;
-	mCore.angularVelocity = angularVelocity;
-
-	BodySim* sim = getSim();
-	if (sim)
-	{
-		//update bodysim
-		Sc::Scene& scene = sim->getScene();
-		scene.getSimulationController()->updateDynamic(sim->isArticulationLink(), sim->getNodeIndex());
-		sim->postSetKinematicSurfaceVelocity();
-	}
-}
-
-void Sc::BodyCore::invalidateSurfaceVelocity()
-{
-	mCore.linearVelocity = PxVec3(0.0f);
-	mCore.angularVelocity = PxVec3(0.0f);
-
-	BodySim* sim = getSim();
-
-	if (sim)
-	{
-		Sc::Scene& scene = sim->getScene();
-		scene.getSimulationController()->updateDynamic(sim->isArticulationLink(), sim->getNodeIndex());
-		sim->postClearKinematicSurfaceVelocity();
-	}
-}
-
 void Sc::BodyCore::disableInternalCaching(bool disable)
 {
 	PX_ASSERT(!mSimStateData || mSimStateData->isKine());
diff --git a/physx/source/simulationcontroller/src/ScBodySim.cpp b/physx/source/simulationcontroller/src/ScBodySim.cpp
index 81da8f676..796a9e313 100644
--- a/physx/source/simulationcontroller/src/ScBodySim.cpp
+++ b/physx/source/simulationcontroller/src/ScBodySim.cpp
@@ -318,23 +318,6 @@ void BodySim::postSetKinematicTarget()
 
 	raiseInternalFlag(BF_KINEMATIC_MOVED);	// Important to set this here already because trigger interactions need to have this information when being activated.
 	clearInternalFlag(BF_KINEMATIC_SURFACE_VELOCITY);
-	mLLBody.mInternalFlags &= (~PxsRigidBody::eHAS_SURFACE_VELOCITY);
-}
-
-void BodySim::postSetKinematicSurfaceVelocity()
-{
-	PX_ASSERT(getBodyCore().getSimStateData(true));
-	PX_ASSERT(getBodyCore().getSimStateData(true)->isKine());
-	PX_ASSERT(!getBodyCore().getSimStateData(true)->getKinematicData()->targetValid);
-	clearInternalFlag(BF_KINEMATIC_MOVE_FLAGS);
-	raiseInternalFlag(BF_KINEMATIC_SURFACE_VELOCITY);
-	mLLBody.mInternalFlags |= PxsRigidBody::eHAS_SURFACE_VELOCITY;
-}
-
-void BodySim::postClearKinematicSurfaceVelocity()
-{
-	clearInternalFlag(BF_KINEMATIC_SURFACE_VELOCITY);
-	mLLBody.mInternalFlags &= (~PxsRigidBody::eHAS_SURFACE_VELOCITY);
 }
 
 static void updateBPGroup(ElementSim* current)
@@ -364,9 +347,6 @@ void BodySim::postSwitchToKinematic()
 
 void BodySim::postSwitchToDynamic()
 {
-	clearInternalFlag(BF_KINEMATIC_SURFACE_VELOCITY);
-	mLLBody.mInternalFlags &= (~PxsRigidBody::eHAS_SURFACE_VELOCITY);
-
 	mScene.getSimpleIslandManager()->setDynamic(mNodeIndex);
 
 	setForcesToDefaults(true);
diff --git a/physx/source/simulationcontroller/src/ScBodySim.h b/physx/source/simulationcontroller/src/ScBodySim.h
index eca7b0ee6..005779cf1 100644
--- a/physx/source/simulationcontroller/src/ScBodySim.h
+++ b/physx/source/simulationcontroller/src/ScBodySim.h
@@ -107,8 +107,6 @@ namespace Sc
 						void					postBody2WorldChange();
 						void					postSetWakeCounter(PxReal t, bool forceWakeUp);
 						void					postSetKinematicTarget();
-						void					postSetKinematicSurfaceVelocity();
-						void					postClearKinematicSurfaceVelocity();
 						void					postSwitchToKinematic();
 						void					postSwitchToDynamic();
 						void					postPosePreviewChange(const PxU32 posePreviewFlag);  // called when PxRigidBodyFlag::eENABLE_POSE_INTEGRATION_PREVIEW changes
diff --git a/physx/tools/binarymetadata/windows/win32.metaData b/physx/tools/binarymetadata/windows/win32.metaData
index d7fa76475..a1426697d 100644
Binary files a/physx/tools/binarymetadata/windows/win32.metaData and b/physx/tools/binarymetadata/windows/win32.metaData differ
diff --git a/physx/tools/binarymetadata/windows/win64.metaData b/physx/tools/binarymetadata/windows/win64.metaData
index 395014e63..5835ec7b4 100644
Binary files a/physx/tools/binarymetadata/windows/win64.metaData and b/physx/tools/binarymetadata/windows/win64.metaData differ
diff --git a/physx/tools/physxmetadatagenerator/generateMetaData.py b/physx/tools/physxmetadatagenerator/generateMetaData.py
index d5c7e75e1..86e763d75 100644
--- a/physx/tools/physxmetadatagenerator/generateMetaData.py
+++ b/physx/tools/physxmetadatagenerator/generateMetaData.py
@@ -124,9 +124,9 @@ def includeString(path):
 # find SDK_ROOT, EXTERNALS and PX_SHARED
 sdkRoot = utils.find_root_path(scriptDir, "source")
 
-if os.path.isdir(os.path.join(sdkRoot, "../PhysX_4.0")):
-	externalsRoot = os.path.join(sdkRoot, "../Externals")
-	pxSharedRoot = os.path.join(sdkRoot, "../PxShared")
+if os.path.isdir(os.path.join(sdkRoot, "../physx")):
+	externalsRoot = os.path.join(sdkRoot, "../externals")
+	pxSharedRoot = os.path.join(sdkRoot, "../pxshared")
 else:
 	externalsRoot = os.path.join(utils.find_root_path(scriptDir, os.path.normpath("externals/clang")), "externals")
 	pxSharedRoot = os.path.normpath(os.environ['PM_PxShared_PATH'])
diff --git a/physx/tools/physxmetadatagenerator/generateMetaData.sh b/physx/tools/physxmetadatagenerator/generateMetaData.sh
old mode 100644
new mode 100755
index 829015bd8..e63f2dd65
--- a/physx/tools/physxmetadatagenerator/generateMetaData.sh
+++ b/physx/tools/physxmetadatagenerator/generateMetaData.sh
@@ -29,11 +29,17 @@
 
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 PHYSX_ROOT_DIR=$SCRIPT_DIR/../..
-source $PHYSX_ROOT_DIR/buildtools/update_packman.sh
 
 export GEN_META_DATA_PARAMETER=$1
 
-$PHYSX_ROOT_DIR/buildtools/packman/packman pull "$PHYSX_ROOT_DIR/dependencies.xml" --platform linux --include-tag=requiredForMetaGen --postscript $SCRIPT_DIR/helper.sh
+if [ -f $PHYSX_ROOT_DIR/buildtools/update_packman.sh ] ; then
+    source $PHYSX_ROOT_DIR/buildtools/update_packman.sh
+    $PHYSX_ROOT_DIR/buildtools/packman/packman pull "$PHYSX_ROOT_DIR/dependencies.xml" --platform linux --include-tag=requiredForMetaGen --postscript $SCRIPT_DIR/helper.sh
+else
+    export PM_PxShared_PATH="$PHYSX_ROOT_DIR/../pxshared"
+    $SCRIPT_DIR/helper.sh
+fi
+
 status=$? 
 if [ "$status" -ne "0" ]; then
  echo "Error $status"
diff --git a/physx/tools/physxmetadatagenerator/helper.sh b/physx/tools/physxmetadatagenerator/helper.sh
old mode 100644
new mode 100755